In [1]:
# Import required libraries
import pandas as pd
import numpy as np

In [2]:
# Read the "Scrubbed Data" worksheet of the source workbook into a DataFrame.
# `file_path` stays a module-level name so it can be inspected or reused later.
file_path = "D:/NOTEBOOK/e-commerce-scrubbed-data.xlsx"
df = pd.read_excel(io=file_path, sheet_name="Scrubbed Data")

In [3]:
# Step 1: keep only rows whose status is 'complete' or 'received'.
status_ok = df['status'].isin(['complete', 'received'])
df = df.loc[status_ok]

# Step 2: keep only rows with qty_ordered strictly below 100.
# The mask is built on the already status-filtered frame, so the two
# filters are applied one after the other.
qty_ok = df['qty_ordered'] < 100
df = df.loc[qty_ok]

# Step 3: restrict to the three categories with the largest total quantity.
# The totals are computed on the rows that survived steps 1 and 2.
qty_by_category = df.groupby('category_name_1')['qty_ordered'].sum()
top_3_categories = qty_by_category.sort_values(ascending=False).head(3).index

# Drop every row whose category is not one of those three.
in_top_3 = df['category_name_1'].isin(top_3_categories)
df = df.loc[in_top_3]

In [4]:
# Destination workbook for the filtered rows.
output_path = "D:/NOTEBOOK/e-commerce-scrubbed-data-filtered.xlsx"

# --- Summary of the filtered frame -----------------------------------------
print("Dataset Information after filtering:")
print(f"Total number of rows: {len(df)}")
print(f"Number of unique categories: {df['category_name_1'].nunique()}")

# Categories that were retained, in the order stored in `top_3_categories`.
print("\nCategories included:")
print(top_3_categories.tolist())

# Row count per remaining status value.
print("\nStatus distribution:")
status_counts = df['status'].value_counts()
print(status_counts)

# Descriptive statistics for the ordered quantity.
print("\nQuantity ordered distribution:")
qty_summary = df['qty_ordered'].describe()
print(qty_summary)

# --- Persist the result -----------------------------------------------------
# Written to a sheet named "Filtered Data"; the row index is not exported.
df.to_excel(output_path, sheet_name="Filtered Data", index=False)
print(f"\nFiltered dataset saved to: {output_path}")

Dataset Information after filtering:
Total number of rows: 130653
Number of unique categories: 3

Categories included:
["Men's Fashion", 'Mobiles & Tablets', 'Superstore']

Status distribution:
status
complete    100118
received     30535
Name: count, dtype: int64

Quantity ordered distribution:
count    130653.000000
mean          1.233014
std           0.974926
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          72.000000
Name: qty_ordered, dtype: float64

Filtered dataset saved to: D:/NOTEBOOK/e-commerce-scrubbed-data-filtered.xlsx
