In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:

# Load the dataset
data = pd.read_csv('cleaned_data.csv')

# # Drop unnecessary columns for simplicity
# cols_to_drop = ['Customer Email', 'Customer Password', 'Order Id', 'Order Item Id', 'Product Card Id', 'Product Image', 'Product Description']
# data.drop(cols_to_drop, axis=1, inplace=True)



In [3]:
# Handle missing values (for simplicity, let's drop them)
#data.dropna(inplace=True)


In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [5]:
print(data['Department Name'].head())

0    Fitness
1    Fitness
2    Fitness
3    Fitness
4    Fitness
Name: Department Name, dtype: object


In [6]:
# One-hot encode categorical variables
data_encoded = pd.get_dummies(data)




In [7]:
data_encoded.head()

Unnamed: 0.1,Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Late_delivery_risk,Category Id,Customer Id,Customer Zipcode,Department Id,...,shipping date (DateOrders)_9/9/2017 6:20,shipping date (DateOrders)_9/9/2017 6:41,shipping date (DateOrders)_9/9/2017 7:02,shipping date (DateOrders)_9/9/2017 7:23,shipping date (DateOrders)_9/9/2017 8:47,shipping date (DateOrders)_9/9/2017 9:08,Shipping Mode_First Class,Shipping Mode_Same Day,Shipping Mode_Second Class,Shipping Mode_Standard Class
0,0,3,4,91.25,314.640015,0,73,20755,725.0,2,...,False,False,False,False,False,False,False,False,False,True
1,1,5,4,-249.089996,311.359985,1,73,19492,725.0,2,...,False,False,False,False,False,False,False,False,False,True
2,2,4,4,-247.779999,309.720001,0,73,19491,95125.0,2,...,False,False,False,False,False,False,False,False,False,True
3,3,3,4,22.860001,304.809998,0,73,19490,90027.0,2,...,False,False,False,False,False,False,False,False,False,True
4,4,2,4,134.210007,298.25,0,73,19489,725.0,2,...,False,False,False,False,False,False,False,False,False,True


In [9]:
original_columns = data.columns.tolist()

In [10]:
encoded_columns = data_encoded.columns.tolist()

In [12]:
one_hot_encoded_columns = [col for col in encoded_columns if col not in original_columns]

print("One-Hot Encoded Columns:")
for col in one_hot_encoded_columns:
    print(col)


One-Hot Encoded Columns:
Type_CASH
Type_DEBIT
Type_PAYMENT
Type_TRANSFER
Delivery Status_Advance shipping
Delivery Status_Late delivery
Delivery Status_Shipping canceled
Delivery Status_Shipping on time
Category Name_Accessories
Category Name_As Seen on  TV!
Category Name_Baby 
Category Name_Baseball & Softball
Category Name_Basketball
Category Name_Books 
Category Name_Boxing & MMA
Category Name_CDs 
Category Name_Cameras 
Category Name_Camping & Hiking
Category Name_Cardio Equipment
Category Name_Children's Clothing
Category Name_Cleats
Category Name_Computers
Category Name_Consumer Electronics
Category Name_Crafts
Category Name_DVDs
Category Name_Electronics
Category Name_Fishing
Category Name_Fitness Accessories
Category Name_Garden
Category Name_Girls' Apparel
Category Name_Golf Apparel
Category Name_Golf Bags & Carts
Category Name_Golf Balls
Category Name_Golf Gloves
Category Name_Golf Shoes
Category Name_Health and Beauty
Category Name_Hockey
Category Name_Hunting & Shooting
Cat

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
# List columns that start with 'Department '
department_columns = [col for col in data_encoded.columns if col.startswith('Department ')]
print(department_columns)


['Department Id', 'Department city_Aguadilla', 'Department city_Alameda', 'Department city_Albany', 'Department city_Albuquerque', 'Department city_Algonquin', 'Department city_Alhambra', 'Department city_Allentown', 'Department city_Alpharetta', 'Department city_Amarillo', 'Department city_Anaheim', 'Department city_Ann Arbor', 'Department city_Annandale', 'Department city_Annapolis', 'Department city_Antioch', 'Department city_Apex', 'Department city_Apopka', 'Department city_Arecibo', 'Department city_Arlington', 'Department city_Arlington Heights', 'Department city_Asheboro', 'Department city_Astoria', 'Department city_Atlanta', 'Department city_Augusta', 'Department city_Aurora', 'Department city_Austin', 'Department city_Azusa', 'Department city_Bakersfield', 'Department city_Baldwin Park', 'Department city_Ballwin', 'Department city_Baltimore', 'Department city_Bartlett', 'Department city_Bay Shore', 'Department city_Bayamon', 'Department city_Bayonne', 'Department city_Baytown'

In [24]:
department_columns = [col for col in data_encoded.columns if col.startswith('Department Name_')]
print(department_columns)


['Department Name_Apparel', 'Department Name_Book Shop', 'Department Name_Discs Shop', 'Department Name_Fan Shop', 'Department Name_Fitness', 'Department Name_Footwear', 'Department Name_Golf', 'Department Name_Health and Beauty ', 'Department Name_Outdoors', 'Department Name_Pet Shop', 'Department Name_Technology']


In [16]:
department_columns = [col for col in data_encoded.columns if col.startswith('Department Name_')]


In [17]:
X = data_encoded.drop(columns=department_columns)


In [20]:
print(data_encoded.head())


   Unnamed: 0  Days for shipping (real)  Days for shipment (scheduled)  \
0           0                         3                              4   
1           1                         5                              4   
2           2                         4                              4   
3           3                         3                              4   
4           4                         2                              4   

   Benefit per order  Sales per customer  Late_delivery_risk  Category Id  \
0          91.250000          314.640015                   0           73   
1        -249.089996          311.359985                   1           73   
2        -247.779999          309.720001                   0           73   
3          22.860001          304.809998                   0           73   
4         134.210007          298.250000                   0           73   

   Customer Id  Customer Zipcode  Department Id  ...  \
0        20755             725.0    

In [27]:
print(data_encoded.columns.tolist())


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [26]:
# Step 1: Create a list of 'Department Name_' columns
department_columns = [col for col in data_encoded.columns if col.startswith('Department Name_')]

# Step 2: Drop the 'Department Name_' columns from features (X)
X = data_encoded.drop(columns=department_columns)

# Step 3: Define the target (y) as 'Department Name'
y = data_encoded['Department Name']


KeyError: 'Department Name'

In [None]:
# # Define features and target
# X = data_encoded.drop('Department Name', axis=1)  # Features
# y = data_encoded['Department Name']  # Target



In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
