In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the dataset
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
65747,180309,CASH,2,1,165.990005,331.980011,Late delivery,1,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/28/2016 19:25,First Class
65748,180407,PAYMENT,4,4,-24.190001,383.980011,Shipping on time,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/26/2016 6:49,Standard Class
65749,180427,CASH,4,2,68.040001,377.980011,Late delivery,1,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/25/2016 2:47,Second Class
65750,180444,DEBIT,3,4,148.190002,379.980011,Advance shipping,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/23/2016 7:10,Standard Class
65751,180487,TRANSFER,4,4,-59.84,339.980011,Shipping canceled,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/21/2016 5:56,Standard Class


In [5]:
print(df.columns)

Index(['Unnamed: 0', 'Type', 'Days for shipping (real)',
       'Days for shipment (scheduled)', 'Benefit per order',
       'Sales per customer', 'Delivery Status', 'Late_delivery_risk',
       'Category Id', 'Category Name', 'Department city', 'Department country',
       'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname',
       'Customer Password', 'Customer Segment', 'Department state',
       'Customer Street', 'Customer Zipcode', 'Department Id',
       'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City',
       'Order Country', 'Order Customer Id', 'order date (DateOrders)',
       'Order Id', 'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id',
       'Product Categ

In [6]:
columns_with_null = df.columns[df.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: ['Customer Lname', 'Customer Zipcode', 'Order Zipcode', 'Product Description']


In [7]:
df['Customer Zipcode'].fillna(df['Customer Zipcode'].mean(), inplace=True)
df['Order Zipcode'].fillna(df['Order Zipcode'].mean(), inplace=True)

In [8]:
df['Customer Lname'].fillna('Unknown', inplace=True)
df['Product Description'].fillna('No Description', inplace=True)

In [9]:
columns_with_null = df.columns[df.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: []


In [10]:
label_encoder = LabelEncoder()
df['Product_name_encoded'] = label_encoder.fit_transform(df['Product Name'])
df['Category_id_encoded'] = label_encoder.fit_transform(df['Category Id'])
df['Category_name_encoded'] = label_encoder.fit_transform(df['Category Name'])
df['Department_name_encoded']= label_encoder.fit_transform(df['Department Name'])

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode,Product_name_encoded,Category_id_encoded,Category_name_encoded,Department_name_encoded
0,0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,...,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class,78,47,40,4
1,1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,...,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class,78,47,40,4
2,2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,...,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class,78,47,40,4
3,3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,...,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class,78,47,40,4
4,4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,...,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class,78,47,40,4


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65752 entries, 0 to 65751
Data columns (total 58 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     65752 non-null  int64  
 1   Type                           65752 non-null  int64  
 2   Days for shipping (real)       65752 non-null  int64  
 3   Days for shipment (scheduled)  65752 non-null  int64  
 4   Benefit per order              65752 non-null  float64
 5   Sales per customer             65752 non-null  float64
 6   Delivery Status                65752 non-null  int64  
 7   Late_delivery_risk             65752 non-null  int64  
 8   Category Id                    65752 non-null  int64  
 9   Category Name                  65752 non-null  int64  
 10  Department city                65752 non-null  int64  
 11  Department country             65752 non-null  int64  
 12  Customer Email                 65752 non-null 

In [12]:
non_numeric_columns = df.select_dtypes(include=['object']).columns.tolist()
print(non_numeric_columns)


['Type', 'Delivery Status', 'Category Name', 'Department city', 'Department country', 'Customer Email', 'Customer Fname', 'Customer Lname', 'Customer Password', 'Customer Segment', 'Department state', 'Customer Street', 'Department Name', 'Market', 'Order City', 'Order Country', 'order date (DateOrders)', 'Order Region', 'Order State', 'Order Status', 'Product Description', 'Product Image', 'Product Name', 'shipping date (DateOrders)', 'Shipping Mode']


In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in non_numeric_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [14]:
print(df.select_dtypes(exclude=['number']).columns.tolist())

[]


In [15]:
df_without_department = df

In [16]:
df_without_department = df.drop('Department Name', axis=1)

In [17]:
print(df_without_department.columns)

Index(['Unnamed: 0', 'Type', 'Days for shipping (real)',
       'Days for shipment (scheduled)', 'Benefit per order',
       'Sales per customer', 'Delivery Status', 'Late_delivery_risk',
       'Category Id', 'Category Name', 'Department city', 'Department country',
       'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname',
       'Customer Password', 'Customer Segment', 'Department state',
       'Customer Street', 'Customer Zipcode', 'Department Id', 'Latitude',
       'Longitude', 'Market', 'Order City', 'Order Country',
       'Order Customer Id', 'order date (DateOrders)', 'Order Id',
       'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id',
       'Product Category Id', 'Product D

In [18]:
num_columns_without_dep= len(df_without_department.columns)
print(num_columns_without_dep)

num_columns_with_dep = len(df.columns)
print(num_columns_with_dep)

57
58


In [19]:
column_name = 'Department Name'
one_with_dep_df = df[[column_name]].copy()  # Using copy() to avoid potential SettingWithCopyWarning


In [20]:
X = df_without_department # Features
y = one_with_dep_df  # Target


In [21]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Initialize and train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


In [23]:
# Make predictions
y_pred = clf.predict(X_test)


In [24]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4252
           1       1.00      1.00      1.00        73
           2       1.00      1.00      1.00       399
           3       1.00      1.00      1.00      4170
           4       1.00      1.00      1.00       305
           5       1.00      1.00      1.00       987
           6       1.00      1.00      1.00      1666
           7       1.00      1.00      1.00        78
           8       1.00      1.00      1.00       839
           9       1.00      1.00      1.00        93
          10       1.00      1.00      1.00       289

    accuracy                           1.00     13151
   macro avg       1.00      1.00      1.00     13151
weighted avg       1.00      1.00      1.00     13151

