In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the dataset
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
65747,180309,CASH,2,1,165.990005,331.980011,Late delivery,1,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/28/2016 19:25,First Class
65748,180407,PAYMENT,4,4,-24.190001,383.980011,Shipping on time,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/26/2016 6:49,Standard Class
65749,180427,CASH,4,2,68.040001,377.980011,Late delivery,1,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/25/2016 2:47,Second Class
65750,180444,DEBIT,3,4,148.190002,379.980011,Advance shipping,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/23/2016 7:10,Standard Class
65751,180487,TRANSFER,4,4,-59.84,339.980011,Shipping canceled,0,45,Fishing,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/21/2016 5:56,Standard Class


In [5]:
print(df.columns)

Index(['Unnamed: 0', 'Type', 'Days for shipping (real)',
       'Days for shipment (scheduled)', 'Benefit per order',
       'Sales per customer', 'Delivery Status', 'Late_delivery_risk',
       'Category Id', 'Category Name', 'Department city', 'Department country',
       'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname',
       'Customer Password', 'Customer Segment', 'Department state',
       'Customer Street', 'Customer Zipcode', 'Department Id',
       'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City',
       'Order Country', 'Order Customer Id', 'order date (DateOrders)',
       'Order Id', 'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id',
       'Product Categ

In [6]:
columns_with_null = df.columns[df.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: ['Customer Lname', 'Customer Zipcode', 'Order Zipcode', 'Product Description']


In [7]:
df['Customer Zipcode'].fillna(df['Customer Zipcode'].mean(), inplace=True)
df['Order Zipcode'].fillna(df['Order Zipcode'].mean(), inplace=True)

In [8]:
df['Customer Lname'].fillna('Unknown', inplace=True)
df['Product Description'].fillna('No Description', inplace=True)

In [9]:
columns_with_null = df.columns[df.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: []


In [10]:
print(df['Order Item Quantity'])

0        1
1        1
2        1
3        1
4        1
        ..
65747    1
65748    1
65749    1
65750    1
65751    1
Name: Order Item Quantity, Length: 65752, dtype: int64


In [11]:
df['Order Item Quantity'] = df['Order Item Quantity'] * 100

In [12]:
print(df['Order Item Quantity'])

0        100
1        100
2        100
3        100
4        100
        ... 
65747    100
65748    100
65749    100
65750    100
65751    100
Name: Order Item Quantity, Length: 65752, dtype: int64


In [13]:
non_numeric_columns = df.select_dtypes(include=['object']).columns.tolist()
print(non_numeric_columns)

['Type', 'Delivery Status', 'Category Name', 'Department city', 'Department country', 'Customer Email', 'Customer Fname', 'Customer Lname', 'Customer Password', 'Customer Segment', 'Department state', 'Customer Street', 'Department Name', 'Market', 'Order City', 'Order Country', 'order date (DateOrders)', 'Order Region', 'Order State', 'Order Status', 'Product Description', 'Product Image', 'Product Name', 'shipping date (DateOrders)', 'Shipping Mode']


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in non_numeric_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [15]:
print(df.select_dtypes(exclude=['number']).columns.tolist())

[]


In [16]:
column_name = 'Department Id'
one_with_depid_df = df[[column_name]].copy()

In [17]:
column_names = ['Category Name', 'Customer Zipcode', 'Customer Street', 'Customer Id', 'Customer Segment','Order City', 'Order Country', 'Order Item Quantity','Order Customer Id', 'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id', 'Order Region', 'Order State', 'Product Card Id', 'Product Name', 'Product Price']

feature_df = df[column_names].copy()


In [18]:
one_with_depid_df.head()

Unnamed: 0,Department Id
0,2
1,2
2,2
3,2
4,2


In [19]:
one_with_depid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65752 entries, 0 to 65751
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Department Id  65752 non-null  int64
dtypes: int64(1)
memory usage: 513.8 KB


In [20]:
feature_df.head()

Unnamed: 0,Category Name,Customer Zipcode,Customer Street,Customer Id,Customer Segment,Order City,Order Country,Order Item Quantity,Order Customer Id,order date (DateOrders),Order Id,Order Item Cardprod Id,Order Region,Order State,Product Card Id,Product Name,Product Price
0,40,725.0,3683,20755,0,331,70,100,20755,5961,77202,1360,15,475,1360,78,327.75
1,40,725.0,1400,19492,0,391,69,100,19492,1147,75939,1360,13,841,1360,78,327.75
2,40,95125.0,6217,19491,0,391,69,100,19491,1146,75938,1360,13,841,1360,78,327.75
3,40,90027.0,1803,19490,2,3226,8,100,19490,1145,75937,1360,11,835,1360,78,327.75
4,40,725.0,6345,19489,1,3226,8,100,19489,1144,75936,1360,11,835,1360,78,327.75


In [21]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65752 entries, 0 to 65751
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Category Name            65752 non-null  int64  
 1   Customer Zipcode         65752 non-null  float64
 2   Customer Street          65752 non-null  int64  
 3   Customer Id              65752 non-null  int64  
 4   Customer Segment         65752 non-null  int64  
 5   Order City               65752 non-null  int64  
 6   Order Country            65752 non-null  int64  
 7   Order Item Quantity      65752 non-null  int64  
 8   Order Customer Id        65752 non-null  int64  
 9   order date (DateOrders)  65752 non-null  int64  
 10  Order Id                 65752 non-null  int64  
 11  Order Item Cardprod Id   65752 non-null  int64  
 12  Order Region             65752 non-null  int64  
 13  Order State              65752 non-null  int64  
 14  Product Card Id       

In [22]:
num_columns_target= len(one_with_depid_df.columns)
print(num_columns_target)

num_columns_feature = len(feature_df.columns)
print(num_columns_feature)

1
17


In [23]:
X = feature_df # Features
y = one_with_depid_df  # Target


In [24]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report



# Random Forest Classification
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)



  return fit_method(estimator, *args, **kwargs)


In [26]:

# Linear Regression
linear_classifier = LogisticRegression()
linear_classifier.fit(X_train, y_train)
y_pred_linear = linear_classifier.predict(X_test)



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Support Vector Classification
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)



  y = column_or_1d(y, warn=True)


In [28]:

# Evaluate the random forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)





In [29]:
# Evaluate the linear regression model
accuracy_linear = accuracy_score(y_test, y_pred_linear)
classification_rep_linear = classification_report(y_test, y_pred_linear)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Evaluate the svm model

accuracy_svm = accuracy_score(y_test, y_pred_svm)
classification_rep_svm = classification_report(y_test, y_pred_svm)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:\n", classification_rep_rf)

print("Linear Regression Accuracy:", accuracy_linear)
print("Linear Regression Classification Report:\n", classification_rep_linear)

print("Support Vector Machine Accuracy:", accuracy_svm)
print("Support Vector Machine Classification Report:\n", classification_rep_svm)


Random Forest Accuracy: 1.0
Random Forest Classification Report:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00       305
           3       1.00      1.00      1.00       987
           4       1.00      1.00      1.00      4252
           5       1.00      1.00      1.00      1666
           6       1.00      1.00      1.00       839
           7       1.00      1.00      1.00      4170
           8       1.00      1.00      1.00        73
           9       1.00      1.00      1.00       399
          10       1.00      1.00      1.00       289
          11       1.00      1.00      1.00        93
          12       1.00      1.00      1.00        78

    accuracy                           1.00     13151
   macro avg       1.00      1.00      1.00     13151
weighted avg       1.00      1.00      1.00     13151

Linear Regression Accuracy: 0.5776747015436089
Linear Regression Classification Report:
               precision    recal

In [32]:
# SVMClassifier was the best-performing model based on classification metrics so we took that as best_classifier
best_classifier = svm_classifier 

# Extract the relevant features used during training
training_features = X.columns.tolist()

# Ensure the prediction dataset has the same features (columns)
prediction_features = df[training_features]

# Make predictions on the entire dataset or new data
df['Predicted_Department'] = best_classifier.predict(prediction_features)

# Display the predicted results
predicted_df = df[['Product Card Id', 'Predicted_Department']]
print(predicted_df.head())
print(predicted_df.tail())

   Product Card Id  Predicted_Department
0             1360                     9
1             1360                     9
2             1360                     9
3             1360                     9
4             1360                     9
       Product Card Id  Predicted_Department
65747             1004                     7
65748             1004                     7
65749             1004                     7
65750             1004                     7
65751             1004                     7


In [33]:
import pickle
# Specify the file path
file_path = 'model_svm_final.pkl'

# Open the file in binary write mode
with open(file_path, 'wb') as file:
    # Write the model to the file
    pickle.dump(svm_classifier, file)
    
    
    

In [None]:
import sys
sys.version