In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from sklearn import tree, metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from collections import defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_curve, auc, classification_report, confusion_matrix

from dmba import liftChart , gainsChart
from dmba import regressionSummary , classificationSummary, exhaustive_search, adjusted_r2_score, AIC_score, plotDecisionTree

from sklearn.neural_network import MLPClassifier

#### Dataset reading, data exploration and data cleaning

In [2]:
# Load the DataCo supply-chain CSV from the working directory; the file is decoded as latin-1.
supply_df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin-1')
# Preview the first 10 rows of the raw data.
supply_df.head(10)

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class
5,TRANSFER,6,4,18.58,294.980011,Shipping canceled,0,73,Sporting Goods,Tonawanda,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/19/2018 11:03,Standard Class
6,DEBIT,2,1,95.18,288.420013,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 10:42,First Class
7,TRANSFER,2,1,68.43,285.140015,Late delivery,1,73,Sporting Goods,Miami,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 10:21,First Class
8,CASH,3,2,133.720001,278.589996,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 10:00,Second Class
9,CASH,2,1,132.149994,275.309998,Late delivery,1,73,Sporting Goods,San Ramon,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 9:39,First Class


In [3]:
# Normalise the column names: trim surrounding whitespace, then turn every space into "_"
supply_df.columns = supply_df.columns.str.strip().str.replace(' ', '_', regex=False)

In [4]:
# Preview the first 10 rows again to confirm the renamed (underscore-separated) column headers.
supply_df.head(10)

Unnamed: 0,Type,Days_for_shipping_(real),Days_for_shipment_(scheduled),Benefit_per_order,Sales_per_customer,Delivery_Status,Late_delivery_risk,Category_Id,Category_Name,Customer_City,...,Order_Zipcode,Product_Card_Id,Product_Category_Id,Product_Description,Product_Image,Product_Name,Product_Price,Product_Status,shipping_date_(DateOrders),Shipping_Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class
5,TRANSFER,6,4,18.58,294.980011,Shipping canceled,0,73,Sporting Goods,Tonawanda,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/19/2018 11:03,Standard Class
6,DEBIT,2,1,95.18,288.420013,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 10:42,First Class
7,TRANSFER,2,1,68.43,285.140015,Late delivery,1,73,Sporting Goods,Miami,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 10:21,First Class
8,CASH,3,2,133.720001,278.589996,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 10:00,Second Class
9,CASH,2,1,132.149994,275.309998,Late delivery,1,73,Sporting Goods,San Ramon,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 9:39,First Class


In [5]:
# Remove columns based on research.
# Every label appears exactly once here (the earlier list named 'Product_Status' twice).
columns_to_drop = [
    'Benefit_per_order', 'Sales_per_customer', 'Category_Id', 'Category_Name',
    'Order_Zipcode', 'Product_Card_Id', 'Product_Category_Id', 'Product_Description',
    'Product_Image', 'Product_Name', 'Product_Price', 'shipping_date_(DateOrders)',
    'Customer_Email', 'Customer_Fname', 'Customer_Id', 'Order_Item_Profit_Ratio', 'Sales',
    'Order_Item_Total', 'Order_Profit_Per_Order', 'Order_Region', 'Order_State',
    'Order_Status', 'Customer_Lname', 'Customer_Password', 'Customer_Segment',
    'Customer_State', 'Customer_Street', 'order_date_(DateOrders)',
    'Order_Id', 'Order_Item_Cardprod_Id', 'Order_Item_Discount',
    'Order_Item_Discount_Rate', 'Order_Item_Id', 'Order_Item_Product_Price',
    'Order_Item_Quantity', 'Customer_Zipcode', 'Department_Id', 'Department_Name',
    'Latitude', 'Longitude', 'Market', 'Order_Customer_Id', 'Product_Status', 'Type',
    'Delivery_Status',
]

# drop_supply is another name for supply_df (not a copy); kept so existing references still resolve.
drop_supply = supply_df
# drop() returns a new DataFrame, so supply_df keeps all of its original columns.
df_drop = drop_supply.drop(columns=columns_to_drop)


In [6]:
# Preview the first 10 rows of the reduced frame (columns remaining after the drop).
df_drop.head(10)

Unnamed: 0,Days_for_shipping_(real),Days_for_shipment_(scheduled),Late_delivery_risk,Customer_City,Customer_Country,Order_City,Order_Country,Shipping_Mode
0,3,4,0,Caguas,Puerto Rico,Bekasi,Indonesia,Standard Class
1,5,4,1,Caguas,Puerto Rico,Bikaner,India,Standard Class
2,4,4,0,San Jose,EE. UU.,Bikaner,India,Standard Class
3,3,4,0,Los Angeles,EE. UU.,Townsville,Australia,Standard Class
4,2,4,0,Caguas,Puerto Rico,Townsville,Australia,Standard Class
5,6,4,0,Tonawanda,EE. UU.,Toowoomba,Australia,Standard Class
6,2,1,1,Caguas,Puerto Rico,Guangzhou,China,First Class
7,2,1,1,Miami,EE. UU.,Guangzhou,China,First Class
8,3,2,1,Caguas,Puerto Rico,Guangzhou,China,Second Class
9,2,1,1,San Ramon,EE. UU.,Guangzhou,China,First Class


In [7]:
# Column dtypes and non-null counts before encoding; used to spot the object (string) columns.
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Days_for_shipping_(real)       180519 non-null  int64 
 1   Days_for_shipment_(scheduled)  180519 non-null  int64 
 2   Late_delivery_risk             180519 non-null  int64 
 3   Customer_City                  180519 non-null  object
 4   Customer_Country               180519 non-null  object
 5   Order_City                     180519 non-null  object
 6   Order_Country                  180519 non-null  object
 7   Shipping_Mode                  180519 non-null  object
dtypes: int64(3), object(5)
memory usage: 11.0+ MB


In [8]:
# Encoding of columns with data type being object.
# Each column gets its own LabelEncoder, stored in `encoders`, so the integer codes can be
# mapped back to the original labels later (encoders[col].inverse_transform(...)).
# Refitting one shared encoder would keep only the mapping of the last column.
# NOTE(review): label encoding imposes an arbitrary integer order on nominal values
# (cities, countries, shipping modes) -- confirm this is acceptable for the models below.
categorical_columns = ['Customer_City', 'Customer_Country', 'Order_City',
                       'Order_Country', 'Shipping_Mode']
encoders = {}
for column in categorical_columns:
    le = preprocessing.LabelEncoder()
    df_drop[column] = le.fit_transform(df_drop[column])
    encoders[column] = le
# After the loop `le` is the encoder fitted on 'Shipping_Mode', the last column processed.

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the encoded frame.
# The five label-encoded columns hold integer category codes, so their mean/std describe the
# codes rather than any real-world quantity.
df_drop.describe()

Unnamed: 0,Days_for_shipping_(real),Days_for_shipment_(scheduled),Late_delivery_risk,Customer_City,Customer_Country,Order_City,Order_Country,Shipping_Mode
count,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0,180519.0
mean,3.497654,2.931847,0.548291,193.986633,0.384297,1847.114099,65.145625,2.234806
std,1.623722,1.374449,0.497664,160.930467,0.48643,1004.502586,41.852842,1.10092
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,0.0,66.0,0.0,940.0,38.0,2.0
50%,3.0,4.0,1.0,98.0,0.0,1946.0,53.0,3.0
75%,5.0,4.0,1.0,324.0,1.0,2776.0,102.0,3.0
max,6.0,4.0,1.0,562.0,1.0,3596.0,163.0,3.0


In [10]:
# Pairwise correlation matrix of all columns in the encoded frame, rounded to 2 decimals.
# The label-encoded columns are integer category codes, so coefficients involving them
# reflect the code ordering assigned by the encoder and should be read with caution.
supply_cor=df_drop.corr().round(2)
supply_cor

Unnamed: 0,Days_for_shipping_(real),Days_for_shipment_(scheduled),Late_delivery_risk,Customer_City,Customer_Country,Order_City,Order_Country,Shipping_Mode
Days_for_shipping_(real),1.0,0.52,0.4,-0.0,0.0,0.0,0.0,0.52
Days_for_shipment_(scheduled),0.52,1.0,-0.37,-0.01,0.01,-0.0,0.0,0.92
Late_delivery_risk,0.4,-0.37,1.0,0.01,-0.0,0.0,-0.0,-0.4
Customer_City,-0.0,-0.01,0.01,1.0,-0.59,0.0,0.0,-0.0
Customer_Country,0.0,0.01,-0.0,-0.59,1.0,0.0,-0.0,0.0
Order_City,0.0,-0.0,0.0,0.0,0.0,1.0,0.02,0.0
Order_Country,0.0,0.0,-0.0,0.0,-0.0,0.02,1.0,0.0
Shipping_Mode,0.52,0.92,-0.4,-0.0,0.0,0.0,0.0,1.0


In [11]:
# Display the encoded frame (the notebook shows a truncated head/tail view).
df_drop

Unnamed: 0,Days_for_shipping_(real),Days_for_shipment_(scheduled),Late_delivery_risk,Customer_City,Customer_Country,Order_City,Order_Country,Shipping_Mode
0,3,4,0,66,1,331,70,3
1,5,4,1,66,1,391,69,3
2,4,4,0,452,0,391,69,3
3,3,4,0,285,0,3226,8,3
4,2,4,0,66,1,3226,8,3
...,...,...,...,...,...,...,...,...
180514,4,4,0,59,0,2922,31,3
180515,3,2,1,26,0,1362,77,2
180516,5,4,1,55,0,25,8,3
180517,3,4,0,66,1,25,8,3


In [12]:
# Re-check dtypes after label encoding: the former object columns should now be integer typed.
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   Days_for_shipping_(real)       180519 non-null  int64
 1   Days_for_shipment_(scheduled)  180519 non-null  int64
 2   Late_delivery_risk             180519 non-null  int64
 3   Customer_City                  180519 non-null  int32
 4   Customer_Country               180519 non-null  int32
 5   Order_City                     180519 non-null  int32
 6   Order_Country                  180519 non-null  int32
 7   Shipping_Mode                  180519 non-null  int32
dtypes: int32(5), int64(3)
memory usage: 7.6 MB


### Neural Network Implementation

#### Selecting Predictors and Outcome

In [13]:
# Target column, and every remaining column of df_drop (in original order) as a predictor
outcome = 'Late_delivery_risk'
predictors = list(df_drop.columns[df_drop.columns != outcome])

In [14]:
# Data partition: 60% training / 40% validation, with a fixed seed for a repeatable split
X = df_drop[predictors]
y = df_drop[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

#### 1) NN model run using the most popular configuration: one hidden layer with 1 hidden node

In [15]:
# Train a neural network with one hidden layer containing 1 hidden node.
# hidden_layer_sizes is written as a real one-element tuple: `(1)` is just the integer 1.
# NOTE(review): the predictors are fed in unscaled (label codes range into the thousands);
# with logistic activations this may saturate the hidden unit -- consider standardising. TODO confirm.
clf = MLPClassifier(hidden_layer_sizes=(1,), activation='logistic', solver='lbfgs',
                    random_state=1, max_iter=1000)
clf.fit(train_X, train_y.values)

In [16]:
# Predicted class labels for every row of X (training and validation rows together).
clf.predict(X)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [17]:
# NN model evaluation: confusion matrix and accuracy,
# first for the training partition, then for the validation partition
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

Confusion Matrix (Accuracy 0.5487)

       Prediction
Actual     0     1
     0     0 48884
     1     0 59427
Confusion Matrix (Accuracy 0.5477)

       Prediction
Actual     0     1
     0     0 32658
     1     0 39550


In [18]:
# Network structure: fitted bias vectors and weight matrices, one array per layer
for title, fitted in (('Intercepts', clf.intercepts_), ('Weights', clf.coefs_)):
    print(title)
    print(fitted)

# Prediction: class probabilities for every row, placed beside the encoded data
class_probabilities = pd.DataFrame(clf.predict_proba(X))
print(pd.concat([df_drop, class_probabilities], axis=1))

Intercepts
[array([-0.15443927]), array([0.19528739])]
Weights
[array([[-0.08297799],
       [ 0.22032449],
       [-0.49988562],
       [-0.19766743],
       [-0.35324411],
       [-0.4076614 ],
       [-0.31373979]]), array([[-0.20646505]])]
        Days_for_shipping_(real)  Days_for_shipment_(scheduled)  \
0                              3                              4   
1                              5                              4   
2                              4                              4   
3                              3                              4   
4                              2                              4   
...                          ...                            ...   
180514                         4                              4   
180515                         3                              2   
180516                         5                              4   
180517                         3                              4   
180518             

In [19]:
# Same call as the earlier predict cell; clf has not been refit since, so the labels are unchanged.
clf.predict(X)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [20]:
# NN model evaluation (repeated for the 1-node model): confusion matrix and accuracy,
# training partition first, validation partition second
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

Confusion Matrix (Accuracy 0.5487)

       Prediction
Actual     0     1
     0     0 48884
     1     0 59427
Confusion Matrix (Accuracy 0.5477)

       Prediction
Actual     0     1
     0     0 32658
     1     0 39550


#### 2) NN model run using - Midway between the input and output layer sizes = (7 + 1)/2 = 4 nodes [1 HL + 4 nodes]

In [21]:
# Train a neural network with one hidden layer containing 4 hidden nodes.
# hidden_layer_sizes is written as a real one-element tuple: `(4)` is just the integer 4.
# NOTE(review): the predictors are fed in unscaled (label codes range into the thousands);
# with logistic activations this may saturate the hidden units -- consider standardising. TODO confirm.
clf = MLPClassifier(hidden_layer_sizes=(4,), activation='logistic', solver='lbfgs',
                    random_state=1, max_iter=1000)
clf.fit(train_X, train_y.values)

In [22]:
# Predicted class labels for every row of X, using the 4-node model fitted in the previous cell.
clf.predict(X)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [23]:
# NN model evaluation: confusion matrix and accuracy,
# first for the training partition, then for the validation partition
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

Confusion Matrix (Accuracy 0.5489)

       Prediction
Actual     0     1
     0   304 48580
     1   282 59145
Confusion Matrix (Accuracy 0.5472)

       Prediction
Actual     0     1
     0   163 32495
     1   204 39346


In [24]:
# Network structure: fitted bias vectors and weight matrices, one array per layer
for title, fitted in (('Intercepts', clf.intercepts_), ('Weights', clf.coefs_)):
    print(title)
    print(fitted)

# Prediction: class probabilities for every row, placed beside the encoded data
class_probabilities = pd.DataFrame(clf.predict_proba(X))
print(pd.concat([df_drop, class_probabilities], axis=1))

Intercepts
[array([-0.28141245,  0.32248063, -0.34251368, -0.06639009]), array([0.19503195])]
Weights
[array([[-0.01207954,  0.18789204, -0.42626937, -0.19190573],
       [-0.34213711, -0.34765781, -0.26754279, -0.10656548],
       [ 0.00720767,  0.03306329, -0.06849164,  0.08155661],
       [-0.25341968,  0.32245875, -0.40304458,  0.14681218],
       [-0.21381388,  0.05004637, -0.30667889, -0.31062686],
       [ 0.23248838,  0.39933333, -0.15907733,  0.24068873],
       [ 0.29008221,  0.33651878, -0.35387732, -0.37312591]]), array([[ 0.26694963],
       [ 0.00137457],
       [ 0.24278099],
       [-0.2376142 ]])]
        Days_for_shipping_(real)  Days_for_shipment_(scheduled)  \
0                              3                              4   
1                              5                              4   
2                              4                              4   
3                              3                              4   
4                              2           

In [25]:
# Same call as the earlier predict cell for the 4-node model; clf has not been refit since.
clf.predict(X)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [26]:
# NN model evaluation (repeated for the 4-node model): confusion matrix and accuracy,
# training partition first, validation partition second
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

Confusion Matrix (Accuracy 0.5489)

       Prediction
Actual     0     1
     0   304 48580
     1   282 59145
Confusion Matrix (Accuracy 0.5472)

       Prediction
Actual     0     1
     0   163 32495
     1   204 39346


#### 3) NN model run using - Less than 2X the input nodes = 12 nodes [12 < 14 (2*7)] [1 HL + 12 nodes]

In [None]:
# Train a neural network with one hidden layer containing 12 hidden nodes.
# hidden_layer_sizes is written as a real one-element tuple: `(12)` is just the integer 12.
# NOTE(review): the predictors are fed in unscaled (label codes range into the thousands);
# with logistic activations this may saturate the hidden units -- consider standardising. TODO confirm.
clf = MLPClassifier(hidden_layer_sizes=(12,), activation='logistic', solver='lbfgs',
                    random_state=1, max_iter=1000)
clf.fit(train_X, train_y.values)

In [None]:
# Predicted class labels for every row of X, using the 12-node model fitted in the previous cell.
clf.predict(X)

In [None]:
# NN model evaluation: confusion matrix and accuracy,
# first for the training partition, then for the validation partition
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

In [None]:
# Network structure: fitted bias vectors and weight matrices, one array per layer
for title, fitted in (('Intercepts', clf.intercepts_), ('Weights', clf.coefs_)):
    print(title)
    print(fitted)

# Prediction: class probabilities for every row, placed beside the encoded data
class_probabilities = pd.DataFrame(clf.predict_proba(X))
print(pd.concat([df_drop, class_probabilities], axis=1))

In [None]:
# Same call as the earlier predict cell for the 12-node model; clf has not been refit since.
clf.predict(X)

In [None]:
# NN model evaluation (repeated for the 12-node model): confusion matrix and accuracy,
# training partition first, validation partition second
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

#### 4) NN model run using - Hidden nodes = 2/3 of the input nodes + output nodes = (2/3)*(7)+1 ≈ 5.67, rounded to 6 nodes [1 HL + 6 nodes]

In [None]:
# Train a neural network with one hidden layer containing 6 hidden nodes.
# hidden_layer_sizes is written as a real one-element tuple: `(6)` is just the integer 6.
# NOTE(review): the predictors are fed in unscaled (label codes range into the thousands);
# with logistic activations this may saturate the hidden units -- consider standardising. TODO confirm.
clf = MLPClassifier(hidden_layer_sizes=(6,), activation='logistic', solver='lbfgs',
                    random_state=1, max_iter=1000)
clf.fit(train_X, train_y.values)


In [None]:
# Predict the class of one hand-built record. The values follow the order of `predictors`
# and are wrapped in a DataFrame with those column names, matching how the model was fitted
# (a bare list triggers scikit-learn's "X does not have valid feature names" warning).
# predict() returns integer class labels, so no rounding is applied.
sample_order = pd.DataFrame([[1, 1, 1, 0, 0, 1, 0]], columns=predictors)
clf.predict(sample_order)

In [None]:
# NN model evaluation: confusion matrix and accuracy,
# first for the training partition, then for the validation partition
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

In [None]:
# Network structure: fitted bias vectors and weight matrices, one array per layer
for title, fitted in (('Intercepts', clf.intercepts_), ('Weights', clf.coefs_)):
    print(title)
    print(fitted)

# Prediction: class probabilities for every row, placed beside the encoded data
class_probabilities = pd.DataFrame(clf.predict_proba(X))
print(pd.concat([df_drop, class_probabilities], axis=1))

In [None]:
# Predict the class of a second hand-built record (scheduled-days value set to 0 this time).
# The values follow the order of `predictors` and are wrapped in a DataFrame with those
# column names, matching how the model was fitted (a bare list triggers scikit-learn's
# "X does not have valid feature names" warning).
# predict() returns integer class labels, so no rounding is applied.
sample_order = pd.DataFrame([[1, 0, 1, 0, 0, 1, 0]], columns=predictors)
clf.predict(sample_order)

In [None]:
# NN model evaluation (repeated for the 6-node model): confusion matrix and accuracy,
# training partition first, validation partition second
for actual, features in ((train_y, train_X), (valid_y, valid_X)):
    classificationSummary(actual, clf.predict(features))

#### Conclusion: Of these 4 NN model runs with different numbers of hidden nodes, the highest accuracy is obtained with 12 hidden nodes (3rd run - hidden nodes = less than 2X the input nodes) and with 6 hidden nodes (4th run - hidden nodes = 2/3 of the input nodes + output nodes); both runs give the same result.