In [1]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np
# Import Data Visualization Libraries
import seaborn as sns 
import matplotlib.pyplot as plt 
# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')
# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)
# Import Scikit Learn Libraries for Machine Learning Model Building
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,learning_curve,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LogisticRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.cluster import KMeans


# Multicolinearity test and treatment libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

In [2]:
# Uploading Dataset Using Pandas Function
# Url Taken from Github 
url = 'https://raw.githubusercontent.com/mukeshmagar543/Smart-Logistics-Supply-Chain-Dataset/refs/heads/main/smart_logistics_dataset.csv'

df = pd.read_csv(url)

df.sample(frac = 1)  # Shuffle Dataset

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Logistics_Delay_Reason,Asset_Utilization,Demand_Forecast,Logistics_Delay
943,2024-11-20 18:45:34,Truck_1,-13.2882,-171.9473,254,Delivered,27.3,71.7,Detour,38,279,10,Traffic,91.5,111,0
46,2024-09-05 14:12:00,Truck_3,20.8843,1.3947,303,Delayed,20.7,70.6,Clear,47,143,10,,98.1,123,1
661,2024-02-28 00:53:35,Truck_6,-86.1770,112.1068,118,In Transit,25.1,59.5,Clear,36,200,1,,80.6,239,0
244,2024-05-27 05:13:33,Truck_2,-9.2287,107.9306,329,Delivered,25.0,51.2,Detour,53,233,5,Mechanical Failure,67.2,188,0
255,2024-08-02 06:58:21,Truck_8,67.7002,43.8530,133,In Transit,28.7,69.5,Detour,57,342,6,Traffic,99.9,261,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,2024-01-17 03:33:56,Truck_10,-12.9902,176.1206,399,Delivered,23.4,71.7,Heavy,12,239,1,,67.6,156,1
954,2024-04-05 13:50:11,Truck_7,-9.6291,39.6506,495,Delivered,25.1,75.5,Detour,39,234,1,,74.8,203,0
119,2024-06-11 08:04:09,Truck_4,-63.2064,30.0818,118,Delayed,21.8,52.1,Clear,19,421,8,Mechanical Failure,99.1,284,1
651,2024-05-11 12:13:27,Truck_5,20.6546,79.9499,194,In Transit,22.9,69.7,Clear,11,140,4,Weather,84.7,293,0


In [None]:
# Checking Dataset Information
df.info()

In [58]:
from collections import OrderedDict

stats = []

# Descriptive statistics
for col in df.columns:
    if df[col].dtype != 'object':
        numerical_stats = OrderedDict({
            'Feature': col,
            'Minimum': df[col].min(),
            'Maximum': df[col].max(),
            'Mean': df[col].mean(),
            'Mode': df[col].mode()[0] if not df[col].mode().empty else None,
            '25%': df[col].quantile(0.25),
            '75%': df[col].quantile(0.75),
            'IQR': df[col].quantile(0.75) - df[col].quantile(0.25),
            'Standard Deviation': df[col].std(),
            'Skewness': df[col].skew(),
            'Kurtosis': df[col].kurt()
        })
        stats.append(numerical_stats)

# Convert to DataFrame
report = pd.DataFrame(stats)

# Outlier Identification :
outlier_label = []
for col in report['Feature']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    LW = Q1 - 1.5 * IQR   # LW : Lower Whisker Line
    UW = Q3 + 1.5 * IQR   # UW : Upper Whisker Line
    outliers = df[(df[col] < LW) | (df[col] > UW)]
    if not outliers.empty:
        outlier_label.append("Has Outliers")
    else:
        outlier_label.append("No Outliers")

report["Outlier Comment"] = outlier_label

# Checking Report
report

Unnamed: 0,Feature,Minimum,Maximum,Mean,Mode,25%,75%,IQR,Standard Deviation,Skewness,Kurtosis,Outlier Comment
0,Timestamp,0.0,999.0,499.5,0.0,249.75,749.25,499.5,288.819436,0.0,-1.2,No Outliers
1,Asset_ID,0.0,9.0,4.537,8.0,2.0,7.0,5.0,2.849342,-0.012491,-1.233406,No Outliers
2,Latitude,-89.7915,89.8701,-1.360093,-89.7915,-46.167975,44.5028,90.670775,51.997183,0.033982,-1.175119,No Outliers
3,Longitude,-179.8202,179.9237,0.837049,-179.8202,-88.448075,88.15645,176.604525,104.843618,-0.001975,-1.197892,No Outliers
4,Inventory_Level,100.0,500.0,297.915,403.0,201.0,399.0,198.0,113.554773,0.015787,-1.18372,No Outliers
5,Shipment_Status,0.0,2.0,0.962,0.0,0.0,2.0,2.0,0.813153,0.069691,-1.484339,No Outliers
6,Temperature,18.0,30.0,23.8939,22.8,21.2,26.6,5.4,3.322178,0.015239,-1.068699,No Outliers
7,Humidity,50.0,80.0,65.0422,74.2,57.2,72.4,15.2,8.753765,-0.054588,-1.201709,No Outliers
8,Traffic_Status,0.0,2.0,0.999,1.0,0.0,2.0,2.0,0.809725,0.001823,-1.474647,No Outliers
9,Waiting_Time,10.0,60.0,35.062,24.0,23.0,49.0,26.0,14.477768,0.006116,-1.207727,No Outliers


In [None]:
# Split the Dataset into Numerical_Data and Categorical_Data
Numerical_Data = df.select_dtypes(exclude = 'object')

Categorical_Data = df.select_dtypes(include = 'object')

In [None]:
Numerical_Data

In [None]:
Categorical_Data

In [None]:
# Checking Null Values
df.isnull().sum()

In [3]:
df.drop(['Logistics_Delay_Reason'], axis=1, inplace= True)

In [4]:
# Using Label Encoding Technique
from sklearn.preprocessing import LabelEncoder
df['Timestamp'] = LabelEncoder().fit_transform(df['Timestamp'])
df['Asset_ID'] = LabelEncoder().fit_transform(df['Asset_ID'])
df['Shipment_Status'] = LabelEncoder().fit_transform(df['Shipment_Status'])
df['Traffic_Status'] = LabelEncoder().fit_transform(df['Traffic_Status'])
df

Unnamed: 0,Timestamp,Asset_ID,Latitude,Longitude,Inventory_Level,Shipment_Status,Temperature,Humidity,Traffic_Status,Waiting_Time,User_Transaction_Amount,User_Purchase_Frequency,Asset_Utilization,Demand_Forecast,Logistics_Delay
0,224,7,-65.7383,11.2497,390,0,27.0,67.8,1,38,320,4,60.1,285,1
1,824,6,22.2748,-131.7086,491,2,22.5,54.3,2,16,439,7,80.9,174,1
2,577,1,54.9232,79.5455,190,2,25.2,62.2,1,34,355,3,99.2,260,0
3,819,9,42.3900,-1.4788,330,1,25.4,52.3,2,37,227,5,97.4,160,1
4,734,7,-65.8477,47.9468,480,0,20.5,57.2,0,56,197,6,71.6,270,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,556,6,89.8701,73.6867,264,1,26.9,70.0,2,32,188,1,79.2,213,1
996,342,5,-10.4792,-177.1239,479,1,23.7,77.9,1,56,276,7,83.7,272,0
997,818,2,-71.0609,75.3714,347,2,21.0,63.1,1,35,382,5,74.8,275,0
998,306,2,-76.7910,18.3631,276,1,18.0,64.3,2,10,361,5,88.6,242,1


In [None]:
# Checking Output  i.e. Target Column for Data Distribution
df['Logistics_Delay'].value_counts()

In [None]:
# Set figure size
plt.figure(figsize=(15, 8))

# Create boxplot for all numerical columns
sns.boxplot(data=df, orient='h', palette='Set2')

# Set title
plt.title('Boxplot After Outlier Treatment')
plt.tight_layout()
plt.show()

In [5]:
df.corr()['Logistics_Delay']

Timestamp                 -0.012448
Asset_ID                  -0.010586
Latitude                   0.039634
Longitude                  0.011796
Inventory_Level           -0.003340
Shipment_Status           -0.572200
Temperature               -0.038067
Humidity                  -0.000504
Traffic_Status             0.529932
Waiting_Time              -0.063036
User_Transaction_Amount    0.013605
User_Purchase_Frequency   -0.017439
Asset_Utilization         -0.003140
Demand_Forecast           -0.018621
Logistics_Delay            1.000000
Name: Logistics_Delay, dtype: float64

In [6]:
# Checking VIF:
def calculate_vif(dataset):
    vif = pd.DataFrame()
    vif['features'] = dataset.columns
    vif['VIF_Values'] = [variance_inflation_factor(dataset.values,i) for i in range(dataset.shape[1])]
    vif['VIF_Values'] = round(vif['VIF_Values'], 2)
    vif = vif.sort_values(by = 'VIF_Values', ascending=False)
    return (vif)

calculate_vif(df.drop('Logistics_Delay',axis = 1))

Unnamed: 0,features,VIF_Values
6,Temperature,39.6
7,Humidity,39.56
12,Asset_Utilization,37.58
13,Demand_Forecast,11.34
4,Inventory_Level,7.66
10,User_Transaction_Amount,7.28
9,Waiting_Time,6.77
11,User_Purchase_Frequency,4.53
0,Timestamp,4.01
1,Asset_ID,3.55


In [54]:
# Model Building
# Step 1 : Divide Dataset into x and y

# X : Independant Column and y : Target Column

X = df.drop(['Logistics_Delay', 'Temperature', 'Humidity', 'Asset_Utilization'], axis = 1)  # Independant Column : 11

y =df['Logistics_Delay']  # Target Column : 1

# Step 2 : Split the Dataset into Train and Test
# 100% -----> 80% Data is used for Training [Seen Data] and 20% Data [Unseen Data] is used for Testing
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 7)

# Step 3 : Normalization Scaling Technique
# Standard Scaler ----> When Dataset is Normally Distributed
# MinMax Scalar ----> When Dataset having Minor Outliers
# Roubst Scalar ----> When Dataset is Non Normal Distributed and having lots of Outliers
from sklearn.preprocessing import MinMaxScaler

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)


# Step 4 : Using SMOTE Technique to Balance Target Column

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train,y_train = sm.fit_resample(X_train,y_train)

In [55]:
LR = LogisticRegression()
# Fit the Model on Training Dataset
LR.fit(X_train,y_train)

# Predict the Value Based on Test Dataset
y_pred = LR.predict(X_test)

# Step 6: Use Matrics to evaluate Model Performances
from sklearn.metrics import accuracy_score,classification_report

accuracy_score_LR = accuracy_score(y_test,y_pred)
accuracy_score_LR

0.86

In [56]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LR, X_train, y_train, cv= 10, scoring= 'accuracy')

print('Accuracy for each fold : ', scores)
print('Average Accuracy across 10 folds : ', np.mean(scores))

Accuracy for each fold :  [0.825      0.8375     0.8625     0.8625     0.86075949 0.82278481
 0.83544304 0.88607595 0.91139241 0.92405063]
Average Accuracy across 10 folds :  0.8628006329113923
