In [25]:
import pandas as pd

In [26]:
# Load the raw shipment records; the first CSV column is used as the row index
raw_data_df = pd.read_csv(
    "logistic_optimization/ml/data/raw/data.csv",
    index_col=0,
)
raw_data_df.head()

Unnamed: 0_level_0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,D,Flight,4,2,177,3,low,F,44,1233,1
2,F,Flight,4,5,216,2,low,M,59,3088,1
3,A,Flight,2,2,183,4,low,M,48,3374,1
4,B,Flight,3,3,176,4,medium,M,10,1177,1
5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [27]:
# Work on an independent (deep) copy so the raw frame is never modified
df = raw_data_df.copy(deep=True)

In [28]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64

In [29]:
# Number of rows that exactly repeat an earlier row (the first occurrence is
# not counted as a duplicate)
print(df.duplicated(keep="first").sum())

0


In [30]:
# Show the dtype pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

Warehouse_block        object
Mode_of_Shipment       object
Customer_care_calls     int64
Customer_rating         int64
Cost_of_the_Product     int64
Prior_purchases         int64
Product_importance     object
Gender                 object
Discount_offered        int64
Weight_in_gms           int64
Reached.on.Time_Y.N     int64
dtype: object


In [31]:
# Cast the string-valued (object) columns to pandas' category dtype
for column_name in (
    "Warehouse_block",
    "Mode_of_Shipment",
    "Product_importance",
    "Gender",
):
    df[column_name] = df[column_name].astype("category")

df

Unnamed: 0_level_0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,D,Flight,4,2,177,3,low,F,44,1233,1
2,F,Flight,4,5,216,2,low,M,59,3088,1
3,A,Flight,2,2,183,4,low,M,48,3374,1
4,B,Flight,3,3,176,4,medium,M,10,1177,1
5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...
10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10997,C,Ship,5,4,242,5,low,F,4,1155,0
10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [32]:
# Separate (deep) copy that the feature-engineering steps operate on
bronze_df = df.copy(deep=True)

In [33]:
from sklearn.preprocessing import StandardScaler

# Columns treated as continuous features; they are standardised below
numeric_columns = [
    "Customer_care_calls",
    "Cost_of_the_Product",
    "Prior_purchases",
    "Discount_offered",
    "Weight_in_gms",
]

# Columns expanded into 0/1 indicator columns. Customer_rating is stored as an
# integer but is one-hot encoded here, i.e. handled as a category.
OHE_columns = [
    "Warehouse_block",
    "Mode_of_Shipment",
    "Customer_rating",
    "Product_importance",
    "Gender",
]

# Start from a fresh copy of `df` every time this cell runs. Transforming the
# existing `bronze_df` in place made the cell fail on a second run, because
# get_dummies has already removed the OHE columns from it.
bronze_df = df.copy()

# Use standard scaler for numeric columns
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split, so statistics of the eventual test rows influence the
# scaled features. Fit preprocessing on the training split only (for example
# inside an sklearn Pipeline) when measuring generalisation.
scaler = StandardScaler()
bronze_df[numeric_columns] = scaler.fit_transform(bronze_df[numeric_columns])

# Use one-hot encoding for categorical columns (integer 0/1 indicators)
bronze_df = pd.get_dummies(bronze_df, columns=OHE_columns, dtype=int)

bronze_df.head()

Unnamed: 0_level_0,Customer_care_calls,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,...,Customer_rating_1,Customer_rating_2,Customer_rating_3,Customer_rating_4,Customer_rating_5,Product_importance_high,Product_importance_low,Product_importance_medium,Gender_F,Gender_M
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.047711,-0.690722,-0.372735,1.889983,-1.46824,1,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
2,-0.047711,0.120746,-1.029424,2.815636,-0.333893,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,-1.799887,-0.565881,0.283954,2.136824,-0.159002,1,1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
4,-0.923799,-0.711529,0.283954,-0.208162,-1.502484,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,1
5,-1.799887,-0.545074,-0.372735,2.013404,-0.703244,1,0,0,1,0,...,0,1,0,0,0,0,0,1,1,0


In [34]:
# Save the processed data.
# The ID index is written as the first column (index=True). With index=False
# the IDs were dropped from the file, and any reader using index_col=0 would
# silently consume the first feature column as the index instead.
bronze_df.to_csv("logistic_optimization/ml/data/processed_data.csv", index=True)

In [35]:
# Train a Random Forest Classifier on the processed (scaled + one-hot encoded) data
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the processed file without assuming what its first column is: reading
# with index_col=0 a file that was written without its index would turn the
# first feature column into the index and drop it from the features.
gold_df = pd.read_csv("logistic_optimization/ml/data/processed_data.csv")
if "ID" in gold_df.columns:
    # The file carries the row identifier as a column; use it as the index so
    # it is never fed to the model as a feature.
    gold_df = gold_df.set_index("ID")

# Split the data into features and target. The frame reloaded from disk is
# used (previously it was loaded but ignored in favour of the in-memory
# bronze_df), so the model is trained on exactly what was persisted.
X = gold_df.drop(columns=["Reached.on.Time_Y.N"])
y = gold_df["Reached.on.Time_Y.N"]

# Split the data into training and testing sets (80/20, stratified on the
# target so both splits keep the same class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the model; class_weight="balanced" reweights classes inversely to
# their frequency in the training labels
rf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Persist the fitted classifier. It expects already scaled / one-hot encoded
# inputs, because the preprocessing is not part of the saved object.
joblib.dump(rf, "logistic_optimization/ml/model/rf_model.pkl")

Accuracy: 0.6509090909090909
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.67      0.61       887
           1       0.74      0.64      0.69      1313

    accuracy                           0.65      2200
   macro avg       0.65      0.65      0.65      2200
weighted avg       0.67      0.65      0.65      2200



['logistic_optimization/ml/model/rf_model.pkl']

In [36]:
# Reproducible end-to-end pipeline: preprocessing and classifier live in one
# estimator, so they are fitted together and saved together
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

raw_data_df = pd.read_csv(
    "logistic_optimization/ml/data/raw/data.csv",
    index_col=0,
)

# Feature groups; each group is routed to its own transformer below
numeric_columns = [
    "Customer_care_calls",
    "Cost_of_the_Product",
    "Prior_purchases",
    "Discount_offered",
    "Weight_in_gms",
]

OHE_columns = [
    "Warehouse_block",
    "Mode_of_Shipment",
    "Customer_rating",
    "Product_importance",
    "Gender",
]

# Numeric features are standardised
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features are one-hot encoded; handle_unknown="ignore" means a
# category not seen during fit does not raise at transform time
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("onehot", onehot_encoder)])

# Apply each transformer to its column group
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, OHE_columns),
    ]
)

# Full model: preprocessing followed by the random forest
forest = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)

# Target first, then every remaining column as features
y = raw_data_df["Reached.on.Time_Y.N"]
X = raw_data_df.drop(columns=["Reached.on.Time_Y.N"])

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit on the training split only (scaler and encoder included), then predict
# the held-out rows; fit() returns the fitted pipeline itself
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report hold-out performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Persist the whole pipeline (preprocessing + classifier) as a single artifact
joblib.dump(model, "logistic_optimization/ml/model/rf_model_pipeline_final.pkl")

Accuracy: 0.6504545454545455
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.67      0.61       887
           1       0.74      0.64      0.69      1313

    accuracy                           0.65      2200
   macro avg       0.65      0.65      0.65      2200
weighted avg       0.67      0.65      0.65      2200



['logistic_optimization/ml/model/rf_model_pipeline_final.pkl']