### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score,recall_score,f1_score

### Importing dataset

In [2]:
data=pd.read_csv('train.csv')

In [3]:
data.columns

Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')

In [4]:
X=data.drop(columns=['Reached.on.Time_Y.N'])
y=data['Reached.on.Time_Y.N']

In [5]:
X.dtypes

ID                      int64
Warehouse_block        object
Mode_of_Shipment       object
Customer_care_calls     int64
Customer_rating         int64
Cost_of_the_Product     int64
Prior_purchases         int64
Product_importance     object
Gender                 object
Discount_offered        int64
Weight_in_gms           int64
dtype: object

### Label Encoding for categorical features

#### First approach: Label Encoding

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

data['Warehouse_block_label']=le.fit_transform(data['Warehouse_block'])
data['Mode_of_Shipment_label']=le.fit_transform(data['Mode_of_Shipment'])
data['Product_importance_label']=le.fit_transform(data['Product_importance'])
data['Gender_label']=le.fit_transform(data['Gender'])

data.drop(columns=['Warehouse_block','Mode_of_Shipment','Product_importance','Gender'])

This will replace each string category with an integer (e.g., F → 0, M → 1).

But Logistic Regression might “think” that 0 < 1 < 2 has an order, which isn’t true for categories like Mode_of_Shipment (Ship, Flight, Road). So LabelEncoder can mislead the model.

#### Second approach: OneHotEncoder / pd.get_dummies()

In [6]:
X=pd.get_dummies(data.drop(columns=['ID']), drop_first=True)

### Train test split

In [7]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2, random_state=42, stratify=y)

In [8]:
X_train.shape

(8799, 16)

In [9]:
X_test.shape

(2200, 16)

### Fit the model

In [23]:
X_train.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

Our Logistic Regression is throwing a ConvergenceWarning

Logistic regression converges faster if features are on the same scale. So I used StandardScaler

In [10]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

model=LogisticRegression(max_iter=500)
model.fit(X_train, y_train)


In [11]:
y_pred=model.predict(X_test)
y_pred

array([1, 1, 0, ..., 1, 0, 1])

In [12]:
accuracy_score(y_test,y_pred)

1.0

In [13]:
confusion_matrix(y_test,y_pred)

array([[ 887,    0],
       [   0, 1313]])

In [14]:
precision_score(y_test,y_pred)

1.0

In [15]:
recall_score(y_test,y_pred)

1.0

In [16]:
f1=f1_score(y_test,y_pred)

In [17]:
print(f1)

1.0


In [18]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       887
           1       1.00      1.00      1.00      1313

    accuracy                           1.00      2200
   macro avg       1.00      1.00      1.00      2200
weighted avg       1.00      1.00      1.00      2200



In [19]:
print(model.score(X_train, y_train))

1.0


In [21]:
print(model.score(X_test, y_test))

1.0
