# Machine Learning

Every cancelled ride is lost revenue and wasted time

By predicting ride cancellations before they happen, ride-hailing platforms can:

- Improve customer experience
- Minimize lost revenue

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [3]:
data2= pd.read_csv('final.csv', index_col= 0)

In [4]:
numeric= list(data2.describe().columns)
numeric

['Avg VTAT',
 'Avg CTAT',
 'Cancelled Rides by Customer',
 'Cancelled Rides by Driver',
 'Incomplete Rides',
 'Booking Value',
 'Ride Distance',
 'Driver Ratings',
 'Customer Rating',
 'Month',
 'Day',
 'Hour',
 'Is Weekend']

In [5]:
data2[numeric]= data2[numeric].fillna(data2[numeric].mean())

In [6]:
data2['Booking Status'].unique()

array(['No Driver Found', 'Incomplete', 'Completed',
       'Cancelled by Driver', 'Cancelled by Customer'], dtype=object)

In [7]:
data2['is_cancelled'] = data2['Booking Status'].apply(
    lambda x: 0 if x == 'Completed' else 1
)
data2['is_cancelled'].value_counts()

is_cancelled
0    92248
1    56519
Name: count, dtype: int64

In [8]:
data_categorical = data2.select_dtypes(include= ['object','category'])
data_categorical.head()

Unnamed: 0,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Reason for cancelling by Customer,Driver Cancellation Reason,Incomplete Rides Reason,Payment Method,time_of_day
0,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,,,Morning
1,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,,,Vehicle Breakdown,UPI,Afternoon
2,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,,,,Debit Card,Morning
3,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,,,,UPI,Afternoon
4,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,,,,UPI,Evening


In [9]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

ecoder = LabelEncoder()
for col in data_categorical.columns:
    data2[col] = ecoder.fit_transform(data_categorical[col])

In [10]:
data2.columns

Index(['Booking ID', 'Booking Status', 'Customer ID', 'Vehicle Type',
       'Pickup Location', 'Drop Location', 'Avg VTAT', 'Avg CTAT',
       'Cancelled Rides by Customer', 'Reason for cancelling by Customer',
       'Cancelled Rides by Driver', 'Driver Cancellation Reason',
       'Incomplete Rides', 'Incomplete Rides Reason', 'Booking Value',
       'Ride Distance', 'Driver Ratings', 'Customer Rating', 'Payment Method',
       'Month', 'Day', 'Hour', 'Is Weekend', 'time_of_day', 'is_cancelled'],
      dtype='object')

In [11]:
data2= data2.drop({'Booking ID', 'Customer ID', 'Pickup Location', 'Drop Location', 'Booking Status', 'Cancelled Rides by Customer', 'Reason for cancelling by Customer',
       'Cancelled Rides by Driver', 'Driver Cancellation Reason',
       'Incomplete Rides', 'Incomplete Rides Reason'}, axis= 1)

In [32]:
X = data2.drop('is_cancelled', axis=1)
y = data2['is_cancelled']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42, stratify= y)

In [33]:
X_train

Unnamed: 0,Vehicle Type,Avg VTAT,Avg CTAT,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method,Month,Day,Hour,Is Weekend,time_of_day
34079,0,2.300000,44.100000,639.00000,8.450000,3.350000,4.100000,0,5,10,22,0,1
113697,1,6.600000,34.300000,654.00000,15.130000,4.500000,4.300000,3,4,11,16,0,0
12648,6,6.600000,29.150249,508.29023,24.640956,4.240752,4.408932,5,2,8,18,0,0
141685,1,4.700000,29.150249,508.29023,24.640956,4.240752,4.408932,5,5,24,12,0,2
143131,1,14.200000,16.700000,779.00000,12.320000,5.000000,3.800000,4,2,13,16,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67144,3,8.454819,29.150249,508.29023,24.640956,4.240752,4.408932,5,6,26,16,0,0
70434,3,8.000000,29.150249,508.29023,24.640956,4.240752,4.408932,5,9,5,8,0,2
139396,1,4.100000,18.900000,427.00000,9.700000,4.200000,3.700000,3,12,8,21,1,1
64069,2,8.454819,29.150249,508.29023,24.640956,4.240752,4.408932,5,11,14,19,0,1


In [34]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### LogisticRegression

In [35]:
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9096592054849768


### KNeighbors Classifier

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
model= KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9687773072528063
