# **Feature Engineering**

Load Data & Inspect Structure

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read data
df = pd.read_csv("/content/drive/MyDrive/Datasets/uber_cleaned.csv")

In [3]:
# View shape
df.shape

(150000, 34)

In [4]:
# Show first rows
df.head()

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,Weekday,Payment Method_missing,Avg CTAT_missing,Avg VTAT_missing,Cancelled Rides by Customer_missing,Cancelled Rides by Driver_missing,Incomplete Rides_missing,Reason for cancelling by Customer_missing,Driver Cancellation Reason_missing,Incomplete Rides Reason_missing
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,-1.0,-1.0,...,5,1,1,1,1,1,1,1,1,1
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,4,0,0,0,1,1,0,1,1,0
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,4,0,0,0,1,1,1,1,1,1
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,0,0,0,0,1,1,1,1,1,1
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,0,0,0,0,1,1,1,1,1,1


In [5]:
# Show info (dtypes + missing)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 34 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   Date                                       150000 non-null  object 
 1   Time                                       150000 non-null  object 
 2   Booking ID                                 150000 non-null  object 
 3   Booking Status                             150000 non-null  object 
 4   Customer ID                                150000 non-null  object 
 5   Vehicle Type                               150000 non-null  object 
 6   Pickup Location                            150000 non-null  object 
 7   Drop Location                              150000 non-null  object 
 8   Avg VTAT                                   150000 non-null  float64
 9   Avg CTAT                                   150000 non-null  float64
 10  Cancelle

In [6]:
# Show statistical description for numeric columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg VTAT,150000.0,7.794407,4.366285,-1.0,4.7,7.8,11.0,20.0
Avg CTAT,150000.0,19.501753,15.864857,-1.0,-1.0,22.0,32.9,45.0
Cancelled Rides by Customer,150000.0,0.07,0.255148,0.0,0.0,0.0,0.0,1.0
Cancelled Rides by Driver,150000.0,0.18,0.384189,0.0,0.0,0.0,0.0,1.0
Incomplete Rides,150000.0,0.06,0.237488,0.0,0.0,0.0,0.0,1.0
Booking Value,150000.0,508.295912,326.389291,50.0,319.75,508.295912,521.0,4277.0
Ride Distance,150000.0,24.637012,11.546441,1.0,17.36,24.637012,30.65,50.0
Driver Ratings,150000.0,4.257215,0.345619,3.0,4.2,4.3,4.3,5.0
Customer Rating,150000.0,4.440842,0.347835,3.0,4.3,4.5,4.6,5.0
Hour,150000.0,14.034113,5.416906,0.0,10.0,15.0,18.0,23.0


In [7]:
# Value counts for categorical columns
for col in df.select_dtypes(include='object').columns:
    print(f"\n---- {col} ----")
    print(df[col].value_counts().head(10))


---- Date ----
Date
2024-11-16    462
2024-05-09    456
2024-09-18    456
2024-10-12    452
2024-01-26    452
2024-02-06    452
2024-07-17    451
2024-10-09    451
2024-10-13    451
2024-11-29    450
Name: count, dtype: int64

---- Time ----
Time
17:44:57    16
19:17:33    12
17:54:33    11
11:29:50    11
10:23:23    11
17:55:08    11
15:23:56    11
18:59:55    11
19:26:48    11
18:10:37    10
Name: count, dtype: int64

---- Booking ID ----
Booking ID
"CNR6337479"    3
"CNR7199036"    3
"CNR3648267"    3
"CNR2726142"    3
"CNR5292943"    3
"CNR7642097"    3
"CNR7585544"    3
"CNR7908610"    3
"CNR9603232"    3
"CNR1130025"    2
Name: count, dtype: int64

---- Booking Status ----
Booking Status
Completed                93000
Cancelled by Driver      27000
No Driver Found          10500
Cancelled by Customer    10500
Incomplete                9000
Name: count, dtype: int64

---- Customer ID ----
Customer ID
"CID6715450"    3
"CID5481002"    3
"CID4523979"    3
"CID6468528"    3
"CID8727

In [8]:
# Convert date column
df['Date'] = pd.to_datetime(df['Date'])

# Convert time column
df['Time'] = pd.to_datetime(df['Time'], format="%H:%M:%S").dt.time

New datetime features

In [9]:
# Convert Time back to datetime to extract hour if needed
df['Time_dt'] = pd.to_datetime(df['Time'].astype(str))

# Weekend
df['Is_Weekend'] = df['Weekday'].isin([5, 6]).astype(int)

# Peak hour
df['Is_Peak_Hour'] = df['Hour'].isin([7,8,9,17,18,19]).astype(int)

# Month features
df['Is_Month_Start'] = df['Date'].dt.is_month_start.astype(int)
df['Is_Month_End'] = df['Date'].dt.is_month_end.astype(int)

# Season
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

df['Season'] = df['Month'].apply(get_season)


  df['Time_dt'] = pd.to_datetime(df['Time'].astype(str))


Encoding Categorical Features

ðŸŽ¯ Columns with LOW categories â†’ One-Hot

Payment Method

Vehicle Type

Booking Status (target)

Season

ðŸŽ¯ Columns with MED categories â†’ Label Encoding

Reason for cancelling by Customer

Driver Cancellation Reason

Incomplete Rides Reason

ðŸŽ¯ Columns with HIGH categories â†’ Frequency Encoding

Pickup Location

Drop Location

In [10]:
#Label Encoding for Reason Columns
from sklearn.preprocessing import LabelEncoder

reason_cols = [
    'Reason for cancelling by Customer',
    'Driver Cancellation Reason',
    'Incomplete Rides Reason'
]

le = LabelEncoder()
for col in reason_cols:
    df[col] = le.fit_transform(df[col])


In [11]:
#One-Hot Encoding for Low-cardinality
one_hot_cols = ['Payment Method', 'Vehicle Type', 'Season']

df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)


In [12]:
#Frequency Encoding for High-cardinality
high_card_cols = ['Pickup Location', 'Drop Location']

for col in high_card_cols:
    freq = df[col].value_counts() / len(df)
    df[col + "_freq"] = df[col].map(freq)

# Drop original text columns
df.drop(high_card_cols, axis=1, inplace=True)


Transform numeric

In [17]:
#VTAT/CTAT are "time duration" â†’ natural median imputation
# Replace -1 in VTAT/CTAT
df['Avg VTAT'] = df['Avg VTAT'].replace(-1, df['Avg VTAT'].median())
df['Avg CTAT'] = df['Avg CTAT'].replace(-1, df['Avg CTAT'].median())

# Log transform heavy skew
df['Booking Value_log'] = np.log1p(df['Booking Value'])
df['Ride Distance_log'] = np.log1p(df['Ride Distance'])


# **Feature Selection**

Remove obviously useless features (Filter stage)

In [18]:
drop_cols = [
    'Booking ID',
    'Customer ID',
    'Time',
    'Date',
    'Time_dt'
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])


Drop constant / quasi-constant features

(Features with almost same value across all rows)

In [19]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)
selector.fit(df.select_dtypes(include=['int64', 'float64']))


In [21]:
df.columns

Index(['Booking Status', 'Avg VTAT', 'Avg CTAT', 'Cancelled Rides by Customer',
       'Reason for cancelling by Customer', 'Cancelled Rides by Driver',
       'Driver Cancellation Reason', 'Incomplete Rides',
       'Incomplete Rides Reason', 'Booking Value', 'Ride Distance',
       'Driver Ratings', 'Customer Rating', 'Hour', 'Day', 'Month', 'Weekday',
       'Payment Method_missing', 'Avg CTAT_missing', 'Avg VTAT_missing',
       'Cancelled Rides by Customer_missing',
       'Cancelled Rides by Driver_missing', 'Incomplete Rides_missing',
       'Reason for cancelling by Customer_missing',
       'Driver Cancellation Reason_missing', 'Incomplete Rides Reason_missing',
       'Is_Weekend', 'Is_Peak_Hour', 'Is_Month_Start', 'Is_Month_End',
       'Payment Method_Credit Card', 'Payment Method_Debit Card',
       'Payment Method_UPI', 'Payment Method_Uber Wallet',
       'Payment Method_none', 'Vehicle Type_Bike', 'Vehicle Type_Go Mini',
       'Vehicle Type_Go Sedan', 'Vehicle Type_Pre

In [22]:
df.to_csv("uber_cleaned_to_train.csv", index=False)