In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance


In [2]:

df = pd.read_csv("/content/drive/MyDrive/Datasets/uber_cleaned_to_train.csv")

In [3]:
df.dtypes

Unnamed: 0,0
Booking Status,object
Avg VTAT,float64
Avg CTAT,float64
Cancelled Rides by Customer,float64
Reason for cancelling by Customer,int64
Cancelled Rides by Driver,float64
Driver Cancellation Reason,int64
Incomplete Rides,float64
Incomplete Rides Reason,int64
Booking Value,float64


In [4]:

bool_cols = df.select_dtypes(include=["bool"]).columns
df[bool_cols] = df[bool_cols].astype("int8")

print(df.dtypes)

Booking Status                                object
Avg VTAT                                     float64
Avg CTAT                                     float64
Cancelled Rides by Customer                  float64
Reason for cancelling by Customer              int64
Cancelled Rides by Driver                    float64
Driver Cancellation Reason                     int64
Incomplete Rides                             float64
Incomplete Rides Reason                        int64
Booking Value                                float64
Ride Distance                                float64
Driver Ratings                               float64
Customer Rating                              float64
Hour                                           int64
Day                                            int64
Month                                          int64
Weekday                                        int64
Payment Method_missing                         int64
Avg CTAT_missing                              

In [5]:

y = df["Booking Status"]
X = df.drop(columns=["Booking Status"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() < 50 else None
)

# **A) Filter Methods (fast, model-agnostic)**

1) Variance Threshold (remove near-constant features)

In [6]:

vt = VarianceThreshold(threshold=0.01)
X_vt = vt.fit_transform(X_train)
selected_vt = X_train.columns[vt.get_support()].tolist()
selected_vt

['Avg VTAT',
 'Avg CTAT',
 'Cancelled Rides by Customer',
 'Reason for cancelling by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Booking Value',
 'Ride Distance',
 'Driver Ratings',
 'Customer Rating',
 'Hour',
 'Day',
 'Month',
 'Weekday',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing',
 'Cancelled Rides by Driver_missing',
 'Incomplete Rides_missing',
 'Reason for cancelling by Customer_missing',
 'Driver Cancellation Reason_missing',
 'Incomplete Rides Reason_missing',
 'Is_Weekend',
 'Is_Peak_Hour',
 'Is_Month_Start',
 'Is_Month_End',
 'Payment Method_Credit Card',
 'Payment Method_Debit Card',
 'Payment Method_UPI',
 'Payment Method_Uber Wallet',
 'Payment Method_none',
 'Vehicle Type_Bike',
 'Vehicle Type_Go Mini',
 'Vehicle Type_Go Sedan',
 'Vehicle Type_Premier Sedan',
 'Vehicle Type_Uber XL',
 'Vehicle Type_eBike',
 'Season_Spring',
 'Seas

2) Correlation Filter (drop highly correlated duplicates)

In [7]:
def drop_high_corr_features(X, threshold=0.95):
    corr = X.corr(numeric_only=True).abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if any(upper[c] > threshold)]
    return to_drop

to_drop_corr = drop_high_corr_features(X_train, threshold=0.95)
selected_corr = [c for c in X_train.columns if c not in to_drop_corr]
to_drop_corr[:10], len(to_drop_corr), len(selected_corr)


(['Avg CTAT_missing',
  'Cancelled Rides by Customer_missing',
  'Cancelled Rides by Driver_missing',
  'Incomplete Rides_missing',
  'Reason for cancelling by Customer_missing',
  'Driver Cancellation Reason_missing',
  'Incomplete Rides Reason_missing',
  'Payment Method_none'],
 8,
 39)

3) Univariate statistical selection

(a) Mutual Information (works for numeric features + classification)

In [8]:


mi = mutual_info_classif(X_train, y_train, random_state=42)
mi_scores = pd.Series(mi, index=X_train.columns).sort_values(ascending=False)

k = min(20, X_train.shape[1])  # choose top-k
skb_mi = SelectKBest(score_func=mutual_info_classif, k=k).fit(X_train, y_train)
selected_mi = X_train.columns[skb_mi.get_support()].tolist()

mi_scores.head(15), selected_mi


(Ride Distance                         0.691139
 Ride Distance_log                     0.688547
 Avg CTAT                              0.680517
 Avg CTAT_missing                      0.630850
 Payment Method_missing                0.630575
 Booking Value_log                     0.629261
 Payment Method_none                   0.629177
 Booking Value                         0.626595
 Customer Rating                       0.534408
 Cancelled Rides by Driver_missing     0.485545
 Driver Cancellation Reason_missing    0.483939
 Driver Cancellation Reason            0.480032
 Cancelled Rides by Driver             0.472501
 Driver Ratings                        0.429959
 Avg VTAT                              0.386157
 dtype: float64,
 ['Avg VTAT',
  'Avg CTAT',
  'Cancelled Rides by Customer',
  'Reason for cancelling by Customer',
  'Cancelled Rides by Driver',
  'Driver Cancellation Reason',
  'Booking Value',
  'Ride Distance',
  'Driver Ratings',
  'Customer Rating',
  'Payment Method_mis

(b) ANOVA F-test (classification) OR F-regression (regression)

In [9]:

skb_f = SelectKBest(score_func=f_classif, k=k).fit(X_train, y_train)
selected_f = X_train.columns[skb_f.get_support()].tolist()
selected_f


  f = msb / msw


['Avg VTAT',
 'Avg CTAT',
 'Cancelled Rides by Customer',
 'Reason for cancelling by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing',
 'Cancelled Rides by Driver_missing',
 'Incomplete Rides_missing',
 'Reason for cancelling by Customer_missing',
 'Driver Cancellation Reason_missing',
 'Incomplete Rides Reason_missing',
 'Payment Method_UPI',
 'Payment Method_none',
 'Ride Distance_log']

(c) Chi-Square (ONLY if all features are non-negative)

In [10]:

chi2_pipe = Pipeline([
    ("minmax", MinMaxScaler()),  # ensures non-negative
    ("chi2", SelectKBest(score_func=chi2, k=k))
])

chi2_pipe.fit(X_train, y_train)
selected_chi2 = X_train.columns[chi2_pipe.named_steps["chi2"].get_support()].tolist()
selected_chi2


['Avg CTAT',
 'Cancelled Rides by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing',
 'Cancelled Rides by Driver_missing',
 'Incomplete Rides_missing',
 'Reason for cancelling by Customer_missing',
 'Driver Cancellation Reason_missing',
 'Incomplete Rides Reason_missing',
 'Payment Method_Credit Card',
 'Payment Method_Debit Card',
 'Payment Method_UPI',
 'Payment Method_Uber Wallet',
 'Payment Method_none']

B) Wrapper Methods (search feature subsets using a model)

4) RFE (Recursive Feature Elimination)

In [11]:

est = LogisticRegression(max_iter=3000, n_jobs=None)
rfe = RFE(estimator=est, n_features_to_select=k)
rfe_pipe = Pipeline([("scaler", StandardScaler()), ("rfe", rfe)])

rfe_pipe.fit(X_train, y_train)
selected_rfe = X_train.columns[rfe_pipe.named_steps["rfe"].get_support()].tolist()
selected_rfe


['Avg VTAT',
 'Avg CTAT',
 'Cancelled Rides by Customer',
 'Reason for cancelling by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing',
 'Cancelled Rides by Driver_missing',
 'Incomplete Rides_missing',
 'Reason for cancelling by Customer_missing',
 'Driver Cancellation Reason_missing',
 'Incomplete Rides Reason_missing',
 'Payment Method_UPI',
 'Payment Method_Uber Wallet',
 'Payment Method_none']

5) Sequential Feature Selection (forward/backward)

In [12]:

sfs = SequentialFeatureSelector(
    estimator=LogisticRegression(max_iter=3000),
    n_features_to_select=k,
    direction="forward",   # or "backward"
    scoring="f1_macro",    # change metric if you want
    cv=5,
    n_jobs=-1
)

sfs_pipe = Pipeline([("scaler", StandardScaler()), ("sfs", sfs)])
sfs_pipe.fit(X_train, y_train)

selected_sfs = X_train.columns[sfs_pipe.named_steps["sfs"].get_support()].tolist()
selected_sfs


['Avg VTAT',
 'Avg CTAT',
 'Cancelled Rides by Customer',
 'Reason for cancelling by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Booking Value',
 'Ride Distance',
 'Driver Ratings',
 'Customer Rating',
 'Hour',
 'Day',
 'Month',
 'Weekday',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing']

C) Embedded Methods (selection happens during training)

6) L1 (Lasso) Logistic Regression (sparse weights)

In [13]:

l1 = LogisticRegression(penalty="l1", solver="liblinear", C=1.0, max_iter=3000)
l1_pipe = Pipeline([("scaler", StandardScaler()), ("model", l1)])
l1_pipe.fit(X_train, y_train)

coef = l1_pipe.named_steps["model"].coef_
# For multiclass, keep features that are non-zero in ANY class
nonzero = np.any(coef != 0, axis=0)
selected_l1 = X_train.columns[nonzero].tolist()
selected_l1


['Avg CTAT',
 'Cancelled Rides by Customer',
 'Reason for cancelling by Customer',
 'Cancelled Rides by Driver',
 'Driver Cancellation Reason',
 'Incomplete Rides',
 'Incomplete Rides Reason',
 'Ride Distance',
 'Payment Method_missing',
 'Avg CTAT_missing',
 'Avg VTAT_missing',
 'Cancelled Rides by Customer_missing',
 'Cancelled Rides by Driver_missing',
 'Incomplete Rides_missing',
 'Reason for cancelling by Customer_missing',
 'Driver Cancellation Reason_missing',
 'Incomplete Rides Reason_missing',
 'Payment Method_Credit Card',
 'Payment Method_Debit Card',
 'Payment Method_UPI',
 'Payment Method_Uber Wallet',
 'Payment Method_none']

7) Tree-based importance (RandomForest / XGBoost-like logic)

In [14]:

rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
selected_rf = imp.head(k).index.tolist()

imp.head(15), selected_rf


(Avg CTAT_missing                             0.098416
 Payment Method_none                          0.094025
 Payment Method_missing                       0.091572
 Driver Cancellation Reason                   0.076620
 Cancelled Rides by Driver_missing            0.069871
 Driver Cancellation Reason_missing           0.067702
 Cancelled Rides by Driver                    0.064806
 Avg VTAT_missing                             0.048493
 Incomplete Rides Reason                      0.043857
 Incomplete Rides Reason_missing              0.042488
 Reason for cancelling by Customer            0.041478
 Incomplete Rides_missing                     0.040792
 Incomplete Rides                             0.038559
 Reason for cancelling by Customer_missing    0.032719
 Cancelled Rides by Customer_missing          0.030029
 dtype: float64,
 ['Avg CTAT_missing',
  'Payment Method_none',
  'Payment Method_missing',
  'Driver Cancellation Reason',
  'Cancelled Rides by Driver_missing',
  'Driver Ca

8) Permutation Importance (model-agnostic, more trustworthy than raw importance)

In [15]:

perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
perm_imp = pd.Series(perm.importances_mean, index=X_train.columns).sort_values(ascending=False)
selected_perm = perm_imp.head(k).index.tolist()

perm_imp.head(15), selected_perm


(Avg VTAT                             0.0
 Avg CTAT                             0.0
 Cancelled Rides by Customer          0.0
 Reason for cancelling by Customer    0.0
 Cancelled Rides by Driver            0.0
 Driver Cancellation Reason           0.0
 Incomplete Rides                     0.0
 Incomplete Rides Reason              0.0
 Booking Value                        0.0
 Ride Distance                        0.0
 Driver Ratings                       0.0
 Customer Rating                      0.0
 Hour                                 0.0
 Day                                  0.0
 Month                                0.0
 dtype: float64,
 ['Avg VTAT',
  'Avg CTAT',
  'Cancelled Rides by Customer',
  'Reason for cancelling by Customer',
  'Cancelled Rides by Driver',
  'Driver Cancellation Reason',
  'Incomplete Rides',
  'Incomplete Rides Reason',
  'Booking Value',
  'Ride Distance',
  'Driver Ratings',
  'Customer Rating',
  'Hour',
  'Day',
  'Month',
  'Weekday',
  'Payment Method

D) Put everything together (one dictionary of selected features)

In [16]:
results = {
    "variance_threshold": selected_vt,
    "corr_filtered": selected_corr,
    "mutual_info_topk": selected_mi,
    "anova_f_topk": selected_f,
    "chi2_topk": selected_chi2,
    "rfe": selected_rfe,
    "sfs": selected_sfs,
    "l1_embedded": selected_l1,
    "rf_topk": selected_rf,
    "permutation_topk": selected_perm
}

{m: len(v) for m, v in results.items()}


{'variance_threshold': 45,
 'corr_filtered': 39,
 'mutual_info_topk': 20,
 'anova_f_topk': 20,
 'chi2_topk': 20,
 'rfe': 20,
 'sfs': 20,
 'l1_embedded': 22,
 'rf_topk': 20,
 'permutation_topk': 20}