## Initial Scoring Model

This is the first scoring model. It trains both an XGBoost and a LightGBM regressor and uses their feature importances to show that the features we picked are actually important.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from tkinter.filedialog import asksaveasfilename

In [None]:
# Load the cleaned Jan–Feb file with all engineered features.
# A hidden Tk root is needed to show the dialog; keep a handle on it so it can
# be destroyed afterwards instead of leaving an orphaned Tk instance in the kernel.
_dialog_root = Tk()
_dialog_root.withdraw()
try:
    file_path = askopenfilename(title="Select Jan–Feb Cleaned with Features CSV")
finally:
    _dialog_root.destroy()

# The dialog yields an empty value when it is cancelled; fail with a clear
# message instead of handing an empty path to pd.read_csv.
if not file_path:
    raise FileNotFoundError("No file selected in the file dialog.")

df = pd.read_csv(file_path)

print("Loaded:", file_path)
print(df.shape)

## XGBOOST

In [2]:
# Load file
# This cell reads the CSV through its own dialog, so it does not rely on any
# DataFrame left in the kernel by an earlier cell.
# Keep a handle on the hidden Tk root so it can be destroyed once the dialog closes.
_dialog_root = Tk()
_dialog_root.withdraw()
try:
    file_path = askopenfilename(title="Select Cleaned CSV with Hotspot Score")
finally:
    _dialog_root.destroy()

# A cancelled dialog yields an empty value; stop here with a clear message.
if not file_path:
    raise FileNotFoundError("No file selected in the file dialog.")

df = pd.read_csv(file_path)
print("Loaded:", file_path)
print("Shape:", df.shape)

# Step 1: Set your target and features
target_col = 'fare_per_minute'
categorical_cols = ['pickup_borough', 'dropoff_borough', 'time_of_day', 'is_airport_trip']
numeric_cols = ['fare_per_mile', 'dropoff_zone_hotness', 'trip_duration_variability', 'hotspot_score']
feature_cols = categorical_cols + numeric_cols

# Step 2: Ensure date column is parsed and clean
df['pickup_date'] = pd.to_datetime(df['pickup_date'])
df = df.dropna(subset=[target_col])  # Drop rows with missing target

# Step 3: Time-based split (Jan → train, Feb → test)
# The split looks at the calendar month only (the year is not checked).
train_df = df[df['pickup_date'].dt.month == 1].copy()
test_df = df[df['pickup_date'].dt.month == 2].copy()

# Fail early with a readable message if either month is missing from the file.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Time-based split needs rows in both months "
        f"(January rows: {len(train_df)}, February rows: {len(test_df)})."
    )

# Step 4: Preprocess features
# The training frame defines the dummy columns (first level of each encoded column dropped).
X_train_cat = pd.get_dummies(train_df[categorical_cols], drop_first=True)

# Encode the test frame with ALL levels and let the reindex below select the
# training columns. Applying drop_first to the test frame independently would
# drop the test set's own first level, which is not necessarily the level that
# was dropped in training; the reindex would then refill that column with zeros
# and silently mis-encode those rows.
X_test_cat = pd.get_dummies(test_df[categorical_cols])

# Align test set columns to training set (levels unseen in training are discarded,
# training levels absent from the test set become all-zero columns).
X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

# Both halves are reset to a fresh positional index before being joined side by side.
X_train = pd.concat([train_df[numeric_cols].reset_index(drop=True), X_train_cat.reset_index(drop=True)], axis=1)
X_test = pd.concat([test_df[numeric_cols].reset_index(drop=True), X_test_cat.reset_index(drop=True)], axis=1)

# The targets keep the original row labels of df; their row ORDER matches X_train / X_test.
y_train = train_df[target_col]
y_test = test_df[target_col]

# Step 5: Train model
model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Step 7: Output results
print("Time-based Scoring Model Evaluation (Train: Jan, Test: Feb):")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print("\nTop Features:")
print(feature_importance.head(10))


Loaded: C:/diksha/Summer Sem/DataAnalysis/Data/Full Scoring Model/jan_feb_all_features.csv
Shape: (5609910, 32)
Time-based Scoring Model Evaluation (Train: Jan, Test: Feb):
R² Score: 0.4243
MAE: 0.1752

Top Features:
dropoff_borough_EWR          0.257927
fare_per_mile                0.204493
is_airport_trip              0.094014
trip_duration_variability    0.072462
time_of_day_Evening Rush     0.070376
dropoff_borough_Brooklyn     0.068495
time_of_day_Night            0.042064
time_of_day_Midday           0.034612
pickup_borough_Queens        0.032801
dropoff_borough_Manhattan    0.029399
dtype: float32


## LIGHTGBM

In [3]:
# Load file
# This cell reads the CSV through its own dialog, so it does not rely on any
# DataFrame left in the kernel by an earlier cell.
# Keep a handle on the hidden Tk root so it can be destroyed once the dialog closes.
_dialog_root = Tk()
_dialog_root.withdraw()
try:
    file_path = askopenfilename(title="Select Cleaned CSV with Hotspot Score")
finally:
    _dialog_root.destroy()

# A cancelled dialog yields an empty value; stop here with a clear message.
if not file_path:
    raise FileNotFoundError("No file selected in the file dialog.")

df = pd.read_csv(file_path)
print("Loaded:", file_path)
print("Shape:", df.shape)

# Step 1: Set target and features
target_col = 'fare_per_minute'
categorical_cols = ['pickup_borough', 'dropoff_borough', 'time_of_day', 'is_airport_trip']
numeric_cols = ['fare_per_mile', 'dropoff_zone_hotness', 'trip_duration_variability', 'hotspot_score']
feature_cols = categorical_cols + numeric_cols

# Step 2: Parse pickup_date and drop rows with a missing target
df['pickup_date'] = pd.to_datetime(df['pickup_date'])
df = df.dropna(subset=[target_col])

# Step 3: Time-based split (month 1 → train, month 2 → test)
# The split looks at the calendar month only (the year is not checked).
train_df = df[df['pickup_date'].dt.month == 1].copy()
test_df = df[df['pickup_date'].dt.month == 2].copy()

# Fail early with a readable message if either month is missing from the file.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Time-based split needs rows in both months "
        f"(January rows: {len(train_df)}, February rows: {len(test_df)})."
    )

# Step 4: Encode categorical features
# The training frame defines the dummy columns (first level of each encoded column dropped).
X_train_cat = pd.get_dummies(train_df[categorical_cols], drop_first=True)

# Encode the test frame with ALL levels and let the reindex select the training
# columns. Applying drop_first to the test frame independently would drop the
# test set's own first level, which is not necessarily the level dropped in
# training; the reindex would then refill that column with zeros and silently
# mis-encode those rows.
X_test_cat = pd.get_dummies(test_df[categorical_cols])
X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

# Combine with numeric (both halves are reset to a fresh positional index first)
X_train = pd.concat([train_df[numeric_cols].reset_index(drop=True), X_train_cat.reset_index(drop=True)], axis=1)
X_test = pd.concat([test_df[numeric_cols].reset_index(drop=True), X_test_cat.reset_index(drop=True)], axis=1)

# The targets keep the original row labels of df; their row ORDER matches X_train / X_test.
y_train = train_df[target_col]
y_test = test_df[target_col]

# Step 5: Train LightGBM
model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): with the default settings used here the LightGBM importances are
# integer counts (the recorded output is int32), whereas the XGBoost cell reports
# float importances. The two rankings are therefore not on the same scale; consider
# LGBMRegressor(importance_type="gain") if a like-for-like comparison is wanted.
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Step 7: Output results
print("LightGBM Time-Based Model (Train: Jan, Test: Feb):")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print("\nTop Features:")
print(feature_importance.head(10))


Loaded: C:/diksha/Summer Sem/DataAnalysis/Data/Full Scoring Model/jan_feb_all_features.csv
Shape: (5609910, 32)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 793
[LightGBM] [Info] Number of data points in the train set: 2871008, number of used features: 17
[LightGBM] [Info] Start training from score 1.262351
LightGBM Time-Based Model (Train: Jan, Test: Feb):
R² Score: 0.4091
MAE: 0.1776

Top Features:
fare_per_mile                1244
trip_duration_variability     555
dropoff_zone_hotness          388
time_of_day_Evening Rush      111
is_airport_trip               101
time_of_day_Midday             86
time_of_day_Night              79
pickup_borough_Queens          73
dropoff_borough_Manhattan      70
time_of_day_Morning Rush       69
dtype: int32
