## Monthly Models

This is the last step in creating the model. It requires data files that contain both the hotness and trip-duration features. This notebook uses the sliding-window method to train models for all months of 2023. When run, all files are read from and saved to the local machine.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import pickle
import json
import joblib
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from tkinter.filedialog import asksaveasfilename
import pytz

## Monthly Feature Engineering 

In [None]:
def run_feature_engineering_pipeline():
    """Interactively build hotness and duration-variability features for two months.

    Prompts (through Tk file dialogs) for a Month A and a Month B CSV, then:

    1. Parses ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime`` and
       localizes them to America/New_York.
    2. Builds a dropoff-zone hotness table: the number of trips per
       (``dropoff_zone``, pickup day of week, pickup hour).
    3. Builds a trip-duration table: mean and standard deviation of
       ``trip_duration_min`` per (``pickup_zone``, ``dropoff_zone``, pickup day
       of week, pickup hour).
    4. Joins both tables onto each month and writes the hotness table, the
       duration table and the combined dataset to CSV paths chosen in save
       dialogs (each save is skipped if its dialog is cancelled).

    The input CSVs must contain ``tpep_pickup_datetime``,
    ``tpep_dropoff_datetime``, ``pickup_zone``, ``dropoff_zone`` and
    ``trip_duration_min``.

    Returns:
        None. Returns early, without processing, if either open dialog is
        cancelled.

    Raises:
        ValueError: if a selected file has no parseable pickup timestamps.
    """
    # Keep a handle on the hidden root window so it can be destroyed when the
    # pipeline finishes instead of lingering for the rest of the kernel session.
    root = Tk()
    root.withdraw()
    try:
        # File selection (the dialogs return '' when cancelled)
        print("Select Month A CSV (used for training and variability calculation):")
        file_a = askopenfilename()
        if not file_a:
            print("Month A selection cancelled.")
            return
        print("Select Month B CSV (used to calculate hotness):")
        file_b = askopenfilename()
        if not file_b:
            print("Month B selection cancelled.")
            return

        # Read every column as text so nothing is silently re-typed on load
        df_a = pd.read_csv(file_a, dtype=str)
        df_b = pd.read_csv(file_b, dtype=str)

        nyc_tz = pytz.timezone("America/New_York")

        for df in [df_a, df_b]:
            for col in ['tpep_pickup_datetime', 'tpep_dropoff_datetime']:
                parsed = pd.to_datetime(df[col], errors='coerce')
                # Localize naive timestamps to NYC time (assumes original times are NYC).
                # Ambiguous DST-overlap times become NaT; non-existent ones shift forward.
                df[col] = parsed.dt.tz_localize(nyc_tz, ambiguous='NaT', nonexistent='shift_forward')

        def dominant_month(df, label):
            """Return the most frequent pickup month number in ``df``."""
            months = df['tpep_pickup_datetime'].dropna().dt.month
            if months.empty:
                raise ValueError(f"{label} has no parseable tpep_pickup_datetime values.")
            return int(months.mode()[0])

        # Extract month names early (the year is only a placeholder for formatting)
        month_a = dominant_month(df_a, "Month A")
        month_b = dominant_month(df_b, "Month B")
        name_a = pd.to_datetime(f"2023-{month_a}-01").strftime('%b')
        name_b = pd.to_datetime(f"2023-{month_b}-01").strftime('%b')

        print(f"\nLoaded Month A ({name_a}): {df_a.shape[0]} rows")
        print(f"Loaded Month B ({name_b}): {df_b.shape[0]} rows")

        # Combine both months.
        # NOTE(review): both lookup tables below are computed from this combined
        # frame, although the prompts describe Month A as the variability source
        # and Month B as the hotness source — confirm which is intended.
        df_all = pd.concat([df_a, df_b], ignore_index=True)

        # -------------------------------------
        # Dropoff Zone Hotness Table
        # -------------------------------------
        df_all['pickup_day_of_week'] = df_all['tpep_pickup_datetime'].dt.dayofweek
        df_all['pickup_hour'] = df_all['tpep_pickup_datetime'].dt.hour

        df_hotness_source = df_all.dropna(subset=['pickup_day_of_week', 'pickup_hour', 'dropoff_zone'])

        # Trip counts per dropoff zone, keyed by the *pickup* day/hour of those trips
        hotness_table = (
            df_hotness_source
            .groupby(['dropoff_zone', 'pickup_day_of_week', 'pickup_hour'])
            .size()
            .reset_index(name='dropoff_zone_hotness')
        )

        # Save hotness table
        print("\nSelect where to save the hotness table")
        hotness_path = asksaveasfilename(
            initialfile=f"hotness_table_{name_b.lower()}.csv",
            defaultextension=".csv",
            filetypes=[("CSV files", "*.csv")]
        )
        if hotness_path:
            hotness_table.to_csv(hotness_path, index=False)
            print(f"Saved hotness table: {hotness_path}")
        else:
            print("Hotness table save cancelled.")

        # The lookup is joined on the trip's *dropoff* day/hour. Renaming the key
        # columns up front lets the merge use plain `on=` keys, so an input that
        # already carries pickup_day_of_week / pickup_hour does not end up with
        # suffixed `_x` / `_y` duplicates.
        hotness_keys = ['dropoff_zone', 'dropoff_day_of_week', 'dropoff_hour']
        hotness_lookup = hotness_table.rename(columns={
            'pickup_day_of_week': 'dropoff_day_of_week',
            'pickup_hour': 'dropoff_hour',
        })

        def apply_hotness(df):
            """Attach dropoff_zone_hotness and its log1p transform to ``df``."""
            df['dropoff_day_of_week'] = df['tpep_dropoff_datetime'].dt.dayofweek
            df['dropoff_hour'] = df['tpep_dropoff_datetime'].dt.hour
            df = df.merge(hotness_lookup, on=hotness_keys, how='left')
            # Combinations absent from the lookup count as zero trips
            df['dropoff_zone_hotness'] = df['dropoff_zone_hotness'].fillna(0)
            df['log_dropoff_zone_hotness'] = np.log1p(df['dropoff_zone_hotness'])
            return df

        df_a = apply_hotness(df_a)
        df_b = apply_hotness(df_b)

        # -------------------------------------
        # Trip Duration Variability Table
        # -------------------------------------
        # Ensure trip_duration_min is numeric (it was read as text)
        df_all['trip_duration_min'] = pd.to_numeric(df_all['trip_duration_min'], errors='coerce')

        # Drop rows with missing duration before grouping
        duration_group_cols = ['pickup_zone', 'dropoff_zone', 'pickup_day_of_week', 'pickup_hour']
        duration_stats = (
            df_all.dropna(subset=['trip_duration_min'])
            .groupby(duration_group_cols)['trip_duration_min']
            .agg(['mean', 'std'])
            .reset_index()
            .rename(columns={'std': 'trip_duration_variability'})
        )

        # Save duration variability stats
        print("\nSelect where to save the trip duration stats table")
        duration_path = asksaveasfilename(
            initialfile=f"duration_variability_{name_b.lower()}.csv",
            defaultextension=".csv",
            filetypes=[("CSV files", "*.csv")]
        )
        if duration_path:
            duration_stats.to_csv(duration_path, index=False)
            print(f"Saved duration stats: {duration_path}")
        else:
            print("Duration stats save cancelled.")

        def apply_variability(df):
            """Attach the per-route duration mean and variability to ``df``."""
            df['pickup_day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek
            df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
            df = df.merge(duration_stats, on=duration_group_cols, how='left')
            # std is NaN for single-trip groups and for unmatched rows; use 0 for both
            df['trip_duration_variability'] = df['trip_duration_variability'].fillna(0)
            return df

        df_a = apply_variability(df_a)
        df_b = apply_variability(df_b)

        # -------------------------------------
        # Final Save
        # -------------------------------------
        df_a['pickup_date'] = df_a['tpep_pickup_datetime'].dt.date
        df_b['pickup_date'] = df_b['tpep_pickup_datetime'].dt.date
        df_final = pd.concat([df_a, df_b], ignore_index=True)

        print("\nSelect location to save the final dataset")
        final_path = asksaveasfilename(
            initialfile=f"Data_with_Features_{name_a}_{name_b}.csv",
            defaultextension=".csv",
            filetypes=[("CSV files", "*.csv")]
        )
        if final_path:
            df_final.to_csv(final_path, index=False)
            print(f"Final file saved: {final_path}")
            print(f"Final shape: {df_final.shape}")
        else:
            print("Final data save cancelled.")
    finally:
        root.destroy()

# -------------------------------------
# RUN IT
# Executing this cell calls the pipeline defined above, which opens the
# file-selection and save dialogs.
# -------------------------------------
run_feature_engineering_pipeline()

Select Month A CSV (used for training and variability calculation):
Select Month B CSV (used to calculate hotness):

Loaded Month A (Nov): 3072802 rows
Loaded Month B (Dec): 3147996 rows

Select where to save the hotness table
Saved hotness table: C:/diksha/Summer Sem/ScoringModel/Models/December/hotness_table_dec.csv

Select where to save the trip duration stats table
Saved duration stats: C:/diksha/Summer Sem/ScoringModel/Models/December/duration_variability_nov_dec.csv

Select location to save the final dataset
Final file saved: C:/diksha/Summer Sem/ScoringModel/Data/Hotness and Duration/Data_with_Features_Nov_Dec.csv
Final shape: (6220798, 35)


## Scoring and Training Code (XGBoost + LightGBM)

In [None]:
# -------------------------------------
# 1. FEATURE PREP FUNCTION
# -------------------------------------
def prepare_model_data(df):
    """Return a model-ready copy of ``df``; the input frame is not modified.

    Steps:
      * parse ``pickup_date`` to datetime; unparseable values become ``NaT``
        rather than raising, so callers can drop them instead of losing the
        whole run to one malformed value,
      * encode ``pickup_hour`` cyclically as ``sin_hour`` / ``cos_hour``
        (period 24),
      * drop ``pickup_hour`` and ``time_of_day`` if present,
      * drop rows whose ``fare_per_minute`` (the training target) is missing.

    Args:
        df: DataFrame with at least ``pickup_date``, ``pickup_hour`` and
            ``fare_per_minute`` columns.

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()
    df['pickup_date'] = pd.to_datetime(df['pickup_date'], errors='coerce')

    # sin/cos hour: map the 24-hour clock onto a circle so 23:00 sits next to 00:00
    hour_angle = 2 * np.pi * df['pickup_hour'] / 24
    df['sin_hour'] = np.sin(hour_angle)
    df['cos_hour'] = np.cos(hour_angle)
    df = df.drop(columns=['pickup_hour', 'time_of_day'], errors='ignore')

    # Ensure fare_per_minute is usable
    df = df.dropna(subset=['fare_per_minute'])  # required target

    return df

# -------------------------------------
# 2. TRAIN + EVAL FUNCTION
# -------------------------------------
def train_model(train_df, test_df, model_type='xgb', month_str=None):
    """Fit a gradient-boosted regressor for ``fare_per_minute`` and evaluate it.

    Args:
        train_df: Training rows. Must contain the categorical columns
            (``is_airport_trip``, ``pickup_borough``, ``dropoff_borough``), the
            numeric columns (``dropoff_zone_hotness``, ``is_weekend``,
            ``trip_duration_variability``, ``sin_hour``, ``cos_hour``) and the
            target ``fare_per_minute``.
        test_df: Evaluation rows with the same columns.
        model_type: ``'xgb'`` trains an ``XGBRegressor``; any other value
            trains an ``LGBMRegressor``.
        month_str: Optional label. When given, the training feature matrix is
            pickled to ``Models/<month_str>/X_train_<month_str>.pkl``.

    Returns:
        ``(model, importances)`` where ``importances`` maps each training
        feature name to the fitted model's ``feature_importances_`` value.

    Side effects:
        Writes the ordered training column list to
        ``ScoringModel/Models/expected_columns/expected_columns_<model_type>.pkl``
        (relative to the working directory) and prints R² and MAE on the test
        set.
        NOTE(review): this path is rooted at ``ScoringModel/`` while the
        X_train pickle is rooted at ``Models/`` — confirm that is intended.
    """
    categorical_cols = ['is_airport_trip', 'pickup_borough', 'dropoff_borough']
    numeric_cols = [
        'dropoff_zone_hotness', 'is_weekend',
        'trip_duration_variability', 'sin_hour', 'cos_hour'
    ]

    # One-hot encode. The training frame defines the feature space (with the
    # first level of each categorical dropped as the baseline).
    X_train_cat = pd.get_dummies(train_df[categorical_cols], drop_first=True)
    # Encode the test frame WITHOUT drop_first and then align to the training
    # columns. Dropping "the first level" independently on the test frame can
    # remove a different category than the one dropped in training; reindex
    # would then zero-fill it and those rows would look like the baseline.
    X_test_cat = (
        pd.get_dummies(test_df[categorical_cols])
        .reindex(columns=X_train_cat.columns, fill_value=0)
    )

    X_train = pd.concat([train_df[numeric_cols].reset_index(drop=True), X_train_cat.reset_index(drop=True)], axis=1)
    X_test = pd.concat([test_df[numeric_cols].reset_index(drop=True), X_test_cat.reset_index(drop=True)], axis=1)
    y_train = train_df['fare_per_minute']
    y_test = test_df['fare_per_minute']

    # Save expected columns
    os.makedirs("ScoringModel/Models/expected_columns", exist_ok=True)
    expected_cols_path = f"ScoringModel/Models/expected_columns/expected_columns_{model_type}.pkl"
    joblib.dump(X_train.columns.tolist(), expected_cols_path)

    # Save raw X_train (optional)
    if month_str:
        os.makedirs(f"Models/{month_str}", exist_ok=True)
        with open(f"Models/{month_str}/X_train_{month_str}.pkl", "wb") as f:
            pickle.dump(X_train, f)

    # Train model
    if model_type == 'xgb':
        model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    else:
        model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n {model_type.upper()} Model Results:")
    print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")

    importance = model.feature_importances_
    return model, dict(zip(X_train.columns, importance))

# -------------------------------------
# 3. SCORING FUNCTION
# -------------------------------------
def apply_scoring(df, model):
    """Score every row of ``df`` with ``model`` and normalize the result to [0, 1].

    Args:
        df: DataFrame containing the categorical columns (``is_airport_trip``,
            ``pickup_borough``, ``dropoff_borough``) and the numeric feature
            columns used in training. It is copied, not modified.
        model: Fitted estimator exposing ``feature_names_in_`` and
            ``predict``.

    Returns:
        ``(scored_df, scaler)``:
          * ``scored_df`` is a copy of ``df`` with ``predicted_score`` (raw
            model output) and ``final_score`` (prediction rescaled between the
            5th and 95th percentile of this frame's predictions, clipped to
            [0, 1]).
          * ``scaler`` is ``{"min": p5, "max": p95}`` as plain floats.
    """
    df = df.copy()

    # Define columns used in training
    categorical_cols = ['is_airport_trip', 'pickup_borough', 'dropoff_borough']
    numeric_cols = ['dropoff_zone_hotness', 'trip_duration_variability', 'sin_hour', 'cos_hour', 'is_weekend']
    feature_names = list(model.feature_names_in_)

    # One-hot encode every category level. The reindex below keeps exactly the
    # columns the model was trained on; dropping the first level here instead
    # could remove a level the model expects and silently zero it out.
    df_cat = pd.get_dummies(df[categorical_cols])

    # Build the design matrix without touching `df`, so the returned frame does
    # not pick up all-zero placeholder columns. Features the model expects but
    # this frame lacks are filled with 0 by the reindex.
    present_numeric = [col for col in numeric_cols if col in df.columns]
    X = pd.concat([df[present_numeric], df_cat], axis=1)
    X = X.reindex(columns=feature_names, fill_value=0)

    # Predict
    df['predicted_score'] = model.predict(X)

    # Normalize using 5th–95th percentile
    p_min = df['predicted_score'].quantile(0.05)
    p_max = df['predicted_score'].quantile(0.95)
    if p_max > p_min:
        df['final_score'] = ((df['predicted_score'] - p_min) / (p_max - p_min)).clip(0, 1)
    else:
        # Degenerate range (e.g. constant predictions): avoid 0/0 = NaN.
        # Values above the cutoff score 1, everything else scores 0.
        df['final_score'] = (df['predicted_score'] > p_max).astype(float)

    # Save scaler range
    scaler = {"min": float(p_min), "max": float(p_max)}

    print(f"\n Score Normalization Range (5th–95th percentile): min = {p_min:.2f}, max = {p_max:.2f}")
    print(f" Raw Predicted Score Stats:\n  Min: {df['predicted_score'].min():.4f}, Max: {df['predicted_score'].max():.4f}")
    print(f" Saved normalization range: min = {scaler['min']:.2f}, max = {scaler['max']:.2f}")

    return df, scaler




# -------------------------------------
# 4. RUN PIPELINE
# -------------------------------------
def run_model_pipeline():
    """Interactively train, save and score the monthly XGBoost / LightGBM models.

    Workflow:
      1. Ask for a feature CSV (via a Tk open dialog) and prepare it with
         ``prepare_model_data``.
      2. Pick the two calendar months with the most rows; the earlier one is
         the training set and the later one is the test set.
      3. Train an XGBoost and a LightGBM model with ``train_model`` and pickle
         each to a user-chosen path.
      4. Average the XGBoost importances with the sum-normalized LightGBM
         importances and save them as JSON scoring weights.
      5. Score the full prepared frame with the XGBoost model via
         ``apply_scoring`` and save the 5th/95th percentile range as JSON.

    Every save is skipped if its dialog is cancelled.

    Returns:
        None. Returns early, without training, if the open dialog is cancelled.

    Raises:
        ValueError: if the file does not contain at least two distinct
            calendar months of pickup dates.
    """
    # Keep a handle on the hidden root window so it can be destroyed at the end.
    root = Tk()
    root.withdraw()
    try:
        file_path = askopenfilename(title="Select Cleaned CSV with Features")
        if not file_path:  # the dialog returns '' when cancelled
            print("File selection cancelled.")
            return
        df = pd.read_csv(file_path)
        print("Loaded:", file_path)

        df = prepare_model_data(df)

        df['pickup_date'] = pd.to_datetime(df['pickup_date'], errors='coerce')
        df = df.dropna(subset=['pickup_date'])

        # Get train/test months.
        # Use the two most frequent year-months rather than the earliest dates:
        # a single stray out-of-range timestamp would otherwise be picked as a
        # "month". Sorting Periods keeps chronological order across a
        # December -> January boundary.
        year_month = df['pickup_date'].dt.to_period('M')
        period_counts = year_month.value_counts()
        if len(period_counts) < 2:
            raise ValueError(
                "Expected at least two calendar months of data (train + test); "
                f"found {len(period_counts)}."
            )
        period_a, period_b = sorted(period_counts.index[:2])
        month_a_num, month_b_num = period_a.month, period_b.month
        train_df = df[year_month == period_a]
        test_df = df[year_month == period_b]
        # The year is only a placeholder used to format the month name
        month_str = pd.to_datetime(f'2023-{month_b_num}-01').strftime('%B').lower()

        print(" Training months:", month_a_num, "→", month_b_num)
        print("Train size:", train_df.shape[0], "| Test size:", test_df.shape[0])

        # Train XGB
        xgb_model, xgb_feats = train_model(train_df, test_df, model_type='xgb', month_str=month_str)
        xgb_path = asksaveasfilename(initialfile=f"model_{month_str}_xgb.pkl", defaultextension=".pkl")
        if xgb_path:
            with open(xgb_path, "wb") as f:
                pickle.dump(xgb_model, f)
            print(" XGB model saved to:", xgb_path)

        # Train LGB
        lgb_model, lgb_feats = train_model(train_df, test_df, model_type='lgb', month_str=month_str)
        lgb_path = asksaveasfilename(initialfile=f"model_{month_str}_lgb.pkl", defaultextension=".pkl")
        if lgb_path:
            with open(lgb_path, "wb") as f:
                pickle.dump(lgb_model, f)
            print(" LGB model saved to:", lgb_path)

        # Feature importance: LightGBM importances are divided by their sum
        # before being averaged with the XGBoost importances.
        lgb_series = pd.Series(lgb_feats)
        xgb_series = pd.Series(xgb_feats)
        combined_df = pd.concat([xgb_series, lgb_series / lgb_series.sum()], axis=1, keys=['xgb', 'lgb_norm']).fillna(0)
        combined_df['avg_importance'] = combined_df.mean(axis=1)
        final_weights = combined_df['avg_importance'].sort_values(ascending=False)

        # Save weights
        weights_path = asksaveasfilename(initialfile=f"scoring_weights_{month_str}.json", defaultextension=".json")
        if weights_path:
            with open(weights_path, "w") as f:
                json.dump(final_weights.to_dict(), f)
            print(" Weights saved to:", weights_path)

        # Score + scale (only the normalization range is kept; the scored frame is not saved)
        _, scaler = apply_scoring(df, xgb_model)

        # Save scaler
        scaler_path = asksaveasfilename(initialfile=f"scaler_{month_str}.json", defaultextension=".json")

        if scaler_path:
            with open(scaler_path, "w") as f:
                json.dump(scaler, f)
            print(" Scaler saved to:", scaler_path)
    finally:
        root.destroy()

# Run it: start the training/scoring pipeline only when this code is executed
# as the main module (not when it is imported).
if __name__ == "__main__":
    run_model_pipeline()


Loaded: C:/diksha/Summer Sem/ScoringModel/Data/Hotness and Duration/Data_with_Features_Jun_Jul.csv
 Training months: 6 → 7
Train size: 3084442 | Test size: 2717185

 XGB Model Results:
R² Score: 0.3389
MAE: 0.1859
 XGB model saved to: C:/diksha/Summer Sem/ScoringModel/Models/July/model_july_xgb.pkl
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 573
[LightGBM] [Info] Number of data points in the train set: 3084442, number of used features: 16
[LightGBM] [Info] Start training from score 1.210974

 LGB Model Results:
R² Score: 0.3283
MAE: 0.1877
 LGB model saved to: C:/diksha/Summer Sem/ScoringModel/Models/July/model_july_lgb.pkl
 Weights saved to: C:/diksha/Summer Sem/ScoringModel/Models/July/scoring_weights_july.json

 Score Normalization Range (5th–95th percentile): min = 1.00, max