In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib
import time
import sys
import os

# Set random seed for reproducibility
np.random.seed(42)

# Print xgboost version for debugging
print(f"XGBoost version: {xgb.__version__}")

# Step 1: Load the full dataset
try:
    df = pd.read_csv('dart_mwendokasi_data.csv')
    print(f"Using full dataset with {df.shape[0]} records.")
    print(f"Dataset size: {df.shape}")
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure 'dart_mwendokasi_data.csv' is in the working directory.")
    sys.exit(1)
except Exception as e:
    print(f"Error loading dataset: {e}")
    sys.exit(1)

# Step 2: Preprocess the data
try:
    df = df.dropna()
    print("Data cleaned by removing NaN values.")
    
    # Convert original date to ordinal numbers first
    try:
        df['date_ordinal'] = pd.to_datetime(df['date'], errors='coerce').map(lambda x: x.toordinal() if pd.notna(x) else 0)
        print("Original date converted to ordinal numbers.")
    except Exception as e:
        print(f"Error converting original date to ordinal: {e}")
        sys.exit(1)

    # Convert date to day name for encoding
    if df['date'].dtype == 'object' or df['date'].dtype == '<M8[ns]':
        df['day'] = pd.to_datetime(df['date'], errors='coerce').dt.day_name()
        print("Date converted to day names for 'day' column.")
    else:
        print("Date column not a date type, assuming 'day' column exists with day names.")
    
    # Drop the original date column after extracting ordinal and day name
    df = df.drop(columns=['date'])
    
    # Encode categorical variables with their unique values
    encoders = {}
    for column in ['day', 'weather', 'peak_hours', 'weekends', 'holidays']:
        encoders[column] = LabelEncoder()
        df[column] = encoders[column].fit_transform(df[column])
        print(f"{column} encoded. Classes: {encoders[column].classes_}")

    # Preprocess time_value to handle inconsistent formats
    def convert_to_minutes(time_str):
        try:
            if pd.isna(time_str):
                return 0  # Default to midnight for missing values
            time_str = str(time_str).strip()
            if not time_str or time_str == ":00":
                return 0  # Handle ":00" or empty
            time_parts = time_str.split(':')
            if len(time_parts) >= 2:
                hour = int(time_parts[0]) if time_parts[0].isdigit() else 0
                minute = int(time_parts[1]) if time_parts[1].isdigit() else 0
                return max(0, min(1440, hour * 60 + minute))  # Cap at 24 hours (1440 minutes)
            return 0
        except Exception as e:
            print(f"Error converting time_value '{time_str}': {e}")
            return 0

    df['time_value'] = df['time_value'].apply(convert_to_minutes)
    print("Time_value converted to minutes since midnight.")
    print(f"Sample time_value values: {df['time_value'].head().tolist()}")

except Exception as e:
    print(f"Error in preprocessing: {e}")
    sys.exit(1)

# Step 3: Split features and target
try:
    X = df.drop(columns=['passengers'])
    training_feature_order = X.columns.tolist()  # Store the training feature order
    print("Training feature order:", training_feature_order)
    y = df['passengers']
    print("Features and target split successfully.")
except Exception as e:
    print(f"Error splitting features and target: {e}")
    sys.exit(1)

# Step 4: Train-test split
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f'Training set size: {X_train.shape}, Testing set size: {X_test.shape}')
except Exception as e:
    print(f"Error in train-test split: {e}")
    sys.exit(1)

# Step 5: Train XGBoost model with version-specific early stopping
try:
    start_time = time.time()
    model = xgb.XGBRegressor(
        n_estimators=400,
        max_depth=10,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        n_jobs=-1,
        tree_method='hist',
        random_state=42,
        enable_categorical=False  # Explicitly disable for now, ensure all are numerical
    )
    print("Model initialized with hyperparameters.")

    major_version = int(xgb.__version__.split('.')[0])
    if major_version >= 1:
        try:
            # Ensure all columns are numerical after encoding
            X_train = X_train.astype({col: 'int' for col in X_train.select_dtypes(include=['int', 'float']).columns})
            X_test = X_test.astype({col: 'int' for col in X_test.select_dtypes(include=['int', 'float']).columns})
            model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=10,
                verbose=False
            )
            print("Model trained with early stopping.")
        except TypeError as e:
            print(f"Early stopping not supported, falling back to core API: {e}")
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dtest = xgb.DMatrix(X_test, label=y_test)
            params = {
                'max_depth': 10,
                'learning_rate': 0.05,
                'objective': 'reg:squarederror',
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0.1,
                'reg_lambda': 1.0,
                'seed': 42
            }
            evals = [(dtrain, 'train'), (dtest, 'test')]
            model_core = xgb.train(
                params, dtrain, num_boost_round=400, evals=evals,
                early_stopping_rounds=10, verbose_eval=False
            )
            model._Booster = model_core  # Update the booster for prediction
            print("Model trained with core API fallback.")
    else:
        print("Using core API for older xgboost version.")
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        params = {
            'max_depth': 10,
            'learning_rate': 0.05,
            'objective': 'reg:squarederror',
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'seed': 42
        }
        evals = [(dtrain, 'train'), (dtest, 'test')]
        model_core = xgb.train(
            params, dtrain, num_boost_round=400, evals=evals,
            early_stopping_rounds=10, verbose_eval=False
        )
        model._Booster = model_core  # Update the booster for prediction
        print("Model trained with core API.")

    print(f'Training time: {time.time() - start_time:.2f} seconds')
except Exception as e:
    print(f"Error training model: {e}")
    sys.exit(1)

# Step 6: Evaluate the model
try:
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'R² Score: {r2:.4f}')

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f'5-Fold CV R² Scores: {cv_scores}')
    print(f'Average CV R² Score: {cv_scores.mean():.4f}')
except Exception as e:
    print(f"Error evaluating model: {e}")
    sys.exit(1)

# Step 7: Feature importance
try:
    feature_importance = model.feature_importances_
    for feature, importance in zip(X.columns, feature_importance):
        print(f'Feature: {feature}, Importance: {importance:.4f}')
except Exception as e:
    print(f"Error calculating feature importance: {e}")
    sys.exit(1)

# Step 8: Save the model and encoders
try:
    joblib.dump(model, 'xgboost_model.pkl')  # Keep joblib for initial save
    model._Booster.save_model('xgboost_model.ubj')  # Export as UBJSON for compatibility
    for column, enc in encoders.items():
        joblib.dump(enc, f'{column}_encoder.pkl')
    print("Model and encoders saved as 'xgboost_model.pkl', 'xgboost_model.ubj', and individual encoder files")
except Exception as e:
    print(f"Error saving model or encoders: {e}")
    sys.exit(1)

# Optional: Classification for F-score
try:
    y_class = pd.cut(y, bins=[0, 50, 100, float('inf')], labels=['Low', 'Medium', 'High'])
    label_encoder_class = LabelEncoder()
    y_class_encoded = label_encoder_class.fit_transform(y_class)
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class_encoded, test_size=0.2, random_state=42)
    model_class = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        n_jobs=-1,
        tree_method='hist',
        random_state=42
    )
    model_class.fit(X_train_c, y_train_c)
    y_pred_c = model_class.predict(X_test_c)
    from sklearn.metrics import classification_report
    print('\nClassification Report (F-score):')
    print(classification_report(y_test_c, y_pred_c, target_names=['Low', 'Medium', 'High']))
except Exception as e:
    print(f"Error in classification: {e}")
    sys.exit(1)

# Function to predict passenger flow for LLM integration
def predict_passenger_flow(date_ordinal, time_minutes, day, weather, peak_hours, weekends, holidays):
    try:
        # Match the feature order from training DataFrame
        input_data = pd.DataFrame({
            'date_ordinal': [date_ordinal],
            'time_value': [time_minutes],
            'day': [encoders['day'].transform([day.title()])[0]],  # Convert to title case
            'weather': [encoders['weather'].transform([weather.title()])[0]],
            'peak_hours': [encoders['peak_hours'].transform([peak_hours.title()])[0]],
            'weekends': [encoders['weekends'].transform([weekends.title()])[0]],
            'holidays': [encoders['holidays'].transform([holidays.title()])[0]]
        }, columns=training_feature_order)  # Use the stored training feature order
        print("Prediction input order:", input_data.columns.tolist())  # Debug: Print input order
        prediction = model.predict(input_data)
        return prediction[0]
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None

# Test function to evaluate model efficiency
def test_model_efficiency():
    test_cases = [
        (pd.to_datetime('2025-07-23').toordinal(), 8 * 60, 'Wednesday', 'Sunny', 'No', 'No', 'No'),  # 8 AM, weekday
        (pd.to_datetime('2025-07-23').toordinal(), 17 * 60, 'Friday', 'Rainy', 'Yes', 'No', 'No'),   # 5 PM, peak hour
        (pd.to_datetime('2025-07-26').toordinal(), 12 * 60, 'Sunday', 'Sunny', 'No', 'Yes', 'No'),  # Noon, weekend
    ]
    start_time = time.time()
    for i, (date_ordinal, time_minutes, day, weather, peak_hours, weekends, holidays) in enumerate(test_cases):
        pred = predict_passenger_flow(date_ordinal, time_minutes, day, weather, peak_hours, weekends, holidays)
        print(f"Test case {i+1}: {day} {time_minutes//60}:00, {weather}, Peak: {peak_hours}, Weekend: {weekends}, Holiday: {holidays} -> Predicted passengers: {pred}")
    end_time = time.time()
    print(f"Total time for {len(test_cases)} predictions: {end_time - start_time:.2f} seconds")
    print(f"Average time per prediction: {(end_time - start_time)/len(test_cases):.4f} seconds")

# Example usage and efficiency test
if __name__ == "__main__":
    try:
        today_ordinal = pd.to_datetime('2025-07-23').toordinal()
        print(f"Predicted flow for today at 6 PM: {predict_passenger_flow(today_ordinal, 18 * 60, 'Wednesday', 'Sunny', 'No', 'No', 'No')}")
        print("\nRunning efficiency test...")
        test_model_efficiency()
    except Exception as e:
        print(f"Error in example prediction or test: {e}")

XGBoost version: 3.0.1
Using full dataset with 500000 records.
Dataset size: (500000, 8)
Data cleaned by removing NaN values.
Original date converted to ordinal numbers.
Date converted to day names for 'day' column.
day encoded. Classes: ['Friday' 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday' 'Wednesday']
weather encoded. Classes: ['Rainy' 'Sunny']
peak_hours encoded. Classes: ['No' 'Yes']
weekends encoded. Classes: ['No' 'Yes']
holidays encoded. Classes: ['No' 'Yes']
Time_value converted to minutes since midnight.
Sample time_value values: [240, 240, 240, 240, 240]
Training feature order: ['day', 'weather', 'time_value', 'peak_hours', 'weekends', 'holidays', 'date_ordinal']
Features and target split successfully.
Training set size: (400000, 7), Testing set size: (100000, 7)
Model initialized with hyperparameters.
Early stopping not supported, falling back to core API: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'
Model trained with core API fallback.
Tr