In [24]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
# Load the ATM feature table; the path is relative to the current working directory.
df = pd.read_csv('data/atmFeatures.csv')

In [25]:
# The target is the next-day cash demand. The feature matrix is every other
# column except the ATM identifier and the date.
Y = df['Cash_Demand_Next_Day']
X = df.drop(['Cash_Demand_Next_Day', 'ATM_ID', 'Date'], axis=1)

# Columns handled by the categorical (encoding) branch of the preprocessor
categorical_cols = [
    'Day_of_Week',
    'Time_of_Day',
    'Location_Type',
    'Weather_Condition',
]

# Columns handled by the numeric (scaling) branch of the preprocessor
numeric_cols = [
    'Total_Withdrawals',
    'Total_Deposits',
    'Holiday_Flag',
    'Special_Event_Flag',
    'Previous_Day_Cash_Level',
    'Nearby_Competitor_ATMs',
    'Cash_Demand_Lag_2',
    'Cash_Demand_Lag_7',
    'Cash_Demand_MA_3',
    'Cash_Demand_MA_7',
    'Withdrawals_MA_7',
    'Net_Cash_Flow',
    'Withdrawal_to_Deposit_Ratio',
    'Cash_Level_Change',
    'Cash_Utilization_Rate',
]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the lag / moving-average columns suggest time-ordered data, in
# which case a shuffled split may leak future information into training — confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Numeric branch: replace NaN values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories not seen during fit are ignored
# at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its branch (categorical output first, numeric second).
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols),
])

# Create model pipelines.
# Each pipeline gets its own clone of the preprocessor. Pipeline does not copy
# its steps, so handing the same instance to all three would make them share a
# single ColumnTransformer that is refit in place on every fit() call; refitting
# one pipeline later would then silently change the others.
# Consequence: the standalone `preprocessor` object stays unfitted. Use
# models[<name>].named_steps['preprocessor'] to get a fitted one.
models = {
    'RandomForest': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]),

    'XGBoost': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', XGBRegressor(random_state=42, eval_metric='rmse')),
    ]),

    'LinearRegression': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', LinearRegression()),
    ]),
}

# Fit every candidate on the training split, score it on the held-out split,
# and write the fitted pipeline to disk.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
    Y_pred = model.fit(X_train, Y_train).predict(X_test)

    mae, r2 = mean_absolute_error(Y_test, Y_pred), r2_score(Y_test, Y_pred)
    results[name] = dict(MAE=mae, R2=r2)
    print(f"{name} - MAE: {mae:.3f}, R^2: {r2:.3f}")

    # One artifact per model, named after the lower-cased model key.
    joblib.dump(model, f'atm_cash_demand_model_{name.lower()}.joblib')

# `best_model` holds the dictionary key (a string), not the estimator: the
# candidate with the highest held-out R², first one winning on ties.
best_model = max(results, key=lambda candidate: results[candidate]['R2'])
print(f"\nBest model: {best_model} (R² = {results[best_model]['R2']:.3f})")

Training RandomForest...
RandomForest - MAE: 4853.136, R^2: 0.866
Training XGBoost...
XGBoost - MAE: 4899.299, R^2: 0.861
Training LinearRegression...
LinearRegression - MAE: 4596.261, R^2: 0.884

Best model: LinearRegression (R² = 0.884)
