# In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Define custom scorer for MAPE
def mape_scorer(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Samples whose true value is exactly zero are left out of the average,
    because the relative error is undefined for them.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Keep only the positions where a relative error can be computed.
    keep = actual != 0
    actual_kept = actual[keep]

    relative_errors = np.abs((actual_kept - predicted[keep]) / actual_kept)
    return np.mean(relative_errors) * 100

# Wrap mape_scorer as a scikit-learn scorer object so it can be used in the
# `scoring` dictionary passed to the search. MAPE is an error (lower is better),
# hence greater_is_better=False: scikit-learn then sign-flips the value so that
# model selection can still maximise the score.
mape = make_scorer(mape_scorer, greater_is_better=False)

class TqdmRandomizedSearchCV(RandomizedSearchCV):
    """RandomizedSearchCV that shows search progress on a tqdm bar.

    The constructor mirrors the parameters forwarded to ``RandomizedSearchCV``.
    During ``fit`` a progress bar is opened and advanced by one step each time
    a sampled parameter candidate has been cross-validated.

    Trade-off: to be able to tick the bar per candidate, candidates are handed
    to scikit-learn one at a time. With ``n_jobs`` set, work is therefore only
    parallelised across the CV splits of a single candidate, and with
    ``verbose > 0`` scikit-learn reports every single-candidate batch
    separately.
    """

    def __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None,
                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 random_state=None, error_score=np.nan, return_train_score=False):
        super().__init__(
            estimator=estimator, param_distributions=param_distributions, n_iter=n_iter,
            scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, random_state=random_state, error_score=error_score,
            return_train_score=return_train_score)

    def fit(self, X, y=None, **fit_params):
        """Run the randomized search while displaying a progress bar.

        Arguments and return value are those of ``RandomizedSearchCV.fit``;
        the fitted search object (``self``) is returned.
        """
        with tqdm(total=self.n_iter, desc="RandomizedSearchCV Progress") as progress:
            # _run_search (called from within super().fit) picks the bar up from here.
            self._tqdm = progress
            try:
                return super().fit(X, y, **fit_params)
            finally:
                # The bar is closed when the `with` block exits; do not leave a
                # stale handle on the fitted object (e.g. when it is persisted later).
                del self._tqdm

    def _run_search(self, evaluate_candidates):
        """Evaluate the sampled candidates one by one, ticking the bar after each."""
        progress = getattr(self, "_tqdm", None)

        def evaluate_with_progress(candidate_params):
            candidate_params = list(candidate_params)
            if progress is None:
                # Not running inside fit(): behave exactly like the parent class.
                return evaluate_candidates(candidate_params)

            # The sampler may yield fewer candidates than n_iter (small finite
            # grids), so align the bar's total with what will really be run.
            if progress.total != len(candidate_params):
                progress.total = len(candidate_params)
                progress.refresh()

            results = None
            for params in candidate_params:
                # evaluate_candidates accumulates results across calls and
                # returns the results collected so far.
                results = evaluate_candidates([params])
                progress.update(1)
            return results

        super()._run_search(evaluate_with_progress)

# Load the dataset
df_raw = pd.read_csv('Train.csv')

# Convert the 'date_time' column to datetime, sort chronologically and use it as
# the index so that the lag/rolling features below follow time order.
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)
df_raw.set_index('date_time', inplace=True)

# Convert 'is_holiday' from categorical to binary (1 for any holiday, 0 for non-holiday).
# pandas' default NA markers for read_csv include the text "None", in which case
# non-holiday rows arrive as NaN rather than the string 'None'; a plain
# `x == 'None'` test would then flag every row as a holiday. Treat both
# representations as "no holiday".
holiday_labels = df_raw['is_holiday']
df_raw['is_holiday'] = (holiday_labels.notna() & (holiday_labels != 'None')).astype(int)

# Extract hour from the 'date_time' index
df_raw['hour'] = df_raw.index.hour

# One-hot encode categorical features
df = pd.get_dummies(df_raw, columns=['weather_type', 'weather_description'], drop_first=True)

# Feature engineering: create lagged and rolling features of the target.
# Everything is shifted by at least one row, so a row's features only use
# target values from earlier rows.
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

# Remove rows with NaN values (the first rows have no complete lag/rolling history)
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Split the FEATURES into the columns to scale and the binary 'is_holiday'
# column(s) that are passed through unscaled. Both parts must be taken from X,
# not df: df still contains the target column, which would otherwise leak
# into the model inputs.
holiday_columns = list(X.filter(regex='is_holiday'))
X_numerical = X.drop(holiday_columns, axis=1)
X_categorical = X[holiday_columns]

# Chronological 90/10 train/test split position (rows are already time-ordered)
total_samples = X.shape[0]
split_index = int(total_samples * 0.9)

# Scale the numerical features. The scaler statistics are learned from the
# training rows only, so nothing about the held-out rows influences training.
scaler = StandardScaler()
scaler.fit(X_numerical.iloc[:split_index])
X_numerical_scaled = scaler.transform(X_numerical)

# Combine scaled numerical and categorical features
X_scaled = np.concatenate((X_numerical_scaled, X_categorical), axis=1)

# Split the data into train and test sets
X_train = X_scaled[:split_index]
y_train = y.iloc[:split_index]
X_test = X_scaled[split_index:]
y_test = y.iloc[split_index:]

# Time-ordered cross-validation: 5 splits produced by TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Metrics recorded for every candidate; the 'RMSE' entry is the one used to
# pick and refit the best model (see refit= below)
scoring = {
    'RMSE': 'neg_root_mean_squared_error',
    'MSE': 'neg_mean_squared_error',
    'MAE': 'neg_mean_absolute_error',
    'MAPE': mape
}

# Candidate values the randomized search samples from
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 1, 10]
}

# Base regressor whose hyper-parameters are tuned
xgboost_model = xgb.XGBRegressor(objective='reg:squarederror')

# Randomized search with progress reporting: 100 sampled candidates, fixed seed
random_search = TqdmRandomizedSearchCV(
    xgboost_model,
    param_distributions,
    n_iter=100,
    scoring=scoring,
    refit='RMSE',
    cv=tscv,
    verbose=1,
    random_state=42
)

# Fit the random search to the scaled training data
random_search.fit(X_train, y_train)

# Get the best parameters
best_parameters = random_search.best_params_
print(f"Best parameters found: {best_parameters}")

# Best estimator as refit by the search (refit='RMSE'). It is left untouched
# below: the per-fold evaluation fits clones, so best_model and
# random_search.best_estimator_ are not overwritten by a partial-fold fit.
best_model = random_search.best_estimator_

# Evaluate the best configuration using time series cross-validation on the training data
train_rmse_list = []
train_mae_list = []
train_r2_list = []
train_mape_list = []

for train_index, test_index in tscv.split(X_train):
    # X_train is a NumPy array (positional indexing); y_train is a Series
    # indexed by timestamp, so positions must go through .iloc.
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fresh, unfitted copy with the best hyper-parameters for this fold
    fold_model = clone(best_model)
    fold_model.fit(X_train_fold, y_train_fold)
    y_pred_train_fold = fold_model.predict(X_train_fold)

    rmse_train_fold = np.sqrt(mean_squared_error(y_train_fold, y_pred_train_fold))
    mae_train_fold = mean_absolute_error(y_train_fold, y_pred_train_fold)
    r2_train_fold = r2_score(y_train_fold, y_pred_train_fold)
    mape_train_fold = mape_scorer(y_train_fold, y_pred_train_fold)

    train_rmse_list.append(rmse_train_fold)
    train_mae_list.append(mae_train_fold)
    train_r2_list.append(r2_train_fold)
    train_mape_list.append(mape_train_fold)

# Print the average metrics over all folds for training data
print("Average Training Metrics:")
print(f"RMSE: {np.mean(train_rmse_list)}")
print(f"MAE: {np.mean(train_mae_list)}")
print(f"R-squared: {np.mean(train_r2_list)}")
print(f"MAPE: {np.mean(train_mape_list)}")

# Evaluate on the validation data of the LAST fold, using the model fitted on
# that fold's training part (fold_model, X_test_fold and y_test_fold still hold
# the values from the final loop iteration).
y_pred_val = fold_model.predict(X_test_fold)

# Calculate the metrics for the validation set
rmse_val = np.sqrt(mean_squared_error(y_test_fold, y_pred_val))
mae_val = mean_absolute_error(y_test_fold, y_pred_val)
r2_val = r2_score(y_test_fold, y_pred_val)
mape_val = mape_scorer(y_test_fold, y_pred_val)

# Print the metrics for the validation set
print("\nMetrics for Validation Set:")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"R-squared: {r2_val}")
print(f"MAPE: {mape_val}")