# With Weather Data - Using Base XGBoost Model

In [18]:
import pickle

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Where the cleaned splits (weather columns included) live
folder = "clean_data/"
file_path1 = "train.csv"
file_path2 = "test.csv"

# Read both splits with pandas, train first and then test
train_df, test_df = (pd.read_csv(folder + csv_name) for csv_name in (file_path1, file_path2))

# "On" is the regression target; "Off" is removed alongside it so it is never a feature
label_columns = ["On", "Off"]
X_train, y_train = train_df.drop(columns=label_columns), train_df["On"]
X_test, y_test = test_df.drop(columns=label_columns), test_df["On"]

# Baseline regressor: only the seed and the logging verbosity are set explicitly
xgb_reg = xgb.XGBRegressor(verbosity=2, random_state=42)

# Fit on the training split (kept as the last expression so the cell shows the estimator repr)
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

In [19]:

# Score the fitted model on both splits. Predictions are floored to whole numbers
# before scoring, so every metric below is computed on integer predictions.
# NOTE(review): np.floor always rounds down; confirm this is preferred over np.rint.
for split_name, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
    y_pred = np.floor(xgb_reg.predict(features)).astype(int)
    r2 = r2_score(target, y_pred)
    rmse = np.sqrt(mean_squared_error(target, y_pred))
    mae = mean_absolute_error(target, y_pred)
    print(f"{split_name} rmse: {rmse}, mae: {mae}, r2: {r2}")

# Persist the fitted model. The context manager closes the file handle even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open("models/base_xgboost_with_weather.pkl", "wb") as model_file:
    pickle.dump(xgb_reg, model_file)

train rmse: 26.297237310405524, mae: 8.858496868925462, r2: 0.7192590653982296
test rmse: 25.880109060143795, mae: 9.397562238620713, r2: 0.7128963222062918


# With Weather Data - Using Fine Tuning on XGBoost Model

In [10]:
import numpy as np
import polars as pl
import pickle
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, \
    mean_squared_log_error, median_absolute_error, r2_score, mean_poisson_deviance, mean_gamma_deviance, \
    mean_absolute_percentage_error, mean_squared_error

# Cleaned splits that include the weather columns
folder = "clean_data/"
file_path1 = "train.csv"
file_path2 = "test.csv"

# Read both CSVs into pandas DataFrames.
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel from an earlier `import pandas as pd`.
train_df = pd.read_csv(f"{folder}{file_path1}")
test_df = pd.read_csv(f"{folder}{file_path2}")

# Features are every column except "On" and "Off"; "On" is the target
dropped = ["On", "Off"]
X_train = train_df.drop(columns=dropped)
X_test = test_df.drop(columns=dropped)
y_train = train_df["On"]
y_test = test_df["On"]


### Using Grid Search to find the Best Hyperparameters

In [12]:
# Candidate values for each tuned hyperparameter (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

# Base estimator handed to the search
xgb_reg = xgb.XGBRegressor(verbosity=1, random_state=42)

# Exhaustive search over the grid: 3-fold CV scored by negative MSE, n_jobs=-1
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
print(f"Best Hyperparameters: {grid_search.best_params_}")

### Using the Best Model after tuning the hyperparameters for Prediction

In [None]:
# Estimator selected by the grid search
best_model = grid_search.best_estimator_

# Score the tuned model on both splits (train first, then test). Predictions are
# floored to whole numbers before scoring, so the metrics are on integer predictions.
# NOTE(review): np.floor always rounds down; confirm this is preferred over np.rint.
for split_name, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
    y_pred = np.floor(best_model.predict(features)).astype(int)
    r2 = r2_score(target, y_pred)
    rmse = np.sqrt(mean_squared_error(target, y_pred))
    mae = mean_absolute_error(target, y_pred)
    print(f"{split_name} rmse: {rmse}, mae: {mae}, r2: {r2}")

# Persist the tuned model. The context manager closes the file handle even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open("models/tuned_xgboost_with_weather.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

[CV] END ......learning_rate=0.001, max_depth=3, subsample=1; total time= 1.3min
[CV] END ......learning_rate=0.001, max_depth=5, subsample=1; total time= 2.0min
[CV] END .....learning_rate=0.01, max_depth=7, subsample=0.7; total time= 2.3min
[CV] END ....learning_rate=0.001, max_depth=5, subsample=0.7; total time= 2.1min
[CV] END .....learning_rate=0.01, max_depth=7, subsample=0.7; total time= 2.3min
[CV] END ....learning_rate=0.001, max_depth=5, subsample=0.7; total time= 2.1min
[CV] END ....learning_rate=0.001, max_depth=3, subsample=0.5; total time= 1.4min
[CV] END ......learning_rate=0.001, max_depth=5, subsample=1; total time= 2.0min
[CV] END .......learning_rate=0.01, max_depth=7, subsample=1; total time= 2.2min
[CV] END ....learning_rate=0.001, max_depth=5, subsample=0.5; total time= 2.1min
[CV] END ....learning_rate=0.001, max_depth=3, subsample=0.7; total time= 1.4min
[CV] END ......learning_rate=0.001, max_depth=5, subsample=1; total time= 2.0min
[CV] END .....learning_rate=

# Without Weather Data - Using Base XGBoost Model

In [15]:

# Cleaned splits without the weather columns
folder = "clean_data/"
file_path1 = "train_wo_weather.csv"
file_path2 = "test_wo_weather.csv"

# Load the CSV files into pandas DataFrames
train_df = pd.read_csv(folder + file_path1)
test_df = pd.read_csv(folder + file_path2)

# Features are every column except "On" and "Off"; "On" is the target
X_train = train_df.drop(columns=["On", "Off"])
y_train = train_df["On"]
X_test = test_df.drop(columns=["On", "Off"])
y_test = test_df["On"]

# Baseline regressor: only the seed and the logging verbosity are set explicitly
xgb_reg = xgb.XGBRegressor(random_state=42, verbosity=2)

# Train the model
xgb_reg.fit(X_train, y_train)

# Score the fitted model on both splits. Predictions are floored to whole numbers
# before scoring, so every metric below is computed on integer predictions.
# NOTE(review): np.floor always rounds down; confirm this is preferred over np.rint.
for split_name, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
    y_pred = np.floor(xgb_reg.predict(features)).astype(int)
    r2 = r2_score(target, y_pred)
    rmse = np.sqrt(mean_squared_error(target, y_pred))
    mae = mean_absolute_error(target, y_pred)
    print(f"{split_name} rmse: {rmse}, mae: {mae}, r2: {r2}")

# Persist the fitted model. The context manager closes the file handle even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open("models/base_xgboost_wo_weather.pkl", "wb") as model_file:
    pickle.dump(xgb_reg, model_file)

train rmse: 26.523086229898002, mae: 8.759609837032043, r2: 0.7144161758464848
test rmse: 25.600350158112466, mae: 9.023667829585897, r2: 0.7190698424793321


# Without Weather Data - Using Fine Tuned XGBoost Model

In [16]:
# Cleaned splits without the weather columns
folder = "clean_data/"
file_path1 = "train_wo_weather.csv"
file_path2 = "test_wo_weather.csv"

# Load the CSV files into pandas DataFrames
train_df = pd.read_csv(folder + file_path1)
test_df = pd.read_csv(folder + file_path2)

# Features are every column except "On" and "Off"; "On" is the target
X_train = train_df.drop(columns=["On", "Off"])
y_train = train_df["On"]
X_test = test_df.drop(columns=["On", "Off"])
y_test = test_df["On"]

# Candidate values for each tuned hyperparameter (3 x 3 x 3 = 27 combinations)
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

# Base estimator handed to the search
xgb_reg = xgb.XGBRegressor(random_state=42, verbosity=1)

# Exhaustive search over the grid: 3-fold CV scored by negative MSE, n_jobs=-1
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid,
                           cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Estimator selected by the grid search
best_model = grid_search.best_estimator_

# Score the tuned model on both splits (train first, then test). Predictions are
# floored to whole numbers before scoring, so the metrics are on integer predictions.
# NOTE(review): np.floor always rounds down; confirm this is preferred over np.rint.
for split_name, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
    y_pred = np.floor(best_model.predict(features)).astype(int)
    r2 = r2_score(target, y_pred)
    rmse = np.sqrt(mean_squared_error(target, y_pred))
    mae = mean_absolute_error(target, y_pred)
    print(f"{split_name} rmse: {rmse}, mae: {mae}, r2: {r2}")

# Persist the tuned model. The context manager closes the file handle even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open("models/tuned_xgboost_wo_weather.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ........learning_rate=0.1, max_depth=3, subsample=1; total time= 1.6min
[CV] END ........learning_rate=0.1, max_depth=5, subsample=1; total time= 2.2min
[CV] END .......learning_rate=0.01, max_depth=3, subsample=1; total time= 1.4min




[CV] END ........learning_rate=0.1, max_depth=3, subsample=1; total time= 1.6min
[CV] END ........learning_rate=0.1, max_depth=5, subsample=1; total time= 2.1min
[CV] END .....learning_rate=0.01, max_depth=3, subsample=0.7; total time= 1.5min
[CV] END ......learning_rate=0.1, max_depth=5, subsample=0.5; total time= 2.2min
[CV] END .....learning_rate=0.01, max_depth=3, subsample=0.5; total time= 1.7min
[CV] END .......learning_rate=0.01, max_depth=3, subsample=1; total time= 1.3min
[CV] END ......learning_rate=0.1, max_depth=5, subsample=0.5; total time= 2.2min
[CV] END .....learning_rate=0.01, max_depth=3, subsample=0.7; total time= 1.7min
[CV] END .......learning_rate=0.01, max_depth=3, subsample=1; total time= 1.4min
[CV] END ......learning_rate=0.1, max_depth=5, subsample=0.5; total time= 2.2min
[CV] END .....learning_rate=0.01, max_depth=3, subsample=0.5; total time= 1.8min
[CV] END .....learning_rate=0.01, max_depth=5, subsample=0.5; total time= 1.9min
[CV] END ......learning_rate