In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import xgboost

# Load the DataFrame used by every cell below from a pickle file located
# relative to the current working directory.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects and can run
# code embedded in the file -- only load pickles that come from a trusted source.
df = pd.read_pickle(r'pickles/df4.pkl')


In [2]:
#model selection using regression models

# Feature matrix = every column except the target; target = 'New Deaths'.
X = df.drop('New Deaths', axis=1)
y = df['New Deaths']

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# First cut: 70 % of the rows go to TRAIN, the remaining 30 % are held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
)
# Second cut: the held-out 30 % is halved into DEV and TEST (15 % each).
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    random_state=42,
)

# --- metrics function ---
def regressionMetrics(y, yhat):
    """Compute a small set of regression error metrics for one set of predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    yhat : array-like
        Predicted values, passed to the sklearn metric functions alongside ``y``.

    Returns
    -------
    dict
        Mapping with the keys ``'MSE'``, ``'RMSE'``, ``'MAE'`` and ``'R2'``
        (in that order), each holding the corresponding score.
    """
    # Compute the MSE once and derive the RMSE from it, instead of calling
    # mean_squared_error twice on the same inputs.
    mse = metrics.mean_squared_error(y, yhat)
    res = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': metrics.mean_absolute_error(y, yhat),
        # RMSLE (np.sqrt(metrics.mean_squared_log_error(y, yhat))) was removed
        # by the original author "due to -1 error".
        # NOTE(review): presumably because the log-based metric cannot handle
        # negative values -- confirm before re-enabling.
        'R2': metrics.r2_score(y, yhat)
    }
    return res

# --- models dictionary ---
# Candidate regressors, keyed by the label shown in the results table.
# Every estimator that uses randomness gets a fixed seed so that a fresh
# "Restart Kernel -> Run All" reproduces the same scores; 42 matches the seed
# already used for the train/dev/test split.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    # NOTE(review): SVR is fitted on the features as-is. Its recorded DEV R2 of
    # about 0 suggests it may need feature/target scaling -- TODO confirm.
    "SVR": SVR(),
    "XGBRegressor": xgboost.XGBRegressor(objective="reg:squarederror", random_state=42)
}

# --- train, predict, and evaluate on DEV set ---
# One record (dict) per model; turned into a DataFrame in a single step below.
results = []
for name, model in models.items():
    # fit() returns the fitted estimator itself, so training and DEV-set
    # prediction can be chained.
    y_dev_pred = model.fit(X_train, y_train).predict(X_dev)
    m = regressionMetrics(y_dev, y_dev_pred)
    # "Model" goes first so it becomes the left-most column of the table.
    results.append(dict(Model=name, **m))

# --- comparison table: one row per model, one column per metric ---
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Model,MSE,RMSE,MAE,R2
0,LinearRegression,4963.428084,70.451601,34.314958,0.468188
1,DecisionTreeRegressor,1626.19107,40.326059,8.852922,0.82576
2,RandomForestRegressor,935.848452,30.59164,7.542226,0.899728
3,AdaBoostRegressor,4289.087993,65.491129,58.54692,0.540441
4,GradientBoostingRegressor,1048.414422,32.379228,8.963344,0.887667
5,SVR,9334.276407,96.614059,21.176847,-0.000131
6,XGBRegressor,949.064758,30.806895,7.448216,0.898311


In [9]:
# Save the model-comparison table to an Excel workbook in the current working
# directory. With pandas' default (index=True) the DataFrame index is written
# as the first column of the sheet.
df_results.to_excel('output1.xlsx')

In [3]:
#find best parameters for xgboost

import numpy as np
import xgboost

# --- parameter grid for XGBoost ---
xgb_param_grid = dict(
    # how many boosting rounds (trees) to grow
    n_estimators=[100, 200, 300, 500, 800, 1000],

    # shrinkage applied to each tree's contribution
    learning_rate=[0.01, 0.05, 0.1, 0.2],

    # depth limit for an individual tree
    max_depth=[3, 5, 7, 9, 11],

    # minimum sum of instance weight (hessian) required in a child node
    min_child_weight=[1, 3, 5, 7],

    # fraction of training rows sampled per tree (stochastic sampling)
    subsample=[0.6, 0.8, 1.0],

    # fraction of columns sampled when building each tree
    colsample_bytree=[0.6, 0.8, 1.0],

    # L2 regularization strength
    reg_lambda=[0.1, 1.0, 5.0, 10.0],

    # L1 regularization strength
    reg_alpha=[0, 0.1, 0.5, 1.0],

    # booster type; only the tree booster ("gbtree") is searched
    booster=['gbtree'],

    # single fixed seed so every sampled candidate is reproducible
    random_state=[42],
)

# Echo the grid as the cell output.
xgb_param_grid


{'n_estimators': [100, 200, 300, 500, 800, 1000],
 'learning_rate': [0.01, 0.05, 0.1, 0.2],
 'max_depth': [3, 5, 7, 9, 11],
 'min_child_weight': [1, 3, 5, 7],
 'subsample': [0.6, 0.8, 1.0],
 'colsample_bytree': [0.6, 0.8, 1.0],
 'reg_lambda': [0.1, 1.0, 5.0, 10.0],
 'reg_alpha': [0, 0.1, 0.5, 1.0],
 'booster': ['gbtree'],
 'random_state': [42]}

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Base estimator; the hyper-parameters under study are supplied by the search.
xgb = XGBRegressor(objective='reg:squarederror')

# Randomized search over xgb_param_grid: 50 sampled combinations, each scored
# by R² with 3-fold cross-validation (150 fits in total).
xgb_random_search = RandomizedSearchCV(
    xgb,
    xgb_param_grid,
    n_iter=50,
    cv=3,
    scoring='r2',
    random_state=42,   # fixes which combinations get sampled
    n_jobs=-1,         # run fits in parallel on all available cores
    verbose=2,
)

# Only the TRAIN split is used for the search.
xgb_random_search.fit(X_train, y_train)

print("Best parameters found: ", xgb_random_search.best_params_)
print("Best R² Score: ", xgb_random_search.best_score_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'subsample': 1.0, 'reg_lambda': 10.0, 'reg_alpha': 0, 'random_state': 42, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.6, 'booster': 'gbtree'}
Best R² Score:  0.8465584715207418
