# Model selection

- In this notebook, I will use RandomForest, AdaBoost, and XGBoost for model training.
- I will compare their RMSE values and decide the best model to work with.
- The conclusion is that XGBoost performs best in terms of achieving the smallest RMSE value.

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_squared_error, make_scorer
from sklearn.tree import DecisionTreeRegressor

# Read the combined Massachusetts AP dataset from disk
MA_AP_all_data = pd.read_csv('../data/Massachusetts/AP_data_combined_18_22.csv')

# Remove the columns that are not used for modeling
data_MA_inference = MA_AP_all_data.drop(
    columns=['COUNTY', 'District Code', 'Year', 'Tests Taken']
)

# After the drop, the first remaining column is treated as the target
# and every column after it as a predictor.
outcome, features = data_MA_inference.columns[0], data_MA_inference.columns[1:]

y = data_MA_inference[outcome]
X = data_MA_inference[features]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=216
)

# First, get the best RMSE from RandomForestRegressor

In [2]:
print('RandomForestRegressor')

# Candidate hyperparameters for the random forest search
# (3 * 4 * 3 * 1 = 36 combinations).
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[4, 6, 8, 10],
    max_features=['sqrt', 'log2', None],
    random_state=[216],  # single value: pins the seed rather than tuning it
)

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`,
    so lower values indicate a better fit.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Wrap rmse as a scorer. greater_is_better=False makes scikit-learn negate
# the value, so "higher score" still means "better model" during the search.
neg_rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Base estimator; every setting that is varied (and the seed) comes from param_grid
rf_model = RandomForestRegressor()

# Exhaustive search over param_grid with 5-fold cross-validation
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring=neg_rmse_scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# best_score_ is the negated cross-validated RMSE, so flip the sign back
best_neg_rmse = grid_search.best_score_
best_params = grid_search.best_params_
best_rmse = -best_neg_rmse

print(f'Best RMSE from Grid Search: {best_rmse:.4f}')
print(f'Best Parameters: {best_params}')

# No extra training step is needed here: with GridSearchCV's default
# refit=True, best_estimator_ has already been refit on all of X_train.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set
y_pred = best_rf_model.predict(X_test)
test_rmse = rmse(y_test, y_pred)
print(f'Test RMSE with Best Model: {test_rmse:.4f}')

RandomForestRegressor
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best RMSE from Grid Search: 8.9723
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 500, 'random_state': 216}
Test RMSE with Best Model: 7.9274


# Second, get the best RMSE from AdaBoostRegressor

In [3]:
print('AdaBoostRegressor')

# AdaBoost over decision trees. The tree depth is tuned below through the
# `estimator__` prefix; the ensemble seed is fixed for repeatability.
ada_model = AdaBoostRegressor(
    random_state=216,
    estimator=DecisionTreeRegressor(),
)

# Candidate hyperparameters (4 * 5 * 3 * 5 = 300 combinations)
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 1.0],
    'loss': ['linear', 'square', 'exponential'],
    # `estimator__<name>` routes the value to the wrapped DecisionTreeRegressor
    'estimator__max_depth': [1, 2, 3, 4, 5],
}

# Exhaustive 5-fold search scored with scikit-learn's built-in negated RMSE
grid_search = GridSearchCV(
    ada_model,
    param_grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The scorer reports -RMSE, so flip the sign to recover the RMSE itself
best_neg_rmse = grid_search.best_score_
best_params = grid_search.best_params_
best_rmse = -best_neg_rmse

print(f'Best RMSE from Grid Search: {best_rmse:.4f}')
print(f'Best Parameters: {best_params}')

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on all of X_train using the winning parameters.
best_ada_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set
y_pred = best_ada_model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_pred)
print(f'Test RMSE with Best Model: {test_rmse:.4f}')

AdaBoostRegressor
Fitting 5 folds for each of 300 candidates, totalling 1500 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best RMSE from Grid Search: 9.7597
Best Parameters: {'estimator__max_depth': 5, 'learning_rate': 0.05, 'loss': 'exponential', 'n_estimators': 500}
Test RMSE with Best Model: 9.3285


# Third, get the best RMSE with XGBRegressor

In [4]:
# Label this cell's output, as the RandomForest and AdaBoost cells do, so the
# printed results can be attributed to the right model.
print('XGBRegressor')

# Candidate hyperparameters (3 * 3 * 3 * 3 * 3 = 243 combinations).
# random_state and tree_method each have a single value: they are held
# fixed for every candidate rather than tuned.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'random_state': [216],
    'tree_method': ['hist']
}

# Initialize the XGBRegressor; all varied settings come from param_grid
xgb_model = XGBRegressor()

# Initialize GridSearchCV with the built-in (negated) RMSE scorer
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best cross-validated RMSE (negative due to the scoring function)
best_neg_rmse = grid_search.best_score_
best_rmse = -best_neg_rmse  # Convert to positive RMSE

# Get the best parameters
best_params = grid_search.best_params_

print(f'Best RMSE from Grid Search: {best_rmse:.4f}')
print(f'Best Parameters: {best_params}')

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on all of X_train using the best parameters.
best_xgb_model = grid_search.best_estimator_

# Predict on the held-out test set
y_pred = best_xgb_model.predict(X_test)

# Compute RMSE on the test set using root_mean_squared_error
test_rmse = root_mean_squared_error(y_test, y_pred)
print(f'Test RMSE with Best Model: {test_rmse:.4f}')

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best RMSE from Grid Search: 8.9151
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'random_state': 216, 'subsample': 0.6, 'tree_method': 'hist'}
Test RMSE with Best Model: 7.8134


- The test-set RMSE values for the three tuned models are:

| Model               | RMSE          |
| ------------------- | ------------- |
| AdaBoost model      | 9.329         |
| Random Forest model | 7.927         |
| XGBoost model       | 7.813         |


As a result, we will be using XGBoost for the MA data analysis.