# Model Training

This pre-processing recieves a .xlsx file and returns a processed.xlsx file. 

In [1]:
import os
os.chdir('..')

In [33]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from src.data_preprocessing import df_construct, add_eng_values, alter_term_gender
from src.model_training import build_preprocessor, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from src.model_training import save_model

In [3]:
excel_file = pd.ExcelFile('data/raw_data/D2lData.xlsx')

# Reading each sheet into a DataFrame
df_d2l = pd.read_excel(excel_file, 'd2l')
df_demo = pd.read_excel(excel_file, 'demographics')
df_grades = pd.read_excel(excel_file, 'grades')

In [4]:
df = df_construct(df_d2l, df_demo, df_grades)

In [5]:
# Save this cleaned dataframe to the data/processed_data folder for future use.
df.to_csv('data/processed_data/df_cleaned.csv', index=False)

In [6]:
# Use this code in the event that it's needed
# df = pd.read_csv('data/processed_data/df_cleaned.csv')

In [7]:
X = df.drop('at_risk', axis=1)
y = df.at_risk

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

In [10]:
preprocessor = build_preprocessor(numeric_features, categorical_features)

## Model 1: RandomForestClassifier

In [11]:
model_rfc = RandomForestClassifier()
pipeline_rfc = build_full_pipeline(preprocessor, model_rfc)

In [12]:
# Fit the pipeline to your training data
pipeline_rfc.fit(X_train, y_train)

In [13]:
# Predictions
y_pred_rfc = pipeline_rfc.predict(X_test)

### Evaluate RandomForestClassifier

#### Here we will check the accuracy, precision, recall, and f1-score, along with a confusion matrix

In [14]:
# Evaluate the model
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)
classification_rep_rfc = classification_report(y_test, y_pred_rfc)
conf_matrix_rfc = confusion_matrix(y_test, y_pred_rfc)

print(f"Accuracy: {accuracy_rfc}")
print("Classification Report:\n", classification_rep_rfc)
print('\nConfusion Matrix:\n', conf_matrix_rfc)

Accuracy: 0.9635193133047211
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.90      0.37      0.52        76

    accuracy                           0.96      1398
   macro avg       0.93      0.68      0.75      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1319    3]
 [  48   28]]


## Model 2: GradientBoostingClassifier

In [15]:
model_gbc = GradientBoostingClassifier()
pipeline_gbc = build_full_pipeline(preprocessor, model_gbc)

In [16]:
# Fit the pipeline to your training data
pipeline_gbc.fit(X_train, y_train)

In [17]:
# Predictions
y_pred_gbc = pipeline_gbc.predict(X_test)

### Evaluate GradientBoostingClassifier

#### Here we will check the accuracy, precision, recall, and f1-score, along with a confusion matrix

In [18]:
# Evaluate the model
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
classification_rep_gbc = classification_report(y_test, y_pred_gbc)
conf_matrix_gbc = confusion_matrix(y_test, y_pred_gbc)

print(f"Accuracy: {accuracy_gbc}")
print("Classification Report:\n", classification_rep_gbc)
print('\nConfusion Matrix:\n', conf_matrix_gbc)

Accuracy: 0.9670958512160229
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1322
           1       0.84      0.49      0.62        76

    accuracy                           0.97      1398
   macro avg       0.91      0.74      0.80      1398
weighted avg       0.96      0.97      0.96      1398


Confusion Matrix:
 [[1315    7]
 [  39   37]]


## Observations

- We can see that, while the accuracy is very high, we do not have the recall values we'd like to see for the under-represented group.
- In this instance, since we aim to identify at-risk students, we would be willing to risk having more False Positives in order to minimize on the False Negatives.  In other words, it's far less problematic to have a student who is not at-risk be incorrectly identified as at risk than it is to have students who are actually at-risk not identified.
- Let's check some other classification models to find out how they perform out of the box on this imbalanced dataset.


## Testing Other Classifiers

#### We've also included a dictionary that logs which entries(rows) are False Negatives for each of the classifiers.  Since this dataset is quite imbalanced, we will need to try various techniques in order to increase the recall rate.  By collecting the false negatives, we can hopefully get an idea of why these entries are difficult to classify correctly.

In [38]:
# Define classifiers
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'RidgeRegression': RidgeClassifier(),
    'NaiveBayes': GaussianNB(),
    'NeuralNetwork': MLPClassifier(),
    'XGBoost': xgb.XGBClassifier()
    
}

# Create an empty dictionary to store False Negatives for each classifier
false_negatives_dict = {}

# Create an empty list to store DataFrames for each classifier
false_negatives_dfs = []

# Iterate through classifiers
for clf_name, clf in classifiers.items():
    pipeline = build_full_pipeline(preprocessor, clf)
    grid_search = GridSearchCV(pipeline, {}, cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)

    # Identify False Negatives
    false_negatives = (y_test == 1) & (y_pred == 0)

    # Extract indices of False Negatives
    fn_indices = np.where(false_negatives)[0]
    
    # Extract False Negatives entries from the DataFrame
    fn_entries = X_test[false_negatives]

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Append False Negatives entries to the list
    fn_entries['Classifier'] = clf_name
    false_negatives_dfs.append(fn_entries)
    
    # Store False Negatives indices in the dictionary
    false_negatives_dict[clf_name] = fn_indices

    print(f"{clf_name}\n")
    print(f"Accuracy: {accuracy}\n")
    print("Classification Report:\n", classification_rep)
    print('Confusion Matrix:\n', conf_matrix)
    print(f'False Negatives Indices: {fn_indices}')
    print("------------------------------------------------------------------\n")

# Concatenate all DataFrames into a single DataFrame
false_negatives_df = pd.concat(false_negatives_dfs, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


RandomForest

Accuracy: 0.9656652360515021

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.94      0.39      0.56        76

    accuracy                           0.97      1398
   macro avg       0.95      0.70      0.77      1398
weighted avg       0.96      0.97      0.96      1398

Confusion Matrix:
 [[1320    2]
 [  46   30]]
False Negatives Indices: [   3   20   40  132  138  151  178  201  233  307  325  392  412  457
  481  539  542  578  668  696  698  716  734  778  792  794  891  902
  906  962  979 1038 1059 1061 1105 1108 1241 1245 1250 1272 1283 1318
 1327 1341 1378 1396]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


GradientBoosting

Accuracy: 0.9670958512160229

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1322
           1       0.84      0.49      0.62        76

    accuracy                           0.97      1398
   macro avg       0.91      0.74      0.80      1398
weighted avg       0.96      0.97      0.96      1398

Confusion Matrix:
 [[1315    7]
 [  39   37]]
False Negatives Indices: [   3   20   40  138  151  178  201  233  325  392  412  457  481  539
  542  583  668  696  698  716  734  778  792  794  962  979 1059 1061
 1108 1139 1245 1250 1272 1283 1318 1327 1341 1378 1396]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


SVM

Accuracy: 0.949928469241774

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1322
           1       1.00      0.08      0.15        76

    accuracy                           0.95      1398
   macro avg       0.97      0.54      0.56      1398
weighted avg       0.95      0.95      0.93      1398

Confusion Matrix:
 [[1322    0]
 [  70    6]]
False Negatives Indices: [   3   13   20   30   40   41  132  138  151  178  201  233  279  307
  309  325  389  392  409  412  417  457  481  527  539  542  558  578
  583  668  678  696  698  716  734  778  792  794  848  891  894  902
  906  916  962  979 1038 1059 1061 1069 1096 1103 1105 1108 1117 1139
 1179 1241 1245 1249 1250 1272 1275 1283 1318 1327 1341 1346 1378 1396]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


KNN

Accuracy: 0.9535050071530758

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      1322
           1       0.64      0.33      0.43        76

    accuracy                           0.95      1398
   macro avg       0.80      0.66      0.71      1398
weighted avg       0.94      0.95      0.95      1398

Confusion Matrix:
 [[1308   14]
 [  51   25]]
False Negatives Indices: [   3   13   20   30   40  132  138  151  178  201  233  307  325  392
  409  412  457  481  527  539  542  583  668  696  698  716  734  778
  792  794  891  902  906  962  979 1038 1059 1061 1103 1105 1108 1241
 1245 1250 1272 1283 1318 1327 1341 1378 1396]
------------------------------------------------------------------



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


LogisticRegression

Accuracy: 0.9556509298998569

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      1322
           1       0.71      0.32      0.44        76

    accuracy                           0.96      1398
   macro avg       0.83      0.65      0.71      1398
weighted avg       0.95      0.96      0.95      1398

Confusion Matrix:
 [[1312   10]
 [  52   24]]
False Negatives Indices: [   3   20   30   40  132  138  151  178  201  233  307  325  392  409
  412  457  481  527  539  542  578  583  696  698  716  734  778  792
  794  848  891  902  962  979 1059 1061 1103 1105 1108 1117 1241 1245
 1249 1250 1272 1283 1318 1327 1341 1346 1378 1396]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


RidgeRegression

Accuracy: 0.9556509298998569

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      1322
           1       0.73      0.29      0.42        76

    accuracy                           0.96      1398
   macro avg       0.85      0.64      0.70      1398
weighted avg       0.95      0.96      0.95      1398

Confusion Matrix:
 [[1314    8]
 [  54   22]]
False Negatives Indices: [   3   20   40  132  138  151  178  201  233  307  309  325  392  409
  412  457  481  527  539  542  578  583  668  696  698  716  734  778
  792  794  848  891  902  962  979 1038 1059 1061 1103 1105 1108 1139
 1179 1241 1245 1249 1250 1272 1283 1318 1327 1341 1378 1396]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


NaiveBayes

Accuracy: 0.5572246065808297

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.54      0.70      1322
           1       0.10      0.86      0.17        76

    accuracy                           0.56      1398
   macro avg       0.54      0.70      0.44      1398
weighted avg       0.94      0.56      0.67      1398

Confusion Matrix:
 [[714 608]
 [ 11  65]]
False Negatives Indices: [ 392  440  583  673 1059 1061 1241 1245 1249 1250 1318]
------------------------------------------------------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


NeuralNetwork

Accuracy: 0.9670958512160229

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.75      0.59      0.66        76

    accuracy                           0.97      1398
   macro avg       0.86      0.79      0.82      1398
weighted avg       0.96      0.97      0.97      1398

Confusion Matrix:
 [[1307   15]
 [  31   45]]
False Negatives Indices: [   3  132  138  178  201  307  325  392  457  527  539  583  668  698
  716  734  778  792  979 1059 1061 1108 1241 1245 1249 1250 1283 1318
 1327 1341 1378]
------------------------------------------------------------------

XGBoost

Accuracy: 0.9721030042918455

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      1322
           1       0.86      0.58      0.69        76

    accuracy                           0.97      1398
   macro avg       0.92  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fn_entries['Classifier'] = clf_name


## Observations
- Again, we see that most models have a very high accuracy, but many have a recall rate of less than 50% for the under-represented class.
- Naive Bayes has the best Recall, but this comes at the cost of having a significant decrease to accuracy and recall for the over-represented class.  While it is unfeasible to misclassify half of the student population as at-risk, maybe we can leverage this model's predictions in an ensemble meta model or something.
- The Nueral Network and XGBoost models are showing some signs of hope as well, with recall values of 62% and 58%, respectively.

### Moving Forward
- We will have to try tuning some of these models, combined with other methods that can deal with imbalanced datasets in our hyperparameter_tuning notebook.
- Before the hyperparameter tuning, we will take a closer look at the dictionary of false negatives we created to determine if we can find out more about the entries that are not being classified properly.  This investigation can be found in the EDA notebook.

In [40]:
# Save false_negatives_dict and df to a file in order to use in the EDA notebook
with open('data/processed_data/false_negatives_dict.pkl', 'wb') as file:
    pickle.dump(false_negatives_dict, file)
    
false_negatives_df.to_csv('data/processed_data/false_negatives_df.csv', index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## Save The Model

#### If there is any model worth saving, we can pickle it for future use:

In [13]:
# Save the trained model
save_model(full_pipeline, file_path='models/trained_model_rfc_base.pkl')

## Creating the Meta Model

#### This model combines the top 3 models that had the least amount of overlap for false negatives.  This evidence can be found in the EDA notebook.  The three models used here are XGBoost, Naive Bayes, and Neural Net.  From above, we can see that both the Neural Net and XGBoost had a couple of the better recall scores for the under-represented class, except for the Naive Bayes, which had the best recall score overall (at the cost of many False Positives).

#### This first pass will not include any hyperparameter tuning to get a baseline of what we can achieve with this stack.

In [45]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Generate Predictions
nb_model = build_full_pipeline(preprocessor,GaussianNB())
nn_model = build_full_pipeline(preprocessor,MLPClassifier())
xgb_model = build_full_pipeline(preprocessor,XGBClassifier())

X_train_base, X_meta, y_train_base, y_meta = train_test_split(X, y, test_size=0.2)

nb_model.fit(X_train_base, y_train_base)
nn_model.fit(X_train_base, y_train_base)
xgb_model.fit(X_train_base, y_train_base)

pred_nb = nb_model.predict(X_meta)
pred_nn = nn_model.predict(X_meta)
pred_xgb = xgb_model.predict(X_meta)

# Step 2: Create Meta-Features
meta_features = np.column_stack((pred_nb, pred_nn, pred_xgb))

# Step 3: Train Meta-Ensemble Model
meta_ensemble_model = RandomForestClassifier()
meta_ensemble_model.fit(meta_features, y_meta)

# Step 4: Make Ensemble Predictions
pred_ensemble_nb = nb_model.predict(X_test)
pred_ensemble_nn = nn_model.predict(X_test)
pred_ensemble_xgb = xgb_model.predict(X_test)

meta_features_test = np.column_stack((pred_ensemble_nb, pred_ensemble_nn, pred_ensemble_xgb))
ensemble_predictions = meta_ensemble_model.predict(meta_features_test)

# Step 5: Evaluate Performance
accuracy = accuracy_score(y_test, ensemble_predictions)
classification_rep = classification_report(y_test, ensemble_predictions)
conf_matrix = confusion_matrix(y_test, ensemble_predictions)
print(f"Accuracy: {accuracy}\n")
print("Classification Report:\n", classification_rep)
print('Confusion Matrix:\n', conf_matrix)



Accuracy: 0.9957081545064378

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1322
           1       0.99      0.93      0.96        76

    accuracy                           1.00      1398
   macro avg       0.99      0.97      0.98      1398
weighted avg       1.00      1.00      1.00      1398

Confusion Matrix:
 [[1321    1]
 [   5   71]]


### Results:
- This is an impressive baseline model with a recall of 93% on the minority class.