# Model Training

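This notebook receives a .xlsx file, pre-processes it into a cleaned dataset (saved as `data/processed_data/df_cleaned.csv`), and then trains and evaluates a model that predicts `at_risk`.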

In [2]:
import os
# Move up to the project root so the relative `data/...` paths and the `src` package resolve.
# Guarded so that re-running this cell does not keep climbing one directory per execution:
# if `src` is already visible from the current directory, we are already at the root.
if not os.path.isdir('src'):
    os.chdir('..')

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.data_preprocessing import df_construct, add_eng_values, alter_term_gender
from src.model_training import build_preprocessor, build_model, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Open the raw workbook once and read each sheet into its own DataFrame.
# The context manager closes the underlying file handle as soon as the sheets are loaded,
# instead of leaving it open for the lifetime of the kernel.
with pd.ExcelFile('data/raw_data/D2lData.xlsx') as excel_file:
    df_d2l = pd.read_excel(excel_file, sheet_name='d2l')
    df_demo = pd.read_excel(excel_file, sheet_name='demographics')
    df_grades = pd.read_excel(excel_file, sheet_name='grades')

In [5]:
# Build the modelling dataframe from the three sheets (d2l, demographics, grades).
# NOTE(review): df_construct is imported from src.data_preprocessing; presumably it merges
# and cleans the sheets — confirm there. The result must contain an `at_risk` column,
# which is used as the prediction target further down.
df = df_construct(df_d2l, df_demo, df_grades)

In [6]:
# Save this cleaned dataframe to the data/processed_data folder for future use.
# Create the folder first so the write does not fail on a fresh checkout where it is missing.
os.makedirs('data/processed_data', exist_ok=True)
df.to_csv('data/processed_data/df_cleaned.csv', index=False)

In [7]:
# Optional: uncomment to reload the previously saved cleaned dataframe instead of rebuilding it from the raw workbook.
# df = pd.read_csv('data/processed_data/df_cleaned.csv')

In [8]:
# Separate the prediction target (`at_risk`) from the feature columns.
y = df['at_risk']
X = df.drop(columns='at_risk')

In [9]:
# Hold out a fixed share of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the evaluation output suggests the classes are imbalanced — consider
# passing stratify=y so both splits keep the same class ratio (this would change the split).
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [10]:
# Split the training columns by dtype so each group can get its own preprocessing.
numeric_columns = X_train.select_dtypes(include='number').columns
categorical_columns = X_train.select_dtypes(include='object').columns
numeric_features = numeric_columns.to_list()
categorical_features = categorical_columns.to_list()

# Assemble the pieces: preprocessor and model first, then the pipeline that chains them.
# NOTE(review): the build_* helpers come from src.model_training; their internals are not
# visible here — check that module for the actual transformers and estimator used.
preprocessor = build_preprocessor(numeric_features, categorical_features)
model = build_model()
full_pipeline = build_full_pipeline(preprocessor, model)

In [11]:
# Fit the full pipeline (preprocessor + model) on the training split only;
# the test split is kept aside for the evaluation step.
full_pipeline.fit(X_train, y_train)

In [12]:
# Predict labels for the held-out test split; y_pred is compared against y_test
# in the evaluation section.
y_pred = full_pipeline.predict(X_test)

## Evaluate The Model

#### Here we will check the accuracy, precision, recall, and f1-score, along with a confusion matrix

In [13]:
# Score the test-set predictions and print each metric right after it is computed.
# Accuracy alone can be misleading when one class dominates, so the per-class
# precision/recall report and the confusion matrix are shown alongside it.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

conf_matrix = confusion_matrix(y_test, y_pred)
print('\nConfusion Matrix:\n', conf_matrix)

Accuracy: 0.9656652360515021
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.97      0.38      0.55        76

    accuracy                           0.97      1398
   macro avg       0.97      0.69      0.76      1398
weighted avg       0.97      0.97      0.96      1398


Confusion Matrix:
 [[1321    1]
 [  47   29]]
