# COMPUTER SCIENCE II MODEL

In [None]:
#Importing the libraries and the datasets 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import RandomOverSampler
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
# Load the pre-split feature tables. Each CSV carries the index column that
# was written out when the data was exported ('Unnamed: 0'); it is dropped
# straight away so only the real features remain.
df_X_train = pd.read_csv("X_train_cs.csv").drop(columns=['Unnamed: 0'])
df_X_test = pd.read_csv("X_test_cs.csv").drop(columns=['Unnamed: 0'])

# Load the target tables, cast every column to int, then drop the exported
# index column in the same way.
df_y_train = pd.read_csv("y_train_cs.csv").astype(int).drop(columns=['Unnamed: 0'])
df_y_test = pd.read_csv("y_test_cs.csv").astype(int).drop(columns=['Unnamed: 0'])

# Flatten the target frames into 1-D arrays for the estimators
y_train = np.ravel(df_y_train)
y_test = np.ravel(df_y_test)

In [None]:
# Count the samples in each class straight from the training labels instead of
# relying on a hand-typed placeholder. sort_index() orders the counts by class
# label, so for the 0/1 labels used in this notebook the list reads
# [count of class 0, count of class 1].
class_counts = df_y_train['Computer science II'].value_counts().sort_index().tolist()

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(df_X_train)
X_train_scaler, X_test_scaler = (
    scaler.transform(frame) for frame in (df_X_train, df_X_test)
)

In [None]:
# Baseline network: a first set of predictions to feed the evaluation metrics.
# NOTE(review): the rule of thumb quoted for this project
# (2/3 of inputs(X) + outputs(y) = 31 neurons) is not applied here - no
# hidden_layer_sizes is passed, so the estimator's default architecture is
# used. Confirm that this is intended.
mlp = MLPClassifier(random_state=7).fit(X_train_scaler, y_train)
y_pred = mlp.predict(X_test_scaler)
print(classification_report(y_test, y_pred, target_names=['FAIL', 'PASS']))

In [None]:
# First grid search: refine the hyperparameters using small hidden layers
# centred on the rule-of-thumb number of neurons.
mlp_gs = MLPClassifier()
hyperparams = dict(
    hidden_layer_sizes=[(width,) for width in range(5, 31, 5)],  # (5,), (10,), ..., (30,)
    activation=['tanh', 'logistic', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[1e-6],
    learning_rate_init=[1e-4, 1e-3],
    max_iter=[1000],
    early_stopping=[True],
    random_state=[7],
)

# Score with F1 computed for label 0 as the positive class, so the search
# prioritises the outcome where the student fails.
scorer = make_scorer(f1_score, pos_label=0)

gs = GridSearchCV(
    mlp_gs,
    hyperparams,
    scoring=scorer,
    cv=5,
    return_train_score=True,
)
gs.fit(X_train_scaler, y_train)

# Last expression of the cell: shows the winning combination
gs.best_params_

In [None]:
# Second grid search: much wider hidden layers, to test whether the
# rule-of-thumb size is actually efficient for this case.
# NOTE(review): 700 is absent from the otherwise regular 550..950 sequence -
# confirm whether that is deliberate.
mlp_gs = MLPClassifier()
hyperparams = dict(
    hidden_layer_sizes=[(width,) for width in (550, 600, 650, 750, 800, 850, 900, 950)],
    activation=['tanh'],
    solver=['adam', 'sgd'],
    alpha=[1e-6],
    learning_rate_init=[1e-5, 1e-4],
    max_iter=[1000],
    early_stopping=[True],
    random_state=[7],
)

# Score with F1 computed for label 0 as the positive class, so the search
# prioritises the outcome where the student fails.
scorer = make_scorer(f1_score, pos_label=0)

gs = GridSearchCV(
    mlp_gs,
    hyperparams,
    scoring=scorer,
    cv=5,
    return_train_score=True,
)
gs.fit(X_train_scaler, y_train)

In [None]:
# Rank the cross-validation results of the grid search (best rank first)
res = pd.DataFrame(gs.cv_results_)
res_sorted = res.sort_values(by='rank_test_score',ascending=True)

# Keep the five best rows and only the columns needed for the summary table.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of res_sorted (which raises SettingWithCopyWarning).
res_sorted_top = res_sorted[[
    'param_activation',
    'param_alpha',
    'param_hidden_layer_sizes',
    'param_learning_rate_init',
    'param_solver',
    'mean_test_score',
]].head().copy()

# F1 scores entered by hand, one per row of the top five.
# NOTE(review): presumably obtained by evaluating each configuration
# separately; they are not recomputed here - confirm they still match the
# ranking above after any re-run.
res_sorted_top['f1_score'] = [0.50,0.48,0.47,0.49,0.48]
res_sorted_top['mean_test_score'] = res_sorted_top['mean_test_score'].round(3)
res_sorted_top

In [None]:
# Train a network with the hyperparameters chosen from the grid search and
# report its performance on the held-out test split.
mlp_optimized = MLPClassifier(
    hidden_layer_sizes=(900,),
    activation='tanh',
    solver='adam',
    alpha=1e-6,
    learning_rate_init=1e-5,
    max_iter=1000,
    early_stopping=True,
    random_state=7,
)
mlp_optimized.fit(X_train_scaler, y_train)
y_pred = mlp_optimized.predict(X_test_scaler)
print(classification_report(y_test, y_pred, target_names=['FAIL', 'PASS']))

In [None]:
# Over-sample the (already scaled) training data. The result is stored under
# new names so the original training set stays available for comparison.
# NOTE(review): this resampled set is later fed to a cross-validated search;
# duplicated rows may then appear in both the fitting and the validation part
# of a fold, which can inflate the CV score - consider resampling inside the
# cross-validation loop instead.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(
    X_train_scaler,
    y_train,
)

In [None]:
# Grid search on the over-sampled training data to refine the hyperparameters
mlp_gs = MLPClassifier()
hyperparams = dict(
    hidden_layer_sizes=[(width,) for width in (50, 100, 250, 500, 750, 1000)],
    activation=['relu'],
    solver=['adam'],
    alpha=[1e-6, 1e-5, 1e-4],
    learning_rate_init=[1e-3],
    max_iter=[1000],
    early_stopping=[True],
    random_state=[7],
)

# Score with F1 computed for label 0 as the positive class, so the search
# prioritises the outcome where the student fails.
scorer = make_scorer(f1_score, greater_is_better=True, pos_label=0)

gs_res = GridSearchCV(
    mlp_gs,
    hyperparams,
    scoring=scorer,
    cv=5,
    return_train_score=True,
)
gs_res.fit(X_resampled, y_resampled)

In [None]:
# Rank the cross-validation results of the over-sampled grid search (best rank first)
res = pd.DataFrame(gs_res.cv_results_)
res_sorted = res.sort_values(by='rank_test_score',ascending=True)

# Keep the five best rows and only the columns needed for the summary table.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of res_sorted (which raises SettingWithCopyWarning).
res_sorted_top = res_sorted[[
    'param_activation',
    'param_alpha',
    'param_hidden_layer_sizes',
    'param_learning_rate_init',
    'param_solver',
    'mean_test_score',
]].head().copy()

# F1 scores entered by hand, one per row of the top five.
# NOTE(review): presumably obtained by evaluating each configuration
# separately; they are not recomputed here - confirm they still match the
# ranking above after any re-run.
res_sorted_top['f1_score'] = [0.43,0.43,0.43,0.45,0.46]
res_sorted_top['mean_test_score'] = res_sorted_top['mean_test_score'].round(3)
res_sorted_top

In [None]:
# Train a network on the over-sampled data with the hyperparameters chosen
# from the grid search, then report its performance on the untouched test split.
mlp_resampled = MLPClassifier(
    hidden_layer_sizes=(500,),
    activation='relu',
    solver='adam',
    alpha=1e-6,
    learning_rate_init=1e-3,
    max_iter=1000,
    early_stopping=True,
    random_state=7,
)
mlp_resampled.fit(X_resampled, y_resampled)
y_pred = mlp_resampled.predict(X_test_scaler)
print(classification_report(y_test, y_pred, target_names=['FAIL', 'PASS']))

In [None]:
# Alternative method for comparison: k-nearest neighbours.
# It is left at its default settings because it is not the technique being
# studied in this project.
kn = KNeighborsClassifier().fit(X_train_scaler, y_train)
y_pred_kn = kn.predict(X_test_scaler)
print(classification_report(y_test, y_pred_kn, target_names=['FAIL', 'PASS']))

In [None]:
# Grid search for k-nearest neighbours: refine the number of neighbours
kn = KNeighborsClassifier()
param = dict(
    n_neighbors=list(range(3, 32, 2)),  # odd values 3, 5, ..., 31
)

# Score with F1 computed for label 0 as the positive class, so the search
# prioritises the outcome where the student fails.
scorer = make_scorer(f1_score, pos_label=0)

gs_kn = GridSearchCV(
    kn,
    param,
    scoring=scorer,
    cv=5,
    return_train_score=True,
)
gs_kn.fit(X_train_scaler, y_train)

In [None]:
# Rank the k-nearest-neighbours cross-validation results (best rank first)
res_kn = pd.DataFrame(gs_kn.cv_results_)
res_sorted_kn = res_kn.sort_values(by='rank_test_score',ascending=True)

# Keep the five best rows. The rows are ranked by the validation score, so
# mean_test_score is reported (as in the MLP summary tables); the training
# score is kept alongside it for reference.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of the sorted results (avoids SettingWithCopyWarning).
res_sorted_kn = res_sorted_kn[[
    'param_n_neighbors',
    'mean_test_score',
    'mean_train_score',
]].head().copy()

# F1 scores entered by hand, one per row of the top five.
# NOTE(review): presumably obtained by evaluating each configuration
# separately; they are not recomputed here - confirm they still match the
# ranking above after any re-run.
res_sorted_kn['f1_score'] = [0.33,0.37,0.26,0.30,0.34]
res_sorted_kn['mean_test_score'] = res_sorted_kn['mean_test_score'].round(3)
res_sorted_kn['mean_train_score'] = res_sorted_kn['mean_train_score'].round(3)
res_sorted_kn

In [None]:
# Tuned k-nearest-neighbours model, evaluated on the held-out test split
kn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaler, y_train)
y_pred_kn = kn.predict(X_test_scaler)
print(classification_report(y_test, y_pred_kn, target_names=['FAIL', 'PASS']))