Classification Model to Predict Program Performance Score

In [31]:
import sys
import os
import warnings

# Silence Python warnings for this session, but only when the interpreter was
# started without explicit -W options (sys.warnoptions is empty in that case).
# PYTHONWARNINGS is exported as well — presumably so that child processes
# inherit the same setting; confirm.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so pick whichever name this install has.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [32]:
# Load the feature table and rescale it with scikit-learn's MinMaxScaler
# (default settings). `x_data` keeps the column names; `x` is the raw matrix.
# NOTE(review): the scaler is fitted on every row here, before
# train_and_evaluate_model makes its train/test split, so held-out rows
# influence the scaling.
x_data = pd.read_csv("x_all.csv")
x = x_data.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(x)
x_scaled = min_max_scaler.transform(x)
    

In [33]:
def find_x_scaled_sample(sample: list, offset: int = -2):
    """
    Locate ``sample`` among the rows of the module-level ``x_scaled`` data.

    Only the first ``len(sample)`` values of each row are compared, using
    exact equality. The first matching row wins.

    Parameters
    ----------
    sample : list
        Feature values to look for.
    offset : int, default -2
        Value added to the matching row position before it is returned.
        NOTE(review): the default keeps the shift that used to be hard-coded
        as ``i - 2``; why the position is shifted is not visible here, so
        confirm it against how rows are numbered in the source data. Pass
        ``offset=0`` to get the plain row position.

    Returns
    -------
    int or None
        Matching row position plus ``offset``, or ``None`` when no row matches.
    """
    width = len(sample)
    for i, row in enumerate(x_scaled):
        # all() stops at the first differing value instead of scanning the whole row.
        if all(sample[j] == row[j] for j in range(width)):
            return i + offset
    return None

In [34]:
# Load the target table and flatten it into a 1-D label array.
y_data = pd.read_csv("y_all.csv")
y = np.ravel(y_data.to_numpy())

In [35]:
def get_best_model_parameters(model, x, y, cv, parameters):
    """
    Grid-search ``parameters`` for ``model`` and return the winning configuration.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance); it is instantiated with no arguments.
    x, y :
        Features and labels handed to ``GridSearchCV.fit``.
    cv :
        Cross-validation setting forwarded to ``GridSearchCV`` (previously it
        was accepted but never used, so the library default applied).
    parameters : dict
        Maps each parameter name to the list of candidate values to try.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator, i.e. every constructor
        parameter of the estimator, not only the ones that were searched.
    """
    grid_search = GridSearchCV(model(), parameters, cv=cv, n_jobs=-1)
    grid_search.fit(x, y)
    print("Best parameters combination found:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the searched parameters, in alphabetical order.
    for param_name in sorted(parameters.keys()):
        print(f"{param_name}: {best_parameters[param_name]}")

    return best_parameters

def train_and_evaluate_model(model, x, y, test_size):
    """
    Fit ``model`` on a train split of ``(x, y)`` and print an evaluation report.

    The split uses a fixed ``random_state`` of 50. The report contains the
    model's score on the training split, the mean and standard deviation of
    ``cross_val_score`` on the training split, the score on the test split,
    and one "Predicted/Actual" line per test sample. For every test sample
    whose prediction differs from its label, the result of
    ``find_x_scaled_sample`` for that sample is printed as well.

    Nothing is returned; ``model`` is left fitted on the training split.
    """
    split = train_test_split(x, y, test_size=test_size, random_state=50)
    x_train, x_test, y_train, y_test = split

    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    print(f"Training Score: {train_score}")

    cv_scores = cross_val_score(model, x_train, y_train)
    print(f"Average Cross Validation Score: {cv_scores.mean()}, Standard Deviation: {cv_scores.std()}")

    test_score = model.score(x_test, y_test)
    print(f"Testing Score: {test_score}")

    predictions = model.predict(x_test)
    for idx, predicted in enumerate(predictions):
        actual = y_test[idx]
        print(f"Predicted: {predicted}, Actual: {actual}")
        if predicted == actual:
            continue
        # Misclassified: show which row of the scaled data this sample is.
        print(find_x_scaled_sample(list(x_test[idx])))

In [43]:
# Candidate hyper-parameters for the MLP grid search; every combination of the
# lists below is tried.
NN_parameters = {
    # Network shape: one entry per hidden layer, value = units in that layer.
    "hidden_layer_sizes": [
        (10,), (10, 10), (10, 10, 10),
        (15, 15),
        (20, 20),
        (25,), (25, 25),
        (50,),
    ],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.00001, 0.0001, 0.001, 0.01],
    "max_iter": [200, 500, 1000, 2000],
}
NN_best_parameters = get_best_model_parameters(
    model=MLPClassifier,
    x=x_scaled,
    y=y,
    cv=5,
    parameters=NN_parameters,
)

Best parameters combination found:
activation: tanh
alpha: 0.001
hidden_layer_sizes: (25, 25)
max_iter: 1000
solver: lbfgs


In [44]:
# Rebuild the network from the five searched hyper-parameters (everything else
# stays at the MLPClassifier defaults), then evaluate it on an 80/20 split.
NN_best = MLPClassifier(
    **{
        name: NN_best_parameters[name]
        for name in ("activation", "alpha", "hidden_layer_sizes", "solver", "max_iter")
    }
)

train_and_evaluate_model(model=NN_best, x=x_scaled, y=y, test_size=0.2)

Training Score: 1.0
Average Cross Validation Score: 0.7764705882352941, Standard Deviation: 0.0685994340570035
Testing Score: 0.6818181818181818
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 3
40
Predicted: 3, Actual: 4
23
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 3, Actual: 4
73
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 1, Actual: 2
45
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 2, Actual: 3
42
Predicted: 4, Actual: 2
100
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 1, Actual: 2
101
Predicted: 4, Actual: 4
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3


In [38]:
# Candidate hyper-parameters for the logistic-regression grid search; every
# combination of the lists below is tried.
# NOTE(review): per the scikit-learn docs not every solver supports the 'l1'
# penalty, so some of these combinations cannot be fitted — confirm how
# the search treats them in the installed version.
LR_parameters = {
    "C": [
        0.01, 0.1, 1.0,
        2.0, 3.0, 4.0, 5.0,
        10.0, 20.0, 30.0, 40.0, 50.0,
    ],
    "penalty": ["l1", "l2"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
    "max_iter": [100, 150, 200, 400, 800, 1600],
    "class_weight": [None, "balanced"],
    "warm_start": [False, True],
}
LR_best_paramters = get_best_model_parameters(
    model=LogisticRegression,
    x=x_scaled,
    y=y,
    cv=5,
    parameters=LR_parameters,
)

Best parameters combination found:
C: 1.0
class_weight: balanced
max_iter: 400
penalty: l1
solver: saga
warm_start: False


In [39]:
# Rebuild the classifier from every hyper-parameter that was part of the grid
# search. `warm_start` was searched too but used to be left out here, so the
# rebuilt model could differ from the configuration the search selected.
LR_best = LogisticRegression(
    C=LR_best_paramters['C'],
    class_weight=LR_best_paramters['class_weight'],
    max_iter=LR_best_paramters['max_iter'],
    penalty=LR_best_paramters['penalty'],
    solver=LR_best_paramters['solver'],
    warm_start=LR_best_paramters['warm_start'],
)

# Fit on an 80/20 split and print the evaluation report.
train_and_evaluate_model(LR_best, x_scaled, y, 0.2)

Training Score: 0.7176470588235294
Average Cross Validation Score: 0.6470588235294119, Standard Deviation: 0.05261336417646565
Testing Score: 0.5909090909090909
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 1, Actual: 3
40
Predicted: 3, Actual: 4
23
Predicted: 1, Actual: 2
63
Predicted: 2, Actual: 3
60
Predicted: 4, Actual: 4
Predicted: 1, Actual: 2
72
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 3, Actual: 3
Predicted: 4, Actual: 2
100
Predicted: 2, Actual: 2
Predicted: 1, Actual: 2
15
Predicted: 1, Actual: 2
101
Predicted: 4, Actual: 4
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 2, Actual: 3
59


In [40]:
# Print, for each row of the fitted coefficient matrix, every feature column
# name next to its weight.
# NOTE(review): the heading uses row position + 1 as the class label, which
# assumes the labels are 1..n in order — confirm against LR_best.classes_.
for i, class_coefs in enumerate(LR_best.coef_):
    print(f"Class: {i+1}")
    weights = list(class_coefs)
    for j, column in enumerate(x_data.columns):
        print(column, weights[j])
    print("")

Class: 1
total-operators 0.0
distinct-operators 0.0
total-operands 0.0
distinct-operands 0.0
function-calls 0.0
loops 0.0
assignments 0.0
max-cyclomatic-complexity -5.225546256908057
sum-cyclomatic-complexity 0.0
dynamic-memory-calls 0.0
file-access 0.0
file-operation 0.0
file-position 0.0
input-output 0.0
optimization -2.410147972295971

Class: 2
total-operators 0.0
distinct-operators 0.0
total-operands 0.0
distinct-operands 0.0
function-calls 0.0
loops -0.6671959594230252
assignments 0.0
max-cyclomatic-complexity 0.0
sum-cyclomatic-complexity 0.0
dynamic-memory-calls 0.0
file-access 0.0
file-operation 0.0
file-position 0.0
input-output 0.0
optimization 0.0

Class: 3
total-operators 0.0
distinct-operators 0.0
total-operands 0.0
distinct-operands 0.0
function-calls 0.0
loops 0.0
assignments 0.0
max-cyclomatic-complexity 0.0
sum-cyclomatic-complexity 0.0
dynamic-memory-calls 1.4979389242564038
file-access 0.0
file-operation 0.0
file-position 0.0
input-output 5.770605950714671
optimizati