Classification Model to Predict Program Performance Score

In [70]:
import sys
import os
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8' prefix in
# Matplotlib 3.6 and the bare 'seaborn' name was removed in later releases, so
# prefer the new name and only fall back to the old one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [71]:
# Load the feature table; the DataFrame itself is kept alongside the raw array.
x_data = pd.read_csv('x_all.csv')
x = x_data.to_numpy()

# Fit a min-max scaler on the features, then apply it to the same array.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before
# train_and_evaluate_model() separates out its test split.
min_max_scaler = preprocessing.MinMaxScaler().fit(x)
x_scaled = min_max_scaler.transform(x)
    

In [72]:
def find_x_scaled_sample(sample: list):
    """
    Locate a sample among the rows of the global ``x_scaled`` array.

    Rows are scanned in order. For each row, its first ``len(sample)`` values
    are compared with ``sample`` using exact equality, and the comparison of a
    row stops at its first differing value.

    Parameters
    ----------
    sample : list
        Feature values to look for.

    Returns
    -------
    int or None
        Position of the first matching row minus 2, or ``None`` when no row
        matches.
    """
    # NOTE(review): the "- 2" offset is kept as-is; presumably it maps the row
    # position onto an external numbering scheme -- confirm, since it means the
    # returned value is not a direct index into x_scaled / x_all.
    for row_index, row in enumerate(x_scaled):
        # all() short-circuits, so a row is abandoned at the first mismatch.
        if all(sample[j] == row[j] for j in range(len(sample))):
            return row_index - 2
    return None

In [73]:
# Load the target table and flatten it into a 1-D array.
y_data = pd.read_csv('y_all.csv')
y = np.ravel(y_data.to_numpy())

In [74]:
def get_best_model_parameters(model, x, y, cv, parameters):
    """
    Grid-search ``parameters`` for ``model`` and report the winning combination.

    Parameters
    ----------
    model : callable
        Zero-argument estimator factory (e.g. an estimator class); it is called
        once to create the estimator handed to ``GridSearchCV``.
    x, y
        Features and targets passed to ``GridSearchCV.fit``.
    cv
        Cross-validation setting forwarded to ``GridSearchCV``.
    parameters : dict
        Parameter grid to search; its keys are also the names that get printed.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator, i.e. all of its parameters and
        not only the ones listed in ``parameters``.
    """
    # cv has to be forwarded explicitly; otherwise GridSearchCV silently uses
    # its own default and the caller's argument has no effect.
    grid_search = GridSearchCV(model(), parameters, cv=cv, n_jobs=-1)
    grid_search.fit(x, y)

    print("Best parameters combination found:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Only the searched parameters are printed, in alphabetical order.
    for param_name in sorted(parameters.keys()):
        print(f"{param_name}: {best_parameters[param_name]}")

    return best_parameters

def train_and_evaluate_model(model, x, y, test_size):
    """
    Fit ``model`` on a train split and print its scores and predictions.

    The data is split with a fixed ``random_state`` of 50. The function then
    prints, in order: the score on the training split, the mean and standard
    deviation of the cross-validation scores computed on the training split,
    the score on the test split, and one "Predicted/Actual" line per test
    sample. For every misclassified test sample the value returned by
    ``find_x_scaled_sample`` is printed as well. Nothing is returned.
    """
    split = train_test_split(x, y, test_size=test_size, random_state=50)
    x_train, x_test, y_train, y_test = split

    model.fit(x_train, y_train)
    training_score = model.score(x_train, y_train)
    print(f"Training Score: {training_score}")

    scores = cross_val_score(model, x_train, y_train)
    cv_mean = scores.mean()
    cv_std = scores.std()
    print(f"Average Cross Validation Score: {cv_mean}, Standard Deviation: {cv_std}")

    testing_score = model.score(x_test, y_test)
    print(f"Testing Score: {testing_score}")

    predictions = model.predict(x_test)
    for position, predicted in enumerate(predictions):
        actual = y_test[position]
        print(f"Predicted: {predicted}, Actual: {actual}")
        if predicted == actual:
            continue
        # Misclassified: report where the sample sits in the scaled data set.
        print(find_x_scaled_sample(list(x_test[position])))

In [75]:
# Hyper-parameter grid searched for the neural network classifier.
NN_parameters = dict(
    hidden_layer_sizes=[
        (5, 5), (5, 5, 5),
        (10,), (10, 10), (10, 10, 10),
        (15, 15),
        (20, 20),
        (25,), (25, 25),
        (50,),
    ],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.00001, 0.0001, 0.001, 0.01],
    max_iter=[200, 500, 1000, 2000],
)

# Run the grid search on the scaled features and keep the best estimator's parameters.
NN_best_parameters = get_best_model_parameters(MLPClassifier, x_scaled, y, 5, NN_parameters)

Best parameters combination found:
activation: logistic
alpha: 1e-05
hidden_layer_sizes: (20, 20)
max_iter: 2000
solver: lbfgs


In [76]:
# Rebuild the network from the five searched hyper-parameters only; every other
# MLPClassifier setting is left at its default.
# NOTE(review): no random_state is passed, so repeated runs may give different scores.
NN_best = MLPClassifier(
    **{
        name: NN_best_parameters[name]
        for name in ('activation', 'alpha', 'hidden_layer_sizes', 'solver', 'max_iter')
    }
)

# Evaluate with 20% of the samples held out for testing.
train_and_evaluate_model(NN_best, x_scaled, y, 0.2)

Training Score: 1.0
Average Cross Validation Score: 0.7502923976608187, Standard Deviation: 0.07396216025151989
Testing Score: 0.8260869565217391
Predicted: 2, Actual: 4
23
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 3, Actual: 2
75
Predicted: 2, Actual: 2
Predicted: 2, Actual: 3
50
Predicted: 2, Actual: 2
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 4, Actual: 2
108
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2


In [80]:
# Hyper-parameter grid searched for logistic regression.
LR_parameters = dict(
    C=[
        0.01, 0.1, 0.5,
        1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0,
        10.0, 20.0, 30.0, 40.0, 50.0,
    ],
    penalty=[None, 'l1', 'l2', 'elasticnet'],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    max_iter=[100, 150, 200, 400, 800, 1600],
    class_weight=[None, 'balanced'],
    warm_start=[False, True],
)

# Run the grid search on the scaled features and keep the best estimator's parameters.
LR_best_paramters = get_best_model_parameters(LogisticRegression, x_scaled, y, 5, LR_parameters)

Best parameters combination found:
C: 2.0
class_weight: balanced
max_iter: 100
penalty: l2
solver: liblinear
warm_start: False


In [81]:
# Rebuild the classifier from the searched hyper-parameters.
# warm_start is part of the search grid but is not passed on here, so it keeps
# the LogisticRegression default.
LR_best = LogisticRegression(
    **{
        name: LR_best_paramters[name]
        for name in ('C', 'class_weight', 'max_iter', 'penalty', 'solver')
    }
)

# Evaluate with 20% of the samples held out for testing.
train_and_evaluate_model(LR_best, x_scaled, y, 0.2)

Training Score: 0.717391304347826
Average Cross Validation Score: 0.6076023391812866, Standard Deviation: 0.16451779785280013
Testing Score: 0.782608695652174
Predicted: 3, Actual: 4
23
Predicted: 1, Actual: 2
15
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 3
50
Predicted: 2, Actual: 2
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 4, Actual: 2
108
Predicted: 2, Actual: 2
Predicted: 4, Actual: 3
97
Predicted: 2, Actual: 2


In [82]:
# Observe weights: one group per row of coef_, one line per feature column.
# NOTE(review): the "Class" label is the row position plus one, not a value
# read from LR_best.classes_ -- confirm the two agree for this data set.
for class_number, class_coefficients in enumerate(LR_best.coef_, start=1):
    print(f"Class: {class_number}")
    weights = list(class_coefficients)
    for feature_name, weight in zip(x_data.columns, weights):
        print(feature_name, weight)
    print("")

Class: 1
total-operators -0.847745696099739
distinct-operators -0.2018404531556194
total-operands -0.9302296411278365
distinct-operands -0.6784515601620673
function-calls -0.7484387800922245
loops -0.21374206955878683
assignments 0.0009674277280126289
max-cyclomatic-complexity -2.464998649268045
sum-cyclomatic-complexity -1.3015946714757634
dynamic-memory-calls -1.5511604459658814
file-access -0.5595240159825657
file-operation -0.5934253382217445
file-position -0.5595240159825657
input-output -1.053512459754973
optimization -1.7251260581174819

Class: 2
total-operators -0.3958296427224467
distinct-operators 0.6188731598078155
total-operands -0.614204165067538
distinct-operands -0.34098336111546035
function-calls -0.6661653518553474
loops -0.5852305769083024
assignments 0.23037531551105525
max-cyclomatic-complexity 0.16211635645878564
sum-cyclomatic-complexity -1.105528546481407
dynamic-memory-calls -0.24429035847999978
file-access -0.36999209700846736
file-operation -0.9501982937102644