Classification Model to Predict Program Performance Score

In [11]:
import sys
import os
import warnings

# Silence all warnings, but only when the interpreter was started without any
# explicit -W options (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    # The setting is exported through the environment as well as applied
    # in-process.
    # NOTE(review): presumably so worker processes started later inherit it
    # — confirm.
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so use the new name when this
# Matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [12]:
# Load the feature table. The DataFrame itself is kept because later cells
# read its column names.
x_data = pd.read_csv('x_all.csv')
# Plain NumPy view of the features, used as input to the scaler.
x = x_data.to_numpy()
# Min-max scale the features; the fitted scaler stays available as
# `min_max_scaler`.
# NOTE(review): the scaler is fitted on every row before any train/test split
# is made (the split happens later, inside train_and_evaluate_model), so the
# held-out rows influence the scaling — confirm this is acceptable.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
    

In [13]:
def find_x_scaled_sample(sample: list, data=None, offset: int = -2):
    """
    Locate a sample among the scaled feature rows and return its reported index.

    Rows are scanned in order. The first row whose leading ``len(sample)``
    values are all exactly equal (``==``) to the values in ``sample`` is the
    match.

    Parameters
    ----------
    sample : list
        Feature values to look for.
    data : sequence of rows, optional
        Rows to search. Defaults to the module-level ``x_scaled``.
    offset : int, optional
        Added to the matching row position before it is returned. The default
        of ``-2`` is the constant this function has always applied.
        NOTE(review): the reason the reported index is shifted by two is not
        visible in this notebook — confirm it against the numbering used for
        the samples. Pass ``offset=0`` to get the plain row position.

    Returns
    -------
    int or None
        ``row position + offset`` of the first matching row, or ``None`` when
        no row matches.
    """
    rows = x_scaled if data is None else data
    for position, row in enumerate(rows):
        # all() stops at the first differing feature instead of comparing the
        # remaining ones after a mismatch is already known.
        if all(value == row[column] for column, value in enumerate(sample)):
            return position + offset
    return None

In [14]:
y_data = pd.read_csv('y_all.csv')
y = y_data.to_numpy().ravel()

In [15]:
def get_best_model_parameters(model, x, y, cv, parameters):
    """
    Grid-search ``parameters`` for an estimator and report the best combination.

    Parameters
    ----------
    model : callable
        Called with no arguments to create the estimator that is searched
        (the callers in this notebook pass an estimator class).
    x, y :
        Data passed to the grid search's ``fit``.
    cv :
        Cross-validation setting forwarded to ``GridSearchCV``.
    parameters : dict
        Maps each parameter name to the list of values to try.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator found by the search.
    """
    # cv was previously accepted but never forwarded, so the search always ran
    # with GridSearchCV's default splitting regardless of the argument.
    grid_search = GridSearchCV(model(), parameters, cv=cv, n_jobs=-1)
    grid_search.fit(x, y)

    best_parameters = grid_search.best_estimator_.get_params()
    print("Best parameters combination found:")
    # Only the searched parameters are printed, in alphabetical order.
    for param_name in sorted(parameters):
        print(f"{param_name}: {best_parameters[param_name]}")

    return best_parameters

def train_and_evaluate_model(model, x, y, test_size):
    """
    Fit ``model`` on a train split and print its scores and test predictions.

    The data is split with a fixed ``random_state`` of 50. The function prints
    the training score, the mean and standard deviation of the
    cross-validation scores on the training part, the test score, and one
    "Predicted/Actual" line per test sample. For each wrong prediction it also
    prints the result of ``find_x_scaled_sample`` for that test row.

    Parameters
    ----------
    model :
        Estimator providing ``fit``, ``score`` and ``predict``; it is fitted
        in place.
    x, y :
        Features and labels to split.
    test_size :
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
    """
    split = train_test_split(x, y, test_size=test_size, random_state=50)
    x_train, x_test, y_train, y_test = split

    model.fit(x_train, y_train)
    training_score = model.score(x_train, y_train)
    print(f"Training Score: {training_score}")

    cv_scores = cross_val_score(model, x_train, y_train)
    print(f"Average Cross Validation Score: {cv_scores.mean()}, Standard Deviation: {cv_scores.std()}")

    testing_score = model.score(x_test, y_test)
    print(f"Testing Score: {testing_score}")

    predictions = model.predict(x_test)
    for row, predicted in enumerate(predictions):
        actual = y_test[row]
        print(f"Predicted: {predicted}, Actual: {actual}")
        if predicted != actual:
            # Report which sample was misclassified.
            print(find_x_scaled_sample(list(x_test[row])))

In [16]:
# Hyper-parameter grid explored for the MLP classifier.
NN_parameters = dict(
    hidden_layer_sizes=[(5,), (5, 5), (10,), (10, 10), (25,), (25, 25), (50,), (100,)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    max_iter=[200, 500, 1000, 2000],
)
# Search the grid on the scaled features and keep the winning settings.
NN_best_parameters = get_best_model_parameters(
    model=MLPClassifier,
    x=x_scaled,
    y=y,
    cv=5,
    parameters=NN_parameters,
)

Best parameters combination found:
activation: logistic
alpha: 0.001
hidden_layer_sizes: (25, 25)
max_iter: 2000
solver: adam


In [17]:
# Build a fresh network from the five searched settings of the grid-search
# winner; every other constructor argument keeps its default.
NN_best = MLPClassifier(**{
    name: NN_best_parameters[name]
    for name in ('activation', 'alpha', 'hidden_layer_sizes', 'solver', 'max_iter')
})

# Fit on a train split and print the scores and per-sample test predictions.
train_and_evaluate_model(NN_best, x_scaled, y, test_size=0.2)

Training Score: 1.0
Average Cross Validation Score: 0.7849999999999999, Standard Deviation: 0.09300537618869138
Testing Score: 0.8
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 2, Actual: 1
50
Predicted: 3, Actual: 2
48
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 3, Actual: 3
Predicted: 3, Actual: 4
23
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 1, Actual: 2
93
Predicted: 3, Actual: 3


In [18]:
# Hyper-parameter grid explored for logistic regression.
LR_parameters = dict(
    C=[0.01, 0.1, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0],
    penalty=['l1', 'l2'],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    max_iter=[100, 150, 200, 400, 800, 1600],
    class_weight=[None, 'balanced'],
    warm_start=[False, True],
)
# The spelling of `LR_best_paramters` is kept as-is because the cell that
# builds LR_best reads the result under this exact name.
LR_best_paramters = get_best_model_parameters(
    model=LogisticRegression,
    x=x_scaled,
    y=y,
    cv=5,
    parameters=LR_parameters,
)

Best parameters combination found:
C: 3.0
class_weight: balanced
max_iter: 100
penalty: l2
solver: liblinear
warm_start: False


In [19]:
# Build a fresh logistic regression from the grid-search winner. `warm_start`
# was part of the search grid but is deliberately not copied here, exactly as
# before, so it keeps the constructor default.
LR_best = LogisticRegression(**{
    name: LR_best_paramters[name]
    for name in ('C', 'class_weight', 'max_iter', 'penalty', 'solver')
})

# Fit on a train split and print the scores and per-sample test predictions.
train_and_evaluate_model(LR_best, x_scaled, y, test_size=0.2)

Training Score: 0.7974683544303798
Average Cross Validation Score: 0.6966666666666667, Standard Deviation: 0.04357305487467123
Testing Score: 0.7
Predicted: 4, Actual: 2
83
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 2, Actual: 1
50
Predicted: 3, Actual: 2
48
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 1, Actual: 1
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 3
1
Predicted: 3, Actual: 4
23
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 1, Actual: 2
93
Predicted: 3, Actual: 3


In [20]:
# Observe weights
# Label each coefficient row with the class it belongs to, taken from the
# fitted model's classes_ instead of assuming the labels are numbered 1..n.
class_labels = LR_best.classes_
if len(LR_best.coef_) != len(class_labels):
    # A binary fit has a single coefficient row; it belongs to classes_[1].
    class_labels = class_labels[1:]

for class_label, weights in zip(class_labels, LR_best.coef_):
    print(f"Class: {class_label}")
    # Pair every feature name with its weight for this class.
    for column, weight in zip(x_data.columns, weights):
        print(column, weight)
    print("")

Class: 1
total-operators -0.65015400612366
distinct-operators 0.34487115394580625
total-operands -0.8486804580405449
distinct-operands -0.6503561006441376
function-calls -0.3547253698783957
loops -0.7102545809554834
assignments 0.13244209165684567
max-cyclomatic-complexity -3.8809481003874615
sum-cyclomatic-complexity -1.7559071940403186
dynamic-memory-calls -1.837204061900299
file-access -0.7036209577455077
file-operation -0.72981912931579
file-position -0.7036209577455077
input-output -1.4802148381687912
optimization -2.0018874979095616

Class: 2
total-operators -0.04758663848357379
distinct-operators 0.3357394221740785
total-operands -0.21973769828304182
distinct-operands 0.16210244215277245
function-calls -0.8514444153000937
loops -1.3310715121452534
assignments 0.46833709406302587
max-cyclomatic-complexity 0.5954147592655599
sum-cyclomatic-complexity -0.9303897751079943
dynamic-memory-calls -0.0298472648230249
file-access -0.25692616826182807
file-operation -1.6047826158685117
fil