Classification Model to Predict Program Performance Score

In [192]:
import numpy as np
import pandas as pd
import math

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

  plt.style.use('seaborn')


In [193]:
x_data = pd.read_csv('x_all.csv')
x = x_data.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
    

In [194]:
def find_x_scaled_sample(sample: list):
    """
    Return index of sample from x_all data
    """
    for i in range(len(x_scaled)):
        found = True
        for j in range(len(sample)):
            if sample[j] != x_scaled[i][j]:
                found = False
        if found:
            return i - 2

In [195]:
y_data = pd.read_csv('y_all.csv')
y = y_data.to_numpy().ravel()
print(y_data)

    score
0       1
1       2
2       2
3       2
4       2
..    ...
86      2
87      2
88      1
89      1
90      2

[91 rows x 1 columns]


In [196]:
def get_best_model_parameters(model, x, y, cv, parameters):
    grid_search = GridSearchCV(model(), parameters, n_jobs=-1)
    grid_search.fit(x, y)

    print("Best parameters combination found:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print(f"{param_name}: {best_parameters[param_name]}")

    return best_parameters

def train_and_evaluate_model(model, x, y, test_size):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=50)
    model.fit(x_train, y_train)
    print(f"Training Score: {model.score(x_train, y_train)}")
    scores = cross_val_score(model, x_train, y_train)
    print(f"Average Cross Validation Score: {scores.mean()}, Standard Deviation: {scores.std()}")
    print(f"Testing Score: {model.score(x_test, y_test)}")
    predictions = model.predict(x_test)
    for i in range(len(predictions)):
        print(f"Predicted: {predictions[i]}, Actual: {y_test[i]}")
        if predictions[i] != y_test[i]:
            print(find_x_scaled_sample(list(x_test[i])))

In [197]:
NN_parameters = {
    'hidden_layer_sizes': [(5,), (5, 5), (10,), (10, 10), (25,), (25, 25), (50,), (100,)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.00001, 0.0001, 0.001, 0.01],
    'max_iter': [200, 500, 1000, 2000],
}
NN_best_parameters = get_best_model_parameters(MLPClassifier, x_scaled, y, 5, NN_parameters)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best parameters combination found:
activation: relu
alpha: 0.001
hidden_layer_sizes: (100,)
max_iter: 2000
solver: lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [198]:
NN_best = MLPClassifier(
    activation=NN_best_parameters['activation'],
    alpha=NN_best_parameters['alpha'],
    hidden_layer_sizes=NN_best_parameters['hidden_layer_sizes'],
    solver=NN_best_parameters['solver'],
    max_iter=NN_best_parameters['max_iter'] 
)

train_and_evaluate_model(NN_best, x_scaled, y, 0.2)

Training Score: 1.0
Average Cross Validation Score: 0.8066666666666666, Standard Deviation: 0.07920237290015922
Testing Score: 0.631578947368421
Predicted: 1, Actual: 2
37
Predicted: 3, Actual: 3
Predicted: 1, Actual: 2
55
Predicted: 2, Actual: 2
Predicted: 1, Actual: 2
54
Predicted: 1, Actual: 3
38
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 4, Actual: 4
Predicted: 3, Actual: 2
40
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 3, Actual: 4
15
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 1, Actual: 2
85


In [199]:
LR_parameters = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 20.0, 30.0, 40.0, 50.0, 100.0],
    'penalty': ['l1', 'l2'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    'max_iter': [100, 150, 200, 400, 800, 1600],
    'class_weight': [None, 'balanced'],
    'warm_start': [False, True],
}
LR_best_paramters = get_best_model_parameters(LogisticRegression, x_scaled, y, 5, LR_parameters)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters combination found:
C: 1.0
class_weight: None
max_iter: 100
penalty: l2
solver: liblinear
warm_start: False




In [206]:
LR_best = LogisticRegression(
    C=LR_best_paramters['C'],
    class_weight=LR_best_paramters['class_weight'],
    max_iter=LR_best_paramters['max_iter'],
    penalty=LR_best_paramters['penalty'],
    solver=LR_best_paramters['solver']
)

train_and_evaluate_model(LR_best, x_scaled, y, 0.2)

Training Score: 0.7361111111111112
Average Cross Validation Score: 0.7238095238095238, Standard Deviation: 0.12181544346910081
Testing Score: 0.7894736842105263
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 3
38
Predicted: 4, Actual: 4
Predicted: 2, Actual: 3
52
Predicted: 2, Actual: 4
82
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 3, Actual: 4
15
Predicted: 4, Actual: 4
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
