In [1]:
#! jupyter nbconvert --to html dataset-preprocess.ipynb

In [2]:
""" 
dependencies:
  - python=3.8.17
  - numpy=1.24.0
  - matplotlib=3.7.1
  - pandas=2.0.2 
"""

' \ndependencies:\n  - python=3.8.17\n  - numpy=1.24.0\n  - matplotlib=3.7.1\n  - pandas=2.0.2 \n'

In [3]:
import os
import random
import datetime
from itertools import product 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
# import random
# random.seed(42)
# np.random.seed(42)
# np.random.RandomState(42)
# os.environ['TF_DETERMINISTIC_OPS'] = '1' 

# Shell command string for an audible "done" notification after long-running cells.
# NOTE(review): the path is absolute and user-specific, and `afplay` is presumably
# the macOS command-line audio player -- adjust both before running on another machine.
finish_sound = "afplay /Users/mehmet/Documents/vs-code/winsquare.mp3"
# play sound when finished
# (kept disabled; uncomment the call below to actually run the command)
# os.system(finish_sound)

In [4]:
# Load the already-preprocessed train / validation / test splits from .npy files.
filename = 'original-numpy'
# filename = 'pca-numpy'

# Feature matrices, one per split.
X_train, X_val, X_test = (
    np.load(f'dataset/{filename}/X_{split}.npy')
    for split in ('train', 'val', 'test')
)

# Labels are stored one-hot encoded; argmax over axis 1 turns each row into
# its class index.
y_train, y_val, y_test = (
    np.argmax(np.load(f'dataset/{filename}/y_{split}.npy'), axis=1)
    for split in ('train', 'val', 'test')
)

# Sanity check: one line of (features, labels) shapes per split.
print(
    X_train.shape, y_train.shape, '\n',
    X_val.shape, y_val.shape, '\n',
    X_test.shape, y_test.shape,
)

(5120, 10859) (5120,) 
 (640, 10859) (640,) 
 (640, 10859) (640,)


In [5]:
class EvaluateModel():
    """Evaluate classifier predictions, similar to sklearn's classification
    report and confusion matrix.

    Constructing an instance builds the report once and, depending on the
    flags, writes it below ``model-comparison/<now>/<str1>/`` and/or prints it.

    Parameters
    ----------
    y_true : array-like, 2-D
        Ground truth with one row per sample; it is reduced to class indices
        with ``np.argmax(axis=1)`` (e.g. one-hot encoded labels).
    y_pred : array-like, 1-D
        Predicted class indices (written to ``pred.csv`` with ``%d``).
    str1 : str
        Name of the sub-directory for this model inside the run directory.
    now : str
        Name of the run directory (presumably a timestamp -- confirm with caller).
    save : bool, default True
        Write ``pred.csv`` and ``report.txt`` to disk.
    print_result : bool, default True
        Print the assembled report to stdout.

    Attributes
    ----------
    y_true : np.ndarray
        Class indices derived from the ``y_true`` argument.
    y_pred : array-like
        The predictions exactly as passed in.
    """

    def __init__(self, y_true, y_pred, str1, now, save=True, print_result=True):
        self.y_true = np.argmax(y_true, axis=1)
        self.y_pred = y_pred
        out_dir = 'model-comparison/{}/{}'.format(now, str1)
        if save:
            # makedirs creates the missing run directory too and does not fail
            # when the notebook cell is simply re-run.
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(out_dir + '/pred.csv', y_pred, delimiter=',', fmt='%d')

        numeric = self._numeric_report()
        result = self._format_report(numeric)
        # Take class-0 precision from the numeric report instead of slicing
        # the formatted string, which truncated e.g. '66.67 %' to 66.6.
        if 0 in numeric.index:
            precision0 = '{0:.2f}'.format(numeric.loc[0, 'precision'] * 100) + ' %'
        else:
            precision0 = 'n/a'
        cm = self.confusion_matrix()
        res_total = '\n'.join([
            'Accuracy is: ' + str(result.loc['accuracy', 'f1-score']),
            'F1 Score is: ' + str(result.loc['weighted avg', 'f1-score']),
            'Precision of Class 0 is: ' + precision0,
            '\nClassification Report:',
            str(result),
            '\nConfusion Matrix:',
            str(cm),
            '\n',
        ])
        # write to file
        if save:
            with open(out_dir + '/report.txt', 'w') as f:
                f.write(res_total)
        if print_result:
            print(res_total)

    def accuracy_score(self, y_t, y_p):
        """Return the fraction of positions where ``y_t`` equals ``y_p``.

        Raises ``ZeroDivisionError`` for empty input.
        """
        y_t = np.asarray(y_t)
        y_p = np.asarray(y_p)
        return int(np.count_nonzero(y_t == y_p)) / len(y_t)

    def scores(self, y_t, y_p, class_label= 1):
        """Return ``(precision, recall, f1)`` for one class, one-vs-rest.

        A score whose denominator is zero (class never predicted, or never
        present) is reported as 0.0 rather than NaN, so the macro / weighted
        averages stay finite.
        """
        true = np.asarray(y_t) == class_label
        pred = np.asarray(y_p) == class_label
        tp = int(np.count_nonzero(true & pred))
        fp = int(np.count_nonzero(~true & pred))
        fn = int(np.count_nonzero(true & ~pred))
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        return precision, recall, f1

    def confusion_matrix(self,labels=None):
        """Return the confusion matrix as a DataFrame (rows: true, columns: predicted).

        ``labels`` fixes the row/column order; when it is ``None`` or empty,
        the sorted union of true and predicted classes is used. A class that
        occurs in the data but not in ``labels`` raises ``KeyError``.
        """
        # Explicit None/length test: truth-testing a NumPy array of labels raises.
        if labels is None or len(labels) == 0:
            labels = sorted(set(self.y_true) | set(self.y_pred))
        else:
            labels = list(labels)
        indexes = {v: i for i, v in enumerate(labels)}
        matrix = np.zeros((len(indexes), len(indexes)), dtype=int)
        for t, p in zip(self.y_true, self.y_pred):
            matrix[indexes[t], indexes[p]] += 1
        return pd.DataFrame(matrix, index=labels, columns=labels)

    def _numeric_report(self):
        """Build the report with raw numbers (fractions in [0, 1], integer supports)."""
        output_dict = {}
        support_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        # Only classes present in the ground truth get a row.
        for i in np.unique(self.y_true):
            support = int(np.count_nonzero(self.y_true == i))
            precision, recall, f1 = self.scores(self.y_true, self.y_pred, class_label=i)
            output_dict[i] = {'precision':precision, 'recall':recall, 'f1-score':f1, 'support':support}
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)
            support_list.append(support)
        support = np.sum(support_list)
        # Accuracy is stored in the 'f1-score' column; precision/recall are placeholders.
        output_dict['accuracy'] = {'precision':0, 'recall':0, 'f1-score':self.accuracy_score(self.y_true, self.y_pred), 'support':support}
        # macro avg: unweighted mean over classes
        output_dict['macro avg'] = {'precision':np.mean(precision_list), 'recall':np.mean(recall_list), 'f1-score':np.mean(f1_list), 'support':support}
        # weighted avg: mean over classes weighted by class support
        output_dict['weighted avg'] = {'precision':np.average(precision_list, weights=support_list),
                                       'recall':np.average(recall_list, weights=support_list),
                                       'f1-score':np.average(f1_list, weights=support_list),
                                       'support':support}
        report_d = pd.DataFrame(output_dict).T
        return report_d[['precision', 'recall', 'f1-score', 'support']]

    @staticmethod
    def _format_report(report_d):
        """Render a numeric report as display strings ('12.34 %') with integer supports."""
        annot = report_d.copy()
        # Replace whole columns so float columns are never assigned strings in place.
        for col in ('precision', 'recall', 'f1-score'):
            annot[col] = (report_d[col].astype(float) * 100).map('{:.2f}'.format) + ' %'
        annot['support'] = annot['support'].astype(int)
        annot.loc['accuracy', ['precision', 'recall']] = ''
        return annot

    def classification_report(self):
        """Return the formatted report as a DataFrame.

        Rows: one per class in ``y_true``, then 'accuracy', 'macro avg' and
        'weighted avg'. Columns: 'precision', 'recall', 'f1-score' (strings
        such as '95.12 %') and 'support' (int). The accuracy row carries its
        value in 'f1-score' and empty strings for precision and recall.
        """
        return self._format_report(self._numeric_report())

In [6]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# One-vs-rest, L2-penalised logistic regression fitted with the liblinear solver.
# verbose=1 makes the solver print its per-iteration log below the cell.
model1 = linear_model.LogisticRegression(
    max_iter=100,
    penalty='l2',
    solver='liblinear',
    multi_class='ovr',
    verbose=1,
)
# NOTE: scikit-learn's fit() returns the fitted estimator, so `history` is the
# model object, not a training history.
history = model1.fit(X_train, y_train)
y_pred = model1.predict(X_val)

# The `poly_*` names below are kept unchanged so later cells that read them
# keep working; they hold logistic-regression results here.

# Validation results -- reuse the predictions computed above instead of
# running predict() on X_val a second time.
poly_pred = y_pred
poly_accuracy = accuracy_score(y_val, poly_pred)
poly_f1 = f1_score(y_val, poly_pred, average='weighted')
print('Val. Accuracy: ', "%.2f" % (poly_accuracy*100))
cm = confusion_matrix(y_val, poly_pred)
print(cm)

# Test results
poly_pred = model1.predict(X_test)
poly_accuracy = accuracy_score(y_test, poly_pred)
poly_f1 = f1_score(y_test, poly_pred, average='weighted')
print('Test Accuracy: ', "%.2f" % (poly_accuracy*100))
cm = confusion_matrix(y_test, poly_pred)
print(cm)

[LibLinear]iter  1 act 2.263e+03 pre 1.953e+03 delta 8.243e-01 f 3.549e+03 |g| 2.522e+04 CG  11
cg reaches trust region boundary
iter  2 act 7.600e+02 pre 6.043e+02 delta 1.115e+00 f 1.286e+03 |g| 6.864e+03 CG  12
iter  3 act 3.219e+02 pre 2.545e+02 delta 1.213e+00 f 5.257e+02 |g| 2.839e+03 CG  15
iter  4 act 1.215e+02 pre 9.614e+01 delta 1.213e+00 f 2.038e+02 |g| 1.137e+03 CG  15
iter  5 act 4.353e+01 pre 3.462e+01 delta 1.213e+00 f 8.231e+01 |g| 4.364e+02 CG  14
iter  6 act 1.434e+01 pre 1.160e+01 delta 1.213e+00 f 3.878e+01 |g| 1.661e+02 CG  14
iter  7 act 3.642e+00 pre 3.038e+00 delta 1.213e+00 f 2.444e+01 |g| 6.196e+01 CG  12
iter  8 act 6.704e-01 pre 5.805e-01 delta 1.213e+00 f 2.079e+01 |g| 2.200e+01 CG  13
iter  9 act 1.335e-01 pre 1.259e-01 delta 1.213e+00 f 2.012e+01 |g| 6.132e+00 CG  23
iter  1 act 2.547e+03 pre 2.208e+03 delta 1.536e+00 f 3.549e+03 |g| 1.296e+04 CG  17
iter  2 act 6.326e+02 pre 5.050e+02 delta 1.576e+00 f 1.002e+03 |g| 3.659e+03 CG  22
iter  3 act 2.209e+02

In [14]:
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Linear-kernel support vector classifier trained on the training split.
# NOTE(review): per the scikit-learn docs `degree` only applies to
# kernel='poly', so it should have no effect with kernel='linear' -- confirm.
poly = svm.SVC(kernel='linear', degree=3, C=1, verbose=1)
poly = poly.fit(X_train, y_train)

# Evaluate on the validation split first, then on the test split; after the
# loop the poly_* / cm variables hold the test-split values.
for _caption, _features, _targets in (
    ('Val. Accuracy: ', X_val, y_val),
    ('Test Accuracy: ', X_test, y_test),
):
    poly_pred = poly.predict(_features)
    poly_accuracy = accuracy_score(_targets, poly_pred)
    poly_f1 = f1_score(_targets, poly_pred, average='weighted')
    print(_caption, "%.2f" % (poly_accuracy*100))
    cm = confusion_matrix(_targets, poly_pred)
    print(cm)

[LibSVM]................*........*
optimization finished, #iter = 24903
obj = -0.342654, rho = -0.205053
nSV = 1816, nBSV = 0
....*..*
optimization finished, #iter = 6921
obj = -0.073576, rho = -0.913113
nSV = 1023, nBSV = 0
*
optimization finished, #iter = 897
obj = -0.004285, rho = -1.208193
nSV = 287, nBSV = 0
.....*...*
optimization finished, #iter = 8865
obj = -0.091238, rho = -0.929936
nSV = 1128, nBSV = 0
*
optimization finished, #iter = 830
obj = -0.004497, rho = -1.266181
nSV = 276, nBSV = 0
*.*
optimization finished, #iter = 772
obj = -0.003949, rho = -1.026490
nSV = 243, nBSV = 0
Total nSV = 3164
