In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [2]:
def model_fit_predict(X_train, y_train):
    kf = StratifiedKFold(n_splits = 10, random_state=42, shuffle=True)
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    for train_index, test_index in kf.split(X_train, y_train):
        x_train = X_train.iloc[train_index]
        Y_train = y_train.iloc[train_index]
        x_test = X_train.iloc[test_index]
        Y_test = y_train.iloc[test_index]
        model = SVC()
        model.fit(x_train, Y_train)
        predictions = model.predict(x_test)
        accuracies.append(accuracy_score(predictions, Y_test))
        precisions.append(precision_score(predictions, Y_test, average = None))
        recalls.append(recall_score(predictions, Y_test, average = None))
        f1s.append(f1_score(predictions, Y_test, average = None))
    precision = np.sum(precisions, axis=0)/len(precisions)
    recall = np.sum(recalls, axis=0)/len(recalls)
    f1 =  np.sum(f1s, axis=0)/len(f1s)
    print('Accuracy Score : ', sum(accuracies)/len(accuracies))
    print('Precision Score : ', precision)
    print('Recall Score : ', recall)
    print('F1 Score : ', f1)
    print('Avg Precision Score : ', np.mean(np.array(precision)))
    print('Avg Recall Score : ', np.mean(np.array(recall)))
    print('Avg F1 Score : ', np.mean(np.array(f1)))

In [3]:
with open('user-knowledge-modeling.names.txt') as fp:
    line = fp.readlines(1)
    col = line[0].splitlines()
    cols = list(col[0].split(','))

options = {'header': None, 'names': cols, 'skipinitialspace': True}


user_df = pd.read_csv('user-knowledge-modeling.data.txt', **options)

In [4]:
user_df

Unnamed: 0,STG,SCG,STR,LPR,PEG,class
0,0.00,0.00,0.00,0.00,0.00,very_low
1,0.08,0.08,0.10,0.24,0.90,High
2,0.06,0.06,0.05,0.25,0.33,Low
3,0.10,0.10,0.15,0.65,0.30,Middle
4,0.08,0.08,0.08,0.98,0.24,Low
...,...,...,...,...,...,...
253,0.61,0.78,0.69,0.92,0.58,High
254,0.78,0.61,0.71,0.19,0.60,Middle
255,0.54,0.82,0.71,0.29,0.77,High
256,0.50,0.75,0.81,0.61,0.26,Middle


In [5]:
X = user_df.iloc[:,:-1]
y = user_df.iloc[:,-1]

In [7]:
model_fit_predict(X, y)

Accuracy Score :  0.9109230769230772
Precision Score :  [0.98333333 0.9875     0.85277778 0.66666667]
Recall Score :  [0.95714286 0.84439394 0.97       1.        ]
F1 Score :  [0.96783217 0.9053878  0.89994693 0.77666667]
Avg Precision Score :  0.8725694444444444
Avg Recall Score :  0.9428841991341992
Avg F1 Score :  0.8874583904227868
