##### Date: 6/21/2022

### Import Libraries

In [291]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import random

In [21]:
# Uncomment if reading the .xlsx file fails because openpyxl is missing:
# %pip install openpyxl

### 1. Loading Dataset

In [142]:
def load_dataset(path):
    """Load an Excel workbook into a DataFrame.

    Parameters
    ----------
    path : str, path-like, or file-like object
        Location of the workbook. It is forwarded to ``pd.read_excel``
        unchanged, so anything that function accepts can be used here.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel`` returns for ``path`` with default options.
    """
    # Pass the argument through as-is: formatting it into a string first would
    # turn an open file handle or in-memory buffer into its repr.
    return pd.read_excel(path)

In [143]:
# Read the project workbook (path is relative to the notebook's working
# directory) and preview the first rows.
dataset = load_dataset('./PROJECT DM.xlsx')
dataset.head()

Unnamed: 0,id,M,F,L,R,Label
0,1,200000,1,13,13,4
1,2,324000,1,14,14,4
2,3,400000,1,13,13,4
3,4,420000,1,14,14,4
4,5,489000,1,13,13,4


In [144]:
# Column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11656 entries, 0 to 11655
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      11656 non-null  int64
 1   M       11656 non-null  int64
 2   F       11656 non-null  int64
 3   L       11656 non-null  int64
 4   R       11656 non-null  int64
 5   Label   11656 non-null  int64
dtypes: int64(6)
memory usage: 546.5 KB


In [145]:
# Summary statistics for every numeric column.
dataset.describe()

Unnamed: 0,id,M,F,L,R,Label
count,11656.0,11656.0,11656.0,11656.0,11656.0,11656.0
mean,5828.5,3143431.0,3.950841,22.596002,9.696723,2.975635
std,3364.941703,3839804.0,4.260549,7.702204,5.824569,1.458097
min,1.0,25000.0,1.0,13.0,1.0,1.0
25%,2914.75,1172900.0,2.0,15.0,4.0,2.0
50%,5828.5,2026435.0,3.0,22.0,10.0,3.0
75%,8742.25,3712050.0,5.0,29.0,14.0,4.0
max,11656.0,120851000.0,130.0,37.0,36.0,5.0


In [146]:
# Number of missing values per column.
dataset.isna().sum()

id       0
M        0
F        0
L        0
R        0
Label    0
dtype: int64

##### So, it seems that there's no need for any preprocessing to handle missing values

### 2. Splitting the dataset into 80% of train and 20% of test

In [147]:
# We're ignoring the id attr
# Features are every column between the first one (id, skipped) and the last
# one; the last column is the target.
X = dataset.iloc[:, 1:-1]
y = dataset.iloc[:, -1]

# 80/20 split, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [148]:
# (rows, features) of the training split.
x_train.shape

(9324, 4)

In [149]:
# (rows, features) of the test split.
x_test.shape

(2332, 4)

### 3. Normalization

In [150]:
# Learn the per-feature min/max from the training split only, then apply the
# same fitted scaler to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

In [152]:
# Preview the scaled training features.
x_train.head()

Unnamed: 0,M,F,L,R
0,0.017807,0.015504,0.75,0.428571
1,0.02694,0.015504,0.666667,0.0
2,0.014732,0.023256,0.958333,0.428571
3,0.0434,0.031008,0.541667,0.457143
4,0.023591,0.054264,0.375,0.228571


### 4. Define some helper functions

In [153]:
def get_confusion_matrix(y_true, y_pred):
    """Return the confusion matrix of ``y_pred`` against ``y_true``.

    Thin wrapper around ``confusion_matrix`` called with its default options.
    """
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    return matrix

In [157]:
def get_f1_score(y_true, y_pred):
    """Return the weighted-average F1 score, rounded to three decimals."""
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return round(weighted_f1, 3)

In [331]:
def viewer(clfs, clf_names, x_train, x_test, y_train, y_test):
    """Print a score table and train/test confusion matrices for fitted models.

    Parameters
    ----------
    clfs : iterable
        Fitted estimators exposing ``predict(X)`` and ``score(X, y)``.
    clf_names : iterable of str
        Display names, paired positionally with ``clfs``.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Pair models with names once so both sections iterate over the same
    # sequence, even when a one-shot iterable is passed in.
    models = list(zip(clfs, clf_names))

    # Test-set predictions are needed by both sections; compute them once per
    # model here and reuse them for the confusion matrices below.
    test_predictions = []

    print('                train acc   -   test acc   -   test f1')
    print('------------------------------------------------------')
    for position, (clf, clf_name) in enumerate(models, start=1):
        y_pred = clf.predict(x_test)
        test_predictions.append(y_pred)
        train_acc = clf.score(x_train, y_train)
        test_acc = clf.score(x_test, y_test)
        f1score = get_f1_score(y_test, y_pred)
        # Scores are shown with two decimals, so near-perfect values can
        # display as 1.00.
        print(' |_  {}. {:3s}: {:>9.2f}      -    {:.2f}      -    {:.2f}'.format(position, clf_name, train_acc, test_acc, f1score))

    print('\n\nConfusion Matrices:\n')
    for (clf, clf_name), test_y_pred in zip(models, test_predictions):
        print('{}: ---------------------------------------------------\n'.format(clf_name))

        train_y_pred = clf.predict(x_train)
        train_confusion_matrix = get_confusion_matrix(y_train, train_y_pred)
        print(' |_ Train:\n\n{}\n'.format(train_confusion_matrix))

        test_confusion_matrix = get_confusion_matrix(y_test, test_y_pred)
        print(' |_ Test:\n\n{}\n'.format(test_confusion_matrix))

In [294]:
def random_forest(x_train, x_test, y_train, y_test):
    """Fit a 10-tree random forest (seed 0) on the training split.

    ``x_test`` and ``y_test`` are accepted but not used.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=0)
    return forest.fit(x_train, y_train)

In [293]:
def logistic_regression(x_train, x_test, y_train, y_test):
    """Fit a multinomial logistic regression (newton-cg solver, seed 0).

    ``x_test`` and ``y_test` are accepted but not used.
    """
    # NOTE(review): newer scikit-learn releases appear to deprecate the
    # `multi_class` argument; confirm against the installed version.
    settings = dict(solver='newton-cg', multi_class='multinomial', random_state=0)
    return LogisticRegression(**settings).fit(x_train, y_train)

In [288]:
def naive_bayes(x_train, x_test, y_train, y_test):
    """Fit a multinomial naive Bayes model with default settings.

    ``x_test`` and ``y_test`` are accepted but not used.
    """
    return MultinomialNB().fit(x_train, y_train)

In [266]:
def knn_classifier(x_train, x_test, y_train, y_test):
    """Fit a 5-nearest-neighbours classifier using the Minkowski metric.

    ``x_test`` and ``y_test`` are accepted but not used.
    """
    neighbours_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
    return neighbours_model.fit(x_train, y_train)

In [292]:
def poly_svm(x_train, x_test, y_train, y_test):
    """Fit a support vector classifier with a polynomial kernel (seed 0).

    ``x_test`` and ``y_test`` are accepted but not used.
    """
    classifier = svm.SVC(kernel='poly', random_state=0)
    return classifier.fit(x_train, y_train)

In [262]:
def decision_tree(x_train, x_test, y_train, y_test):
    """Fit a decision tree limited to depth 5 (seed 0) on the training split.

    ``x_test`` and ``y_test`` are accepted but not used.
    """
    tree_model = DecisionTreeClassifier(max_depth=5, random_state=0)
    return tree_model.fit(x_train, y_train)

### 5. Let's start classifying

In [295]:
# Fit every model on the same scaled training split.
lr = logistic_regression(x_train, x_test, y_train, y_test)   # Logistic Regression
p_svm = poly_svm(x_train, x_test, y_train, y_test)           # SVM (polynomial kernel)
dtree = decision_tree(x_train, x_test, y_train, y_test)      # Decision Tree
knn = knn_classifier(x_train, x_test, y_train, y_test)       # KNN
nb = naive_bayes(x_train, x_test, y_train, y_test)           # Naive Bayes
rf = random_forest(x_train, x_test, y_train, y_test)         # Random Forest

# Both lists share one ordering so each model lines up with its display name.
clfs = [lr, p_svm, dtree, knn, nb, rf]
clfs_names = ['LR', 'SVM', 'DT', 'KNN', 'NB', 'RF']

### 6. And here we go ))

In [332]:
# Print the score table and the train/test confusion matrices for all fitted models.
viewer(clfs, clfs_names, x_train, x_test, y_train, y_test)

                train acc   -   test acc   -   test f1
------------------------------------------------------
 |_  1. LR :      0.70      -    0.69      -    0.68
 |_  2. SVM:      0.77      -    0.76      -    0.76
 |_  3. DT :      0.98      -    0.98      -    0.98
 |_  4. KNN:      0.92      -    0.85      -    0.85
 |_  5. NB :      0.48      -    0.48      -    0.36
 |_  6. RF :      1.00      -    1.00      -    1.00


Confusion Matrices:

LR: ---------------------------------------------------

 |_ Train:

[[1742  270  104   55   37]
 [ 269  899  177  173   65]
 [ 358  394  519  248   63]
 [   0    0  116 1836  178]
 [   2    0    0  263 1556]]

 |_ Test:

[[426  79  17  12  18]
 [ 77 218  53  37  11]
 [ 88  90 142  55  21]
 [  0   0  27 451  55]
 [  0   0   0  81 374]]

SVM: ---------------------------------------------------

 |_ Train:

[[1783  397   18   10    0]
 [ 128 1259   66   80   50]
 [  28  882  469  203    0]
 [   0  101   51 1962   16]
 [   0    3    2  113 1703]]

In [None]:
# Note that the random forest classifier has 1 wrong prediction according to its confusion matrix,
# but the table shows 100% because the accuracies are rounded to two decimals for readability.

##### Finito