In [1]:
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score

In [2]:
# Read the CSV (path is relative to the notebook's working directory) into a
# DataFrame.
dataset = pd.read_csv(filepath_or_buffer='mcoords_damta.csv')

# Uncomment to preview the first rows:
# dataset.head()

# Splitting into Train and Test Sets

In [3]:
# Column 0 is taken as the target; every remaining column is a feature.
Y = dataset.iloc[:, 0].to_numpy()
X = dataset.iloc[:, 1:].to_numpy()

In [4]:
# Hold out 33% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so a Restart & Run All produces the same split (and
# therefore the same metrics and saved models) every time.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=0
)

In [5]:
# Echo the four arrays produced by the split as the cell output.
# NOTE(review): this prints the full array reprs; showing `.shape` for each
# would be more compact -- confirm whether the raw values are needed here.
X_train, X_test, y_train, y_test

(array([[0.18344615, 0.67910498, 0.17707908, ..., 0.62113601, 0.26521495,
         0.64336568],
        [0.51421183, 0.60958028, 0.42538562, ..., 0.38816893, 0.59633201,
         0.45696187],
        [0.03980647, 0.63123298, 0.07205419, ..., 0.59668422, 0.30508322,
         0.60654014],
        ...,
        [0.40693477, 0.81959766, 0.3473897 , ..., 0.67064297, 0.45819327,
         0.70329219],
        [0.47117472, 0.86708665, 0.30130658, ..., 0.57770383, 0.55218661,
         0.660743  ],
        [0.24226327, 0.78698474, 0.18610235, ..., 0.72059119, 0.37066349,
         0.74715924]]),
 array([[0.53662503, 0.4759309 , 0.45382869, ..., 0.26321688, 0.24853703,
         0.26789537],
        [0.42137009, 0.93033004, 0.2764295 , ..., 0.56385922, 0.65386593,
         0.51956207],
        [0.59532529, 0.98340786, 0.47308478, ..., 0.43155098, 0.69099259,
         0.34906361],
        ...,
        [0.40876806, 0.85123539, 0.29861593, ..., 0.66830546, 0.43814555,
         0.71413285],
        [0.3

## Standardizing features (zero mean, unit variance) so scale-sensitive models train well

In [6]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same fitted transform to the test rows so the test set
# never influences the scaling statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Echo the standardized train and test feature arrays as the cell output.
X_train,X_test

(array([[-1.57072036, -0.21155633, -1.32163845, ...,  0.59272667,
         -1.47269232,  0.53830255],
        [ 0.25463511, -0.5991759 ,  0.21645563, ..., -0.6980636 ,
          0.24934396, -0.39938976],
        [-2.36340684, -0.47845608, -1.97219787, ...,  0.45724772,
         -1.26534994,  0.35305407],
        ...,
        [-0.33738151,  0.57172942, -0.26667732, ...,  0.86702768,
         -0.46907242,  0.83975898],
        [ 0.01713152,  0.83649373, -0.55213165, ...,  0.35208396,
          0.01975752,  0.62571802],
        [-1.24613364,  0.38990332, -1.26574532, ...,  1.14377352,
         -0.92428742,  1.06042935]]),
 array([[ 0.37832402, -1.34430835,  0.39264154, ..., -1.39037981,
         -1.55942894, -1.35047656],
        [-0.2577191 ,  1.18909322, -0.70622858, ...,  0.27537575,
          0.54855988, -0.08448353],
        [ 0.70226587,  1.48501681,  0.5119202 , ..., -0.45769862,
          0.74164403, -0.94216501],
        ...,
        [-0.32726436,  0.74811855, -0.5687984 , ...,  

# Training and Evaluating Different Models

# 1.1 K-Nearest Neighbours Model

In [8]:
# Fit a 3-nearest-neighbour classifier on the scaled training rows (fit()
# returns the estimator itself), then label the held-out rows.
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [9]:
# Echo the KNN predictions for the test rows.
y_pred

array(['C', 'Y', 'I', ..., 'T', 'H', 'E'], dtype=object)

In [10]:
# KNN evaluation: per-class precision/recall/F1 table, then overall accuracy
# and the macro-averaged F1 score.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

              precision    recall  f1-score   support

           A       0.97      0.98      0.97       339
           B       0.98      1.00      0.99       322
           C       0.99      1.00      1.00       337
           D       0.99      0.98      0.98       328
           E       0.99      0.98      0.99       328
           F       1.00      0.99      0.99       345
           G       0.99      1.00      0.99       336
           H       1.00      0.99      0.99       345
           I       1.00      1.00      1.00       321
           J       0.98      1.00      0.99       317
           K       0.97      1.00      0.99       330
           L       1.00      1.00      1.00       324
           M       0.93      0.92      0.92       356
           N       0.95      0.92      0.93       355
           O       0.96      0.99      0.98       308
           P       0.99      0.99      0.99       299
           Q       0.99      1.00      0.99       331
           R       0.99    

# 1.2 Logistic Regression Model


In [11]:
# Logistic regression with the liblinear solver and C=10.0; random_state is
# pinned so the fit is repeatable.
classifier_2 = LogisticRegression(
    C=10.0,
    solver='liblinear',
    random_state=0,
).fit(X_train, y_train)
y_pred_2 = classifier_2.predict(X_test)

In [12]:
# Echo the logistic-regression predictions for the test rows.
y_pred_2

array(['C', 'Y', 'I', ..., 'T', 'H', 'E'], dtype=object)

In [13]:
# Logistic-regression evaluation: per-class table, overall accuracy, macro F1.
print(classification_report(y_true=y_test, y_pred=y_pred_2))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_2))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred_2, average='macro'))

              precision    recall  f1-score   support

           A       0.98      1.00      0.99       339
           B       0.98      1.00      0.99       322
           C       1.00      1.00      1.00       337
           D       1.00      0.99      1.00       328
           E       0.99      0.99      0.99       328
           F       1.00      0.99      0.99       345
           G       1.00      1.00      1.00       336
           H       0.99      0.98      0.99       345
           I       0.98      0.99      0.99       321
           J       0.99      0.99      0.99       317
           K       0.99      1.00      1.00       330
           L       1.00      1.00      1.00       324
           M       0.90      0.93      0.91       356
           N       0.94      0.89      0.91       355
           O       0.99      1.00      0.99       308
           P       1.00      0.99      0.99       299
           Q       1.00      0.99      0.99       331
           R       0.96    

# 1.3 Gaussian Naive Bayes Model

In [14]:
# Gaussian Naive Bayes with no explicit class priors and var_smoothing=1e-09,
# fitted on the scaled training rows and applied to the held-out rows.
classifier_3 = GaussianNB(var_smoothing=1e-09, priors=None).fit(X_train, y_train)
y_pred_3 = classifier_3.predict(X_test)

In [15]:
# Echo the Gaussian Naive Bayes predictions for the test rows.
y_pred_3

array(['C', 'T', 'T', ..., 'T', 'G', 'E'], dtype='<U1')

In [16]:
# Gaussian Naive Bayes evaluation: per-class table, overall accuracy, macro F1.
print(classification_report(y_true=y_test, y_pred=y_pred_3))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_3))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred_3, average='macro'))

              precision    recall  f1-score   support

           A       0.37      0.53      0.44       339
           B       0.64      0.60      0.62       322
           C       0.23      0.61      0.34       337
           D       0.57      0.49      0.53       328
           E       0.59      0.34      0.44       328
           F       0.99      0.68      0.80       345
           G       0.95      0.32      0.48       336
           H       0.69      0.74      0.71       345
           I       0.60      0.45      0.52       321
           J       0.79      0.60      0.68       317
           K       0.30      0.65      0.41       330
           L       0.59      0.46      0.52       324
           M       0.66      0.23      0.35       356
           N       0.22      0.55      0.32       355
           O       0.36      0.12      0.18       308
           P       0.49      0.68      0.57       299
           Q       0.80      0.38      0.51       331
           R       0.28    

# 1.4 Support Vector Machines Model

In [17]:
# Support-vector classifier with an RBF kernel, fitted on the scaled training
# rows and applied to the held-out rows.
classifier_4 = SVC(random_state=1, kernel='rbf').fit(X_train, y_train)
y_pred_4 = classifier_4.predict(X_test)

In [18]:
# Echo the SVM predictions for the test rows.
y_pred_4

array(['C', 'Y', 'I', ..., 'T', 'H', 'E'], dtype=object)

In [19]:
# SVM evaluation: per-class table, overall accuracy, macro F1.
print(classification_report(y_true=y_test, y_pred=y_pred_4))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_4))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred_4, average='macro'))

              precision    recall  f1-score   support

           A       0.98      0.99      0.98       339
           B       0.97      1.00      0.98       322
           C       0.99      1.00      0.99       337
           D       1.00      0.95      0.98       328
           E       1.00      1.00      1.00       328
           F       1.00      0.99      0.99       345
           G       1.00      1.00      1.00       336
           H       1.00      1.00      1.00       345
           I       0.98      1.00      0.99       321
           J       1.00      0.99      0.99       317
           K       1.00      0.94      0.97       330
           L       1.00      1.00      1.00       324
           M       0.91      0.91      0.91       356
           N       0.94      0.89      0.91       355
           O       0.96      1.00      0.98       308
           P       1.00      0.98      0.99       299
           Q       0.98      1.00      0.99       331
           R       0.99    

# 1.5 Decision Tree Model

In [20]:
# Decision tree on the scaled training rows. scikit-learn permutes the
# candidate features at every split, so without a fixed random_state two runs
# on identical data can grow different trees; pinning it keeps the reported
# metrics and the saved model reproducible.
classifier_5 = DecisionTreeClassifier(random_state=0)
classifier_5.fit(X_train, y_train)

y_pred_5 = classifier_5.predict(X_test)

In [21]:
# Echo the decision-tree predictions for the test rows.
y_pred_5

array(['C', 'Y', 'I', ..., 'T', 'H', 'E'], dtype=object)

In [22]:
# Decision-tree evaluation: per-class table, overall accuracy, macro F1.
print(classification_report(y_true=y_test, y_pred=y_pred_5))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_5))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred_5, average='macro'))

              precision    recall  f1-score   support

           A       0.94      0.96      0.95       339
           B       0.96      0.96      0.96       322
           C       0.94      0.95      0.95       337
           D       0.89      0.91      0.90       328
           E       0.93      0.90      0.92       328
           F       0.97      0.96      0.97       345
           G       0.96      0.94      0.95       336
           H       0.96      0.98      0.97       345
           I       0.94      0.96      0.95       321
           J       0.88      0.94      0.91       317
           K       0.93      0.97      0.95       330
           L       0.97      0.95      0.96       324
           M       0.85      0.82      0.83       356
           N       0.82      0.84      0.83       355
           O       0.89      0.91      0.90       308
           P       0.96      0.92      0.94       299
           Q       0.97      0.95      0.96       331
           R       0.90    

# 1.6 Random Forest Classifier Model

In [23]:
# Random forest of 100 trees on the scaled training rows. Bootstrap sampling
# and per-split feature sub-sampling are both random, so random_state is fixed
# to make the fitted forest (and the metrics printed for it) reproducible.
classifier_6 = RandomForestClassifier(n_estimators=100, random_state=0)
classifier_6.fit(X_train, y_train)

y_pred_6 = classifier_6.predict(X_test)

In [24]:
# Echo the random-forest predictions for the test rows.
y_pred_6

array(['C', 'Y', 'I', ..., 'T', 'H', 'E'], dtype=object)

In [25]:
# Random-forest evaluation: per-class table, overall accuracy, macro F1.
print(classification_report(y_true=y_test, y_pred=y_pred_6))
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred_6))
print("f-1 score :", f1_score(y_true=y_test, y_pred=y_pred_6, average='macro'))

              precision    recall  f1-score   support

           A       0.98      1.00      0.99       339
           B       0.97      1.00      0.99       322
           C       0.99      1.00      1.00       337
           D       1.00      0.98      0.99       328
           E       0.99      0.99      0.99       328
           F       1.00      0.99      0.99       345
           G       0.99      1.00      1.00       336
           H       1.00      0.99      0.99       345
           I       0.99      1.00      0.99       321
           J       1.00      1.00      1.00       317
           K       1.00      0.98      0.99       330
           L       1.00      1.00      1.00       324
           M       0.94      0.94      0.94       356
           N       0.96      0.93      0.95       355
           O       0.98      1.00      0.99       308
           P       1.00      0.99      0.99       299
           Q       0.99      1.00      0.99       331
           R       0.98    

# Saving the Models

In [26]:
# Persist every fitted estimator together with the fitted scaler.
# All six classifiers were trained on features transformed by `scaler`, so any
# consumer of these files must apply the same scaler to its inputs before
# calling predict(); saving it alongside the models makes that possible.
# joblib files are pickles: only load them from a trusted source.

# joblib.dump does not create missing directories, so make sure the target
# folder exists (no-op when it is already there).
os.makedirs('Trained_models', exist_ok=True)

joblib.dump(scaler, 'Trained_models/scaler.pkl')

joblib.dump(classifier, 'Trained_models/knn.pkl')

joblib.dump(classifier_2, 'Trained_models/logisticreg.pkl')

joblib.dump(classifier_3, 'Trained_models/gnb.pkl')

joblib.dump(classifier_4, 'Trained_models/svm.pkl')

joblib.dump(classifier_5, 'Trained_models/decisiontree.pkl')

# Kept as the last expression so the cell output is unchanged.
joblib.dump(classifier_6, 'Trained_models/randomforest.pkl')

['Trained_models/randomforest.pkl']