In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import the data

In [26]:
df = pd.read_csv("data.csv")
df.head()


Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,AZ,BA,BB,BC,BD,BE,BF,BG,BH,Categories
0,0.0294,0.0123,0.0117,0.0113,0.0497,0.0998,0.1326,0.1117,0.2984,0.3473,...,0.0056,0.0104,0.0079,0.0014,0.0054,0.0015,0.0006,0.0081,0.0043,BOAT
1,0.0093,0.0269,0.0217,0.0339,0.0305,0.1172,0.145,0.0638,0.074,0.136,...,0.0212,0.0091,0.0056,0.0086,0.0092,0.007,0.0116,0.006,0.011,CAR
2,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,CAR
3,0.0197,0.0394,0.0384,0.0076,0.0251,0.0629,0.0747,0.0578,0.1357,0.1695,...,0.0134,0.0097,0.0042,0.0058,0.0072,0.0041,0.0045,0.0047,0.0054,BOAT
4,0.0201,0.0178,0.0274,0.0232,0.0724,0.0833,0.1232,0.1298,0.2085,0.272,...,0.0131,0.0049,0.0104,0.0102,0.0092,0.0083,0.002,0.0048,0.0036,BOAT


In [27]:
df.shape

(180, 61)

In [28]:
df.isna().sum()

A             0
B             0
C             0
D             0
E             0
             ..
BE            0
BF            0
BG            0
BH            0
Categories    0
Length: 61, dtype: int64

In [29]:
df.duplicated().sum()

0

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 61 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   A           180 non-null    float64
 1   B           180 non-null    float64
 2   C           180 non-null    float64
 3   D           180 non-null    float64
 4   E           180 non-null    float64
 5   F           180 non-null    float64
 6   G           180 non-null    float64
 7   H           180 non-null    float64
 8   I           180 non-null    float64
 9   J           180 non-null    float64
 10  K           180 non-null    float64
 11  L           180 non-null    float64
 12  M           180 non-null    float64
 13  N           180 non-null    float64
 14  O           180 non-null    float64
 15  P           180 non-null    float64
 16  Q           180 non-null    float64
 17  R           180 non-null    float64
 18  S           180 non-null    float64
 19  T           180 non-null    f

Splitting the numerical and categorical 

In [58]:
X = df.drop(columns=['Categories'], axis=1)
Y = df['Categories']
print(Y.name)
print(X.columns)

Categories
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB',
       'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN',
       'AO', 'AP', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AW', 'AX', 'AY', 'AZ',
       'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH'],
      dtype='object')


Preprocessing transformers 

In [104]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

y_processor = LabelEncoder()
y = y_processor.fit_transform(Y)

x_processor = StandardScaler()
x = x_processor.fit_transform(X)


In [107]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=9)
X_train.shape, y_train.shape

((144, 60), (144,))

Evaluation metrics 

In [124]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score

def evaluation(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    return f1, accuracy, precision, recall

Model training

In [109]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier



In [125]:
models = {
    "K-Neighbours": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Support Vector": SVC(),
    "Random Forest": RandomForestClassifier()
}

model_list = list(models.values())

for i in range(len(model_list)):
    model = model_list[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    f1_train, accuracy_train, precision_train, recall_train = evaluation(y_train, y_train_pred)
    f1_test, accuracy_test, precision_test, recall_test = evaluation(y_test, y_test_pred)


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- F1 score: {:.4f}".format(f1_train))
    print("- Accuracy: {:.4f}".format(accuracy_train))
    print("- Precision: {:.4f}".format(precision_train))
    print("- Recall: {:.4f}".format(recall_train))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- F1 score: {:.4f}".format(f1_test))
    print("- Accuracy: {:.4f}".format(accuracy_test))
    print("- Precision: {:.4f}".format(precision_test))
    print("- Recall: {:.4f}".format(recall_train))

    print('='*35)
    print('\n')

K-Neighbours
Model performance for Training set
- F1 score: 0.8750
- Accuracy: 0.8750
- Precision: 0.8750
- Recall: 0.8750
----------------------------------
Model performance for Test set
- F1 score: 0.8333
- Accuracy: 0.8333
- Precision: 0.8333
- Recall: 0.8750


Decision Tree
Model performance for Training set
- F1 score: 1.0000
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- F1 score: 0.5833
- Accuracy: 0.5833
- Precision: 0.5833
- Recall: 1.0000


Support Vector
Model performance for Training set
- F1 score: 0.9861
- Accuracy: 0.9861
- Precision: 0.9861
- Recall: 0.9861
----------------------------------
Model performance for Test set
- F1 score: 0.8333
- Accuracy: 0.8333
- Precision: 0.8333
- Recall: 0.9861


Random Forest
Model performance for Training set
- F1 score: 1.0000
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- F1 scor