In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import precision_recall_curve

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings

warnings.filterwarnings('ignore')

In [2]:
def load_data_from_csv(csv_filepath):
    df = pd.read_csv(csv_filepath)
    X, y = df.loc[:, df.columns != 'class'].values, df['class'].values
    return X, y
    
 
X, y = load_data_from_csv('data/mushrooms.csv')
print(X.shape, y.shape)

(8124, 22) (8124,)


In [3]:
def encode_labeled_data(data: np.array):
    new_data = data.copy()
    for i in range(data.shape[1]):
        new_data[:, i] = LabelEncoder().fit_transform(data[:, i])
    return new_data

encode_labeled_data(y.reshape(-1, 1)).shape

(8124, 1)

In [4]:
def one_hot_encode_labeled_data(data: np.array):
    new_data = None
    for i in range(data.shape[1]):
        encoder = OneHotEncoder()
        new_values = encoder.fit_transform(data[:, i].reshape(-1, 1))
        if new_data is None:
            new_data = new_values
        else:
            new_data = scipy.sparse.hstack([new_data, new_values])
    return new_data

one_hot_encode_labeled_data(X)

<8124x117 sparse matrix of type '<class 'numpy.float64'>'
	with 178728 stored elements in COOrdinate format>

In [5]:
def prepare_data(X: np.array, y: np.array, one_hot_encode=True):
    if one_hot_encode:
        new_X = one_hot_encode_labeled_data(X)
    else:
        new_X = encode_labeled_data(X)
    new_y = encode_labeled_data(y.reshape(-1, 1))
    
    new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y)
    
    new_Xs = new_X_train.astype(int), new_X_test.astype(int)
    new_ys = new_y_train.astype(int).ravel(), new_y_test.astype(int).ravel()
    
    return new_Xs, new_ys 

(X_train, X_test), (y_train, y_test) = prepare_data(X[:, :5], y, one_hot_encode=False)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6093, 5), (2031, 5), (6093,), (2031,))

In [6]:
y_train, y_train

(array([0, 0, 1, ..., 0, 1, 0]), array([0, 0, 1, ..., 0, 1, 0]))

In [19]:
def score_model(model):
    model.fit(X_train, y_train)

    y_train_predict = cross_val_predict(model, X_train, y_train, cv=3)
    y_test_predict = cross_val_predict(model, X_test, y_test, cv=3)

    print('======== TRAINING SET ========')

    print('Train confusion matrix:')
    print(confusion_matrix(y_train, y_train_predict))
    print()

    print(f'Precision: {precision_score(y_train, y_train_predict):.3f}')
    print(f'Recall:    {precision_score(y_train, y_train_predict):.3f}')
    print(f'F1 score:  {precision_score(y_train, y_train_predict):.3f}')
    print()
    print()

    print('======== TEST SET ========')

    print('Test confusion matrix:')
    print(confusion_matrix(y_test, y_test_predict))
    print()

    print(f'Precision: {precision_score(y_test, y_test_predict):.3f}')
    print(f'Recall:    {precision_score(y_test, y_test_predict):.3f}')
    print(f'F1 score:  {precision_score(y_test, y_test_predict):.3f}')


score_model(DummyClassifier())

Train confusion matrix:
[[1605 1550]
 [1526 1412]]

Precision: 0.477
Recall:    0.477
F1 score:  0.477


Test confusion matrix:
[[522 531]
 [506 472]]

Precision: 0.471
Recall:    0.471
F1 score:  0.471


In [21]:
model = LinearSVC()
score_model(model)

Train confusion matrix:
[[2291  864]
 [ 481 2457]]

Precision: 0.740
Recall:    0.740
F1 score:  0.740


Test confusion matrix:
[[775 278]
 [156 822]]

Precision: 0.747
Recall:    0.747
F1 score:  0.747


In [22]:
model = KNeighborsClassifier()
score_model(model)

Train confusion matrix:
[[3145   10]
 [  53 2885]]

Precision: 0.997
Recall:    0.997
F1 score:  0.997


Test confusion matrix:
[[1030   23]
 [  31  947]]

Precision: 0.976
Recall:    0.976
F1 score:  0.976


In [24]:
model = SVC()
score_model(model)

Train confusion matrix:
[[3139   16]
 [ 190 2748]]

Precision: 0.994
Recall:    0.994
F1 score:  0.994


Test confusion matrix:
[[1038   15]
 [ 107  871]]

Precision: 0.983
Recall:    0.983
F1 score:  0.983


In [25]:
model = RandomForestClassifier()
score_model(model)

Train confusion matrix:
[[3151    4]
 [  25 2913]]

Precision: 0.999
Recall:    0.999
F1 score:  0.999


Test confusion matrix:
[[1045    8]
 [  15  963]]

Precision: 0.992
Recall:    0.992
F1 score:  0.992
