In [1]:
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

from data_prepare import Data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix




## Data Preparation
Extract the data from the input file and convert the raw single-character attribute codes into numeric values.

In [2]:
# One lookup table per CSV column, in column order: att_maps[i] translates the
# single-character code found in column i into an integer. Within a column the
# codes are numbered from 0 in the order they are listed below.
# NOTE(review): the column names in the trailing comments come from the UCI
# Mushroom dataset documentation (agaricus-lepiota.names), not from this
# notebook -- confirm against that file before relying on them.
att_maps = [
    {code: index for index, code in enumerate(codes)}
    for codes in (
        "ep",            # class (edible / poisonous)
        "bcxfks",        # cap-shape
        "fgys",          # cap-surface
        "nbcgrpuewy",    # cap-color
        "tf",            # bruises
        "alcyfmnps",     # odor
        "adfn",          # gill-attachment
        "cwd",           # gill-spacing
        "bn",            # gill-size
        "knbhgropuewy",  # gill-color
        "et",            # stalk-shape
        "bcuezr?",       # stalk-root ('?' is kept as its own code)
        "fyks",          # stalk-surface-above-ring
        "fyks",          # stalk-surface-below-ring
        "nbcgopewy",     # stalk-color-above-ring
        "nbcgopewy",     # stalk-color-below-ring
        "pu",            # veil-type
        "nowy",          # veil-color
        "not",           # ring-number
        "ceflnpsz",      # ring-type
        "knbhrouwy",     # spore-print-color
        "acnsvy",        # population
        "glmpuwd",       # habitat
    )
]

class DataUtils:
    """Helpers for turning the raw comma-separated data file into numeric vectors."""

    @staticmethod
    def load_data(file_name, maps=None):
        """Read a comma-separated file of attribute codes and encode it numerically.

        Every non-blank line is split on commas, and the code found in column
        ``i`` is looked up in ``maps[i]``. The first encoded value of a row
        becomes its label; the remaining values become its feature vector.

        Args:
            file_name: Path of the text file to read.
            maps: Optional sequence of per-column ``{code: int}`` dictionaries.
                When omitted, the module-level ``att_maps`` is used.

        Returns:
            A ``(labels, features)`` tuple: ``labels`` is a list with one
            encoded value per row and ``features`` is a list of lists holding
            the encoded remaining columns of each row.

        Raises:
            KeyError: If a code is not present in its column's map.
            IndexError: If a row has more columns than there are maps.
        """
        if maps is None:
            maps = att_maps

        labels = []
        features = []

        # The context manager guarantees the handle is closed, even when a
        # lookup below raises.
        with open(file_name, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # carry no record and would otherwise fail the lookup.
                    continue

                att_vec = [
                    maps[att_idx][att_code]
                    for att_idx, att_code in enumerate(line.split(','))
                ]

                labels.append(att_vec[0])
                features.append(att_vec[1:])

        return labels, features

## Generate feature vectors

In [3]:
# Parse the raw data file into per-row labels and per-row feature lists.
labels, features = Data.load_data(file_name="data/agaricus-lepiota.data")

# Re-encode the labels as consecutive integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
global_labels = label_encoder.transform(labels)

# Rescale every feature column to the [0, 1] range so that columns with many
# distinct codes do not dominate distance- or gradient-based models.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(features)
global_features = scaler.transform(features)

## Train test split

In [4]:
# Fraction of the samples held out for evaluation.
test_size = 0.2

# A fixed random_state makes the split -- and therefore every score computed
# from it further down -- identical across "Restart & Run All" executions.
# The seed value matches the one used for the Perceptron and KFold below.
(trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal) = train_test_split(
    np.array(global_features),
    np.array(global_labels),
    test_size=test_size,
    random_state=1)
print("Train data - Total {}, feature size {}".format(trainDataGlobal.shape[0], trainDataGlobal.shape[1]))
print("Test data - Total {}, feature size {}".format(testDataGlobal.shape[0], testDataGlobal.shape[1]))

Train data - Total 6499, feature size 22
Test data - Total 1625, feature size 22


## Simple classification using a perceptron

In [5]:
# Display names for the two classes, in the order of the encoded label ids.
# NOTE(review): assumes id 0 is 'edible' and id 1 is 'poisonous' -- confirm
# against the encoding produced by Data.load_data.
class_names = ['edible', 'poisonous']

# Train a single-layer perceptron on the training split.
# Alternative model kept for reference:
# clf = MLPClassifier(solver='sgd', alpha=1e-5, learning_rate_init=0.1, verbose=10, hidden_layer_sizes=(15,15), random_state=1)
clf = Perceptron(random_state=1, max_iter=30, tol=0.001).fit(trainDataGlobal, trainLabelsGlobal)

# Evaluate on the held-out split.
predictions = clf.predict(testDataGlobal)

# The predictions are passed as the first argument, so rows of `matrix` index
# the predicted class and columns index the true class. Summing over axis 0
# therefore yields the number of true samples per class, and the ratio below
# is the fraction of each true class that was predicted correctly.
matrix = confusion_matrix(predictions, testLabelsGlobal)
acc_per_cls = np.diag(matrix) / matrix.sum(axis=0)

print('---------- Accuracy -----------')
for label, acc in zip(class_names, acc_per_cls):
    print('{} : {}'.format(label, acc))

---------- Accuracy -----------
edible : 0.8767772511848341
poisonous : 0.9846350832266325


## Other classifiers

In [6]:
# Metric reported by cross_val_score for every model.
scoring = "accuracy"
results = []
names = []

# (short name, estimator) pairs to compare. The random forest is seeded so its
# cross-validation scores are reproducible; KNN and SVC are left at their
# defaults as before.
models = [
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier(n_estimators=100, random_state=1)),
    ('SVM', SVC()),
]

# The splitter does not depend on the model, so it is built once. With a fixed
# integer seed every model is scored on the same five shuffled folds.
kfold = KFold(n_splits=5, random_state=1, shuffle=True)

for name, model in models:
    cv_results = cross_val_score(model, global_features, global_labels, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy over the folds, with the standard deviation in parentheses.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

KNN: 0.999877 (0.000246)
RF: 1.000000 (0.000000)
SVM: 1.000000 (0.000000)
