In [1]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.7.3
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.17.1
import numpy as np

# pandas  0.25.1
import pandas as pd

# scikit-learn  0.21.3
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_moons
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.1.1
# Import the pyplot submodule explicitly. A bare `import matplotlib` does not
# bind `matplotlib.pyplot`, so `plt = matplotlib.pyplot` only works when some
# other code (e.g. the `%matplotlib inline` magic) has already imported it.
import matplotlib
import matplotlib.pyplot as plt

# Use a larger default font for every figure in this notebook.
matplotlib.rcParams.update({'font.size': 22})

# load utility classes/functions that has been taught in previous labs
# e.g., plot_decision_regions()
from lib import *

# Make output directory.
# `exist_ok=True` makes this idempotent and avoids the check-then-create race
# of testing `os.path.exists` before calling `os.mkdir`.
os.makedirs("output/", exist_ok=True)

In [2]:
# Names for the 23 columns of the data file; the file is read with
# header=None, so the names are attached after loading.
column_name = [
    'classes', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# Download the raw mushroom data from the UCI repository and label its columns.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    engine='python',
    header=None,
)
df.columns = column_name

# Preview the first rows.
df.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Missing values in this dataset are marked with the string '?'.
# Show which columns contain it, and how often each 'stalk-root' value occurs.
display(df.eq('?').any())
print(df['stalk-root'].value_counts())

# Turn the '?' markers into real NaN values so pandas treats them as missing.
df_f = df.replace('?', np.nan)

# Drop every column that contains at least one missing value.
df_drop_col = df_f.dropna(axis='columns')

# Separate the target column from the feature columns.
Y = df_drop_col['classes']
X = df_drop_col.drop(columns=['classes'])

# Encode the class labels as integers and one-hot encode the features.
label_le = LabelEncoder()
Y = label_le.fit_transform(Y.values)
df_dummy_drop_row = pd.get_dummies(X)

# Hold out 20% of the samples as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df_dummy_drop_row, Y, random_state=1, test_size=0.2)


# Optional
# feature selection

classes                     False
cap-shape                   False
cap-surface                 False
cap-color                   False
bruises?                    False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                   True
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64


In [4]:
# KNN
# Fit an 11-nearest-neighbours classifier (Minkowski metric with p=2)
# on the training split; fit() returns the fitted estimator itself.
knn = KNeighborsClassifier(metric='minkowski', p=2, n_neighbors=11).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = knn.predict(X_test)
n_misclassified = np.count_nonzero(y_test != y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print('[KNN]')
print('Misclassified samples: %d' % n_misclassified)
print('Accuracy: %.2f' % test_accuracy)


[KNN]
Misclassified samples: 2
Accuracy: 1.00


In [5]:
# Linear SVC
# Fit a support vector classifier with a linear kernel and C=1000
# on the training split; fit() returns the fitted estimator itself.
svm_linear = SVC(C=1000.0, kernel='linear', random_state=0).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = svm_linear.predict(X_test)
n_misclassified = np.count_nonzero(y_test != y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print('[Linear SVC]')
print('Misclassified samples: %d' % n_misclassified)
print('Accuracy: %.2f' % test_accuracy)

[Linear SVC]
Misclassified samples: 0
Accuracy: 1.00


In [6]:
# Nonlinear SVC (RBF kernel)
# Fit a support vector classifier with an RBF kernel (gamma=0.2, C=10)
# on the training split; fit() returns the fitted estimator itself.
svm_rbf = SVC(C=10.0, gamma=0.2, kernel='rbf', random_state=0).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = svm_rbf.predict(X_test)
n_misclassified = np.count_nonzero(y_test != y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print('[Nonlinear SVC]')
print('Misclassified samples: %d' % n_misclassified)
print('Accuracy: %.2f' % test_accuracy)

[Nonlinear SVC]
Misclassified samples: 0
Accuracy: 1.00


<h2>Conclusion</h2>

Preprocess:

    1. eliminate column with NaN
    這個dataset中含有 '?' 的只有 'stalk-root' 這個column，且 '?' 占該column的比例極高：2480/8124（大約30%），
    故覺得比起imputing一堆不一定準確的資料，不如直接將該feature的column移除。
    
    2. encode with get_dummies
    這個dataset全部都是categorical features，故直接使用get_dummies 作 encode
    且因為做完encode後值全部是0或1，故也沒有額外進行標準化。
    

Result:

|model|misclassified|accuracy|
|-----|------------|--------|
|KNN|2|1.00|
|SVC(linear)|0|1.00|
|SVC(nonlinear)|0|1.00|

從結果可以看到，三個model的表現都好到不可思議，accuracy都將近100%（KNN錯了2筆，四捨五入後顯示為1.00）。不過如果一定要從3個model中選一個的話，我會選SVC(linear)：一方面是因為跟KNN相比，它在測試集上全對；另一方面是跟SVC(nonlinear)相比，它的運算速度快很多。