In [5]:
from IPython.display import Image
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# inline plotting instead of popping out
%matplotlib inline

# Put the notebook's working directory on the import path so that helper
# modules written in earlier labs (e.g. plot_decision_regions()) can be
# imported from this notebook.
import os
import sys

module_path = os.path.abspath(os.curdir)
sys.path.append(module_path)

# Download the UCI mushroom data set. The file has no header row, so the
# column names are attached right after loading.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    engine='python',
)

# Uncomment to work on a 2000-row subsample instead of the full data set.
#df = df.sample(n=2000, random_state=0)

# First column is the target, the remaining ones are the categorical features.
df.columns = [
    'classes',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises?',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# Encode the target as integers *before* one-hot encoding, so that
# get_dummies() below leaves 'classes' as a single numeric column.
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

# The data marks missing values with '?'. Convert them to NaN and drop every
# column that contains at least one missing value.
df = df.replace('?', np.nan).dropna(axis=1)

# All remaining feature columns hold strings, so one-hot encode them.
df = pd.get_dummies(df)
display(df.head())

from sklearn.pipeline import Pipeline

# Separate the one-hot encoded features from the integer class labels.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (df.drop('classes', 1)) raises a TypeError on pandas >= 2.0.
X = df.drop(columns='classes').values
y = df['classes'].values

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Each classifier is wrapped in a pipeline together with z-score scaling, so
# the scaler is fitted on the training split only and then reused on the test
# split at prediction time.
pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('clf', KNeighborsClassifier(n_neighbors=10, p=2, metric='minkowski'))])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(kernel='rbf', random_state=0, gamma=0.001, C=100.0))])

# Fit every model on the training split and report its test performance.
# The header is printed after fitting/predicting, and `y_pred` ends up holding
# the SVC predictions, exactly as in the original step-by-step version.
for header, pipe in (('[KNN:]', pipe_knn), ('\n[SVC:]', pipe_svm)):
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(header)
    print('Misclassified samples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))


'''
先用LabelEncoder()處理classes。
接著看過dataset之後發現missing data只在stalk-root這個feature中出現，且missing data占了這個feature很大的比例，所以決定直接捨棄這行，
用剩下的所有feature來train model。又因為所有的columns都是String，所以直接使用get_dummies做one-hot encoding。
最後把z-normalization和classifier包進pipeline，train，test。
'''

Unnamed: 0,classes,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0




[KNN:]
Misclassified samples: 0
Accuracy: 1.0000





[SVC:]
Misclassified samples: 0
Accuracy: 1.0000


'\n先用LableEncode()處理classes。\n接著看過dataset之後發現missing data只在stalk-root這個feature中出現，且missing data占了這個feature很大的比例，所以決定直接捨棄這行，\n用剩下的所有feature來train model。又因為所有的cloumns都是String，所以直接使用get_dummies做one-hot encoding。\n最後把z-normalization和classifier包進pipeline，train，test。\n'