# K-Nearest Neighbor  
### KNN ：非參數式 (non-parametric) 的 分類 或 回歸 模型（不假設資料分布；K 值是需要事先指定的超參數）
ref: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
- 鳶尾花資料集 來源： (User Guide 可以看欄位簡介)
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html
1. Classes：3  (分3種花)
2. Samples per class：50  (每一種類50筆)
3. Samples total：150 (共150筆)
4. Dimensionality：4 (維度4，每一筆特徵有4個：花萼長寬、花瓣長寬)
5. 目標：用這4個特徵欄位，判斷應該是屬於哪一類別的花（下方示範程式為了簡化，只取前兩個特徵）

In [17]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the iris data set; this demo keeps only the first two feature columns.
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target  # y is the flower class of each sample

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# n_neighbors is the K of K-nearest-neighbours.
model = neighbors.KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False yields the number of correctly predicted samples
# instead of the fraction.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)  # confusion matrix

print(y_pred)
print('accuracy:', accuracy)
print('number of correct sample:', num_correct_samples)
print('confusion matrix:', con_matrix)

[0 1 1 0 1 0 1 1 0 1 2 1 0 0 1 0 1 2 0 1 2 2 0 2 1 0 2 0 0 1]
accuracy: 0.8
number of correct sample: 24
confusion matrix: [[12  1  0]
 [ 0  8  2]
 [ 0  3  4]]


### KNN 練習
- 小麥資料集 來源： (共8個欄位：7個特徵 + 1個類別，210筆資料)
- https://archive.ics.uci.edu/ml/datasets/seeds#
1. 前面幾欄是 特徵：面積.周長.籽粒長度....
2. 最後一欄是小麥的 類別 (有3種 數字1.2.3)
3. 目標：用前面幾欄特徵，分類出是哪種小麥品種

In [8]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import accuracy_score

# The CSV has no header row; header=None makes pandas assign integer
# column labels instead of consuming the first data row as names.
df = pd.read_csv('./dataset/seeds_dataset.csv', header=None)
print(df.head())

# Columns 0-6 are the features, column 7 is the wheat variety.
X = df.loc[:, 0:6]
y = df[7] - 1  # class labels in the file start at 1; shift them to start at 0

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then scale both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# n_neighbors (K) is not passed, so the default is used (5 per the original note).
model = neighbors.KNeighborsClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False yields the number of correct predictions, not the ratio.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)

print('accuracy: {}'.format(accuracy))
print('number of correct sample: {}'.format(num_correct_samples))

       0      1       2      3      4      5      6  7
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  1
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  1
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  1
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  1
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  1
accuracy: 0.9285714285714286
number of correct sample: 39


# Decision Tree  決策樹 (找到切入的點)
ref: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

## Install graphviz to draw the tree
`pip install graphviz`   
`conda install python-graphviz`

- 鳶尾花資料集 來源： (User Guide 可以看欄位簡介)
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html
1. Classes：3  (分3種花)
2. Samples per class：50  (每一種類50筆)
3. Samples total：150 (共150筆)
4. Dimensionality：4 (維度4，每一筆特徵有4個：花萼長寬、花瓣長寬)
5. 目標：用前面4欄的特性，去看應該是屬於哪一類別的花

In [16]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
import graphviz 

import os

# Append the bundled Graphviz executables folder to PATH so rendering can find them.
os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], './graphviz-2.38/bin'])

# Load all four iris features and the class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then scale both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Limit the tree to at most three levels before training.
model = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False yields the number of correct predictions, not the ratio.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample:', num_correct_samples)
print('accuracy:', accuracy)
print('con_matrix:', con_matrix)

# Export the fitted tree as DOT source and render it with Graphviz.
dot_data = tree.export_graphviz(model, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("iris")  # writes the drawing into the working folder (recorded output: 'iris.pdf')


number of correct sample: 30
accuracy: 1.0
con_matrix: [[16  0  0]
 [ 0  7  0]
 [ 0  0  7]]


'iris.pdf'

# Naive Bayes  樸素貝葉斯 (機率)
ref:  
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html  
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [22]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# confusion_matrix is used below; without importing it here the cell only
# works when an earlier cell happened to import it first.
from sklearn.metrics import accuracy_score, confusion_matrix

# Load all four iris features and the class labels.
iris = datasets.load_iris()

X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# ---------- two alternative approaches (enable exactly one) ----------
# 1. Gaussian Naive Bayes: features are standardized, then modelled as
#    normally distributed.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
model = GaussianNB()

# 2. Multinomial Naive Bayes (the first variant in the textbook): features
#    are min-max scaled instead of standardized.
# scaler = preprocessing.MinMaxScaler().fit(X_train)
# X_train = scaler.transform(X_train)
# model = MultinomialNB()

# ---------------------------------------------------------------------

model.fit(X_train, y_train)

# Reuse the scaler fitted on the training split for the test split.
X_test = scaler.transform(X_test)

y_pred = model.predict(X_test)
# normalize=False yields the number of correct predictions, not the ratio.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample:', num_correct_samples)
print('accuracy:', accuracy)
print('con_matrix:', con_matrix)

number of correct sample: 29
accuracy: 0.9666666666666667
con_matrix: [[ 9  0  0]
 [ 0 11  1]
 [ 0  0  9]]


# Random Forest  (多棵決策樹)
ref: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
import graphviz 

import os

# Append the bundled Graphviz executables folder to PATH so rendering can find them.
os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], './graphviz-2.38/bin'])

# Load all four iris features and the class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then scale both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# max_depth=6 bounds the depth of every tree; n_estimators=10 builds ten trees.
model = RandomForestClassifier(max_depth=6, n_estimators=10).fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False yields the number of correct predictions, not the ratio.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample:', num_correct_samples)
print('accuracy:', accuracy)
print('con_matrix:', con_matrix)


# Draw every tree of the forest into its own file.
for tree_index, estimator in enumerate(model.estimators_):
    dot_data = tree.export_graphviz(estimator, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render('./random_forest_plot/tree_' + str(tree_index))

number of correct sample: 30
accuracy: 1.0
con_matrix: [[10  0  0]
 [ 0  8  0]
 [ 0  0 12]]


# SVM  (Support Vector Machine) 支撐向量機
ref: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

- 乳癌 (breast cancer) 資料集 來源： (點開User Guide 有欄位介紹)
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
1. Classes：兩個類別 
2. Samples per class：惡性 (M, malignant) 212筆、良性 (B, benign) 357筆，共569筆資料
3. Dimensionality：30種特徵
4. 目標：用這些特徵去預測腫瘤是 惡性 (M) 或 良性 (B)

In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the breast cancer features and the binary class labels.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then scale both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The course handout (p.58) lists three kernel types - rbf, poly, linear -
# any of which can be swapped in here.
model = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False yields the number of correct predictions, not the ratio.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample:', num_correct_samples)
print('accuracy:', accuracy)
print('con_matrix:', con_matrix)

number of correct sample: 110
accuracy: 0.9649122807017544
con_matrix: [[41  3]
 [ 1 69]]
