1.2 Data load

In [61]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the iris dataset and print a short summary of what it contains.
iris_dataset = load_iris()

# Class labels and the names of the measured features.
print("Target names:", iris_dataset.target_names)
print("Feature names:\n", iris_dataset.feature_names)

# Container type and dimensions of the feature matrix ...
print("Type of data:", type(iris_dataset.data))
print("Shape of data:", iris_dataset.data.shape)
print("=============================")
# ... and of the label vector.
print("Type of target:", type(iris_dataset.target))
print("Shape of target:", iris_dataset.target.shape)

Target names: ['setosa' 'versicolor' 'virginica']
Feature names:
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Type of data: <class 'numpy.ndarray'>
Shape of data: (150, 4)
Type of target: <class 'numpy.ndarray'>
Shape of target: (150,)


1.3 Data preprocessing

In [62]:
# Hold out 20% of the samples as the test set; the remainder is kept together
# as the combined training + validation pool. random_state pins the shuffle.
X_train_and_valid, X_test, y_train_and_valid, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    test_size=0.2,
    random_state=0,
)

# Report the shapes of both parts, pool first and test set second.
for caption, array in (
    ("X_train_and_valid shape:", X_train_and_valid),
    ("y_train_and_valid shape:", y_train_and_valid),
):
    print(caption, array.shape)
print("=============================")
for caption, array in (
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, array.shape)

X_train_and_valid shape: (120, 4)
y_train_and_valid shape: (120,)
X_test shape: (30, 4)
y_test shape: (30,)


In [63]:
# Split the training + validation pool again: 20% of it becomes the
# validation set used for hyperparameter selection, the rest is training data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_and_valid,
    y_train_and_valid,
    test_size=0.2,
    random_state=0,
)

# Report the shapes of both parts, training first and validation second.
for caption, array in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
):
    print(caption, array.shape)
print("=============================")
for caption, array in (
    ("X_valid shape:", X_valid),
    ("y_valid shape:", y_valid),
):
    print(caption, array.shape)

X_train shape: (96, 4)
y_train shape: (96,)
X_valid shape: (24, 4)
y_valid shape: (24,)


In [64]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transform to all three splits so validation/test data never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scale, X_valid_scale, X_test_scale = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)

1.4 KNN and hyperparameter selection

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Hyperparameter grid: neighbour counts k = 1..30 and Minkowski powers p = 1..5.
neighbors_settings = [*range(1, 31)]
p_settings = [*range(1, 6)]
# Filled by weight_func: weighting scheme -> p -> accuracy lists.
data_dict = {}

def weight_func(weight):
    """Evaluate KNN over every (p, n_neighbors) pair for one weighting scheme.

    For each Minkowski power ``p`` in ``p_settings`` and each neighbour count
    in ``neighbors_settings``, a ``KNeighborsClassifier`` is fitted on the
    scaled training split and its accuracy is measured on both the training
    and the validation split.

    Parameters
    ----------
    weight : str
        Value passed through to ``KNeighborsClassifier(weights=...)``; this
        notebook calls the function with ``'uniform'`` and ``'distance'``.

    Returns
    -------
    dict
        ``{p: {'training': [...], 'valid': [...]}}``. Each list holds one
        accuracy per entry of ``neighbors_settings``, in the same order.
        The same dict is also stored in the module-level
        ``data_dict[weight]``.
    """
    p_dict = dict()
    for p in p_settings:
        training_accuracy = []
        valid_accuracy = []
        for n_neighbors in neighbors_settings:
            # build the model
            knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski', p=p, weights=weight)
            knn.fit(X_train_scale, y_train)

            # accuracy on the data the model was fitted on
            y_train_hat = knn.predict(X_train_scale)
            training_accuracy.append(accuracy_score(y_train, y_train_hat))

            # accuracy on the held-out validation split
            y_valid_hat = knn.predict(X_valid_scale)
            valid_accuracy.append(accuracy_score(y_valid, y_valid_hat))
        p_dict[p] = dict(training=training_accuracy, valid=valid_accuracy)

    # Register the results once, after every p has been evaluated. Doing this
    # inside the loop re-assigned the same dict on each pass and stored nothing
    # at all when p_settings was empty.
    data_dict[weight] = p_dict
    return p_dict

# Run the grid for both weighting schemes; each call fills data_dict[<scheme>].
for weighting in ('uniform', 'distance'):
    weight_func(weighting)

# Display the collected accuracies.
data_dict

# Disabled plotting sketch. training_accuracy and valid_accuracy are local to
# weight_func, so the lists must be read from data_dict before re-enabling it.
# plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
# plt.plot(neighbors_settings, valid_accuracy, label="valid accuracy")
# plt.ylabel("Accuracy")
# plt.xlabel("n_neighbors")
# plt.title("uniform and Minkowski")
# plt.legend()

{'uniform': {1: {'training': [1.0,
    0.9583333333333334,
    0.96875,
    0.9791666666666666,
    0.9791666666666666,
    0.96875,
    0.9583333333333334,
    0.9583333333333334,
    0.9791666666666666,
    0.96875,
    0.96875,
    0.9583333333333334,
    0.9583333333333334,
    0.9583333333333334,
    0.9479166666666666,
    0.9583333333333334,
    0.9375,
    0.9375,
    0.9375,
    0.9479166666666666,
    0.9479166666666666,
    0.9479166666666666,
    0.9479166666666666,
    0.9479166666666666,
    0.9479166666666666,
    0.9375,
    0.9375,
    0.9375,
    0.9479166666666666,
    0.9375],
   'valid': [0.9166666666666666,
    0.9166666666666666,
    0.8333333333333334,
    0.875,
    0.875,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.9166666666666666,
    0.875,
    0.916666666666

In [66]:
# Spot check: training accuracy for uniform weights, p=1, second grid entry (k=2).
uniform_p1_training = data_dict['uniform'][1]['training']
print(uniform_p1_training[1])

0.9583333333333334


In [67]:
import pandas as pd

# Flatten the nested data_dict results into one table with a row per
# (weights, p, k) configuration. Rows are ordered uniform first, then
# distance; within each scheme by p, then by k.
result_columns = ['weights', 'metric', 'k', 'training_accuracy', 'valid_accuracy']
data = {column: [] for column in result_columns}

for weights in ('uniform', 'distance'):
    for p in p_settings:
        scores = data_dict[weights][p]
        # Pair each k with its position in the accuracy lists instead of
        # assuming position == k - 1, which only holds while the grid starts
        # at 1 with step 1.
        for position, n_neighbors in enumerate(neighbors_settings):
            data['weights'].append(weights)
            data['metric'].append(p)  # this column holds the Minkowski power p
            data['k'].append(n_neighbors)
            data['training_accuracy'].append(scores['training'][position])
            data['valid_accuracy'].append(scores['valid'][position])

# The default index already numbers the rows 0..n-1, so no hand-computed
# index length is needed.
df = pd.DataFrame(data, columns=result_columns)
df

Unnamed: 0,weights,metric,k,training_accuracy,valid_accuracy
0,uniform,1,1,1.000000,0.916667
1,uniform,1,2,0.958333,0.916667
2,uniform,1,3,0.968750,0.833333
3,uniform,1,4,0.979167,0.875000
4,uniform,1,5,0.979167,0.875000
5,uniform,1,6,0.968750,0.916667
6,uniform,1,7,0.958333,0.916667
7,uniform,1,8,0.958333,0.916667
8,uniform,1,9,0.979167,0.916667
9,uniform,1,10,0.968750,0.916667


In [68]:
# Highest validation accuracy over all evaluated configurations.
m = df['valid_accuracy'].max()
# Row positions of every configuration that ties for that best score.
max_index = [
    position
    for position, accuracy in enumerate(df['valid_accuracy'].tolist())
    if accuracy == m
]
print(max_index, m)

[81, 109, 111] 1.0


In [69]:
# Show the full rows of the configurations that reached the best validation accuracy.
df.loc[max_index, :]

Unnamed: 0,weights,metric,k,training_accuracy,valid_accuracy
81,uniform,3,22,0.9375,1.0
109,uniform,4,20,0.916667,1.0
111,uniform,4,22,0.895833,1.0


In [70]:
# Several configurations tie for the best validation accuracy; take the first
# one. Reading it from the results table keeps the final model in sync with
# the search instead of relying on hand-copied hyperparameters.
best_config = df.loc[max_index[0]]
knn = KNeighborsClassifier(
    n_neighbors=int(best_config['k']),
    metric='minkowski',
    p=int(best_config['metric']),  # the 'metric' column stores the Minkowski power p
    weights=best_config['weights'],
)
knn.fit(X_train_scale, y_train)

# Final, one-off evaluation on the held-out test split.
y_test_hat = knn.predict(X_test_scale)
print("test accuracy : ", accuracy_score(y_test, y_test_hat))

test accuracy :  0.9
