In [1]:
import warnings
# Silence every warning for the rest of the session. The filter is installed
# before numpy/pandas are imported, so it is already active while they load.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# notices; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import numpy as np
# Print numpy floats with a precision of 5 so array output stays compact.
np.set_printoptions(precision=5)

import pandas as pd
# Use the same precision of 5 when pandas renders floating-point values.
pd.set_option('display.precision', 5)

In [2]:
import mglearn
# Load mglearn's "forge" dataset: X holds the features and y the class labels
# that the classifier cells below train on.
X, y = mglearn.datasets.make_forge()

In [3]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the samples for testing; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
# Classify each point by majority vote among its 3 nearest training samples.
clf = KNeighborsClassifier(n_neighbors=3)
# Left as the final expression so the fitted estimator is shown as cell output.
clf.fit(X=X_train, y=y_train)

In [5]:
# Predict the held-out labels and print them directly under the true labels
# (true labels first, predictions second) for a visual comparison.
y_test_hat = clf.predict(X_test)
print(y_test, y_test_hat, sep='\n')

[1 0 1 0 1 1 0]
[1 0 1 0 1 0 0]


In [6]:
from sklearn.metrics import accuracy_score
# Predict on both splits first, then report the share of correct labels on
# each: train accuracy, followed by test accuracy.
y_train_hat = clf.predict(X_train)
y_test_hat = clf.predict(X_test)

print('train accuracy: %.5f' % accuracy_score(y_train, y_train_hat))
print('test accuracy: %.5f' % accuracy_score(y_test, y_test_hat))

train accuracy: 0.94737
test accuracy: 0.85714


In [7]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset and split it with a fixed seed. The target
# is passed as `stratify` so the split is stratified on the class labels.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=66,
    stratify=cancer.target,
)

In [8]:
# Sweep n_neighbors over 1..10 and record, for each k, the accuracy on the
# training split and on the held-out split (in the same order as k_settings).
k_settings = range(1, 11)
training_accuracy, test_accuracy = [], []

for k in k_settings:
    # fit() hands back the estimator, so construction and training chain.
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Predict both splits, then score each against its true labels.
    y_train_hat, y_test_hat = clf.predict(X_train), clf.predict(X_test)
    training_accuracy.append(accuracy_score(y_train, y_train_hat))
    test_accuracy.append(accuracy_score(y_test, y_test_hat))

In [9]:
# One row per k with its training and test accuracy; the frame is the cell's
# final expression so it renders as a table.
pd.DataFrame(
    data={
        'k': k_settings,
        'training accuracy': training_accuracy,
        'test accuracy': test_accuracy,
    }
)

Unnamed: 0,k,training accuracy,test accuracy
0,1,1.0,0.9021
1,2,0.97653,0.88811
2,3,0.95775,0.92308
3,4,0.9554,0.92308
4,5,0.94836,0.92308
5,6,0.94601,0.93706
6,7,0.94366,0.93007
7,8,0.94131,0.93007
8,9,0.93427,0.91608
9,10,0.93897,0.91608


In [10]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Reload the breast cancer dataset and recreate the seeded split used for the
# distance-metric experiment. The target is passed as `stratify` so the split
# is stratified on the class labels.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=66,
    stratify=cancer.target,
)

In [11]:
# Sweep the Minkowski exponent p over 1..5 with k fixed at 5 and record, for
# each p, the accuracy on the training split and on the held-out split.
p_settings = range(1, 6)
training_accuracy, test_accuracy = [], []

for p in p_settings:
    # fit() hands back the estimator, so construction and training chain.
    clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=p).fit(
        X_train, y_train
    )

    # Predict both splits, then score each against its true labels.
    y_train_hat, y_test_hat = clf.predict(X_train), clf.predict(X_test)
    training_accuracy.append(accuracy_score(y_train, y_train_hat))
    test_accuracy.append(accuracy_score(y_test, y_test_hat))

In [12]:
# One row per Minkowski p with its training and test accuracy; the frame is
# the cell's final expression so it renders as a table.
pd.DataFrame(
    data={
        'p': p_settings,
        'training accuracy': training_accuracy,
        'test accuracy': test_accuracy,
    }
)

Unnamed: 0,p,training accuracy,test accuracy
0,1,0.96479,0.93706
1,2,0.94836,0.92308
2,3,0.94366,0.93007
3,4,0.94366,0.92308
4,5,0.94366,0.92308


In [13]:
import mglearn
# Generate mglearn's "wave" dataset with 40 samples: X holds the features and
# y the continuous target that the regression cells below fit.
X, y = mglearn.datasets.make_wave(n_samples=40)

In [14]:
from sklearn.model_selection import train_test_split
# Split into train and test parts with a fixed seed for reproducibility;
# test_size is not given, so the library default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [15]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor configured to use 3 neighbours.
reg = KNeighborsRegressor(n_neighbors=3)
# Left as the final expression so the fitted estimator is shown as cell output.
reg.fit(X=X_train, y=y_train)

In [16]:
# Predict the held-out targets and print them directly under the true values
# (true values first, predictions second) for a visual comparison.
y_test_hat = reg.predict(X_test)
print(y_test, y_test_hat, sep='\n')

[ 0.37299  0.21778  0.96695 -1.38774 -1.0598  -0.90497  0.43656  0.77896
 -0.54115 -0.95652]
[-0.05397  0.35686  1.13672 -1.89416 -1.13881 -1.63113  0.35686  0.91241
 -0.4468  -1.13881]


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Report three regression metrics for the held-out predictions.
print('MAE: %.5f' % mean_absolute_error(y_test, y_test_hat))
# RMSE is the square root of the MSE; the parentheses spell out that the
# power is applied to the metric before it is formatted.
print('RMSE: %.5f' % (mean_squared_error(y_test, y_test_hat) ** 0.5))
print('R_square: %.5f' % r2_score(y_test, y_test_hat))

MAE: 0.25372
RMSE: 0.32966
R_square: 0.83442


In [18]:
# Preview the wave dataset as a table. X is flattened with ravel() so it can
# serve as a single column next to the target y (presumably X has a single
# feature column; confirm against make_wave).
# Only the first 10 rows are shown, matching the other preview tables in this
# notebook, rather than rendering the whole dataset.
pd.DataFrame(
    {
        'X': X.ravel(),
        'Y': y,
    }
).head(10)

Unnamed: 0,X,Y
0,-0.75276,-0.44822
1,2.70429,0.33123
2,1.39196,0.77932
3,0.59195,0.03498
4,-2.06389,-1.38774
5,-2.06403,-2.47196
6,-2.6515,-1.52731
7,2.19706,1.49417
8,0.60669,1.00032
9,1.24844,0.22956


In [19]:
import mglearn
# Reload mglearn's "forge" dataset for the standardization experiment:
# X holds the features and y the class labels.
X, y = mglearn.datasets.make_forge()

In [20]:
from sklearn.model_selection import train_test_split
# Split into train and test parts with a fixed seed for reproducibility;
# test_size is not given, so the library default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [21]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits, so nothing is fitted on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier trained on the scaled training features.
clf = KNeighborsClassifier(n_neighbors=3)
# Left as the final expression so the fitted estimator is shown as cell output.
clf.fit(X=X_train_scaled, y=y_train)

In [23]:
from sklearn.metrics import accuracy_score
# Predict on both scaled splits first, then report the share of correct
# labels on each: train accuracy, followed by test accuracy.
y_train_hat = clf.predict(X_train_scaled)
y_test_hat = clf.predict(X_test_scaled)

print('train accuracy: %.5f' % accuracy_score(y_train, y_train_hat))
print('test accuracy: %.5f' % accuracy_score(y_test, y_test_hat))

train accuracy: 0.94737
test accuracy: 0.85714


In [24]:
# Side-by-side view of the first 10 training rows: the two raw feature
# columns followed by the same two columns after scaling.
pd.DataFrame(
    data={
        'X1_train': X_train[:10, 0],
        'X2_train': X_train[:10, 1],
        'X1_train_scaled': X_train_scaled[:10, 0],
        'X2_train_scaled': X_train_scaled[:10, 1],
    }
)

Unnamed: 0,X1_train,X2_train,X1_train_scaled,X2_train_scaled
0,8.9223,-0.63993,-0.43383,-1.66209
1,8.73371,2.49162,-0.61218,-0.21838
2,9.32298,5.09841,-0.05489,0.98339
3,7.99815,4.85251,-1.30782,0.87003
4,11.03295,-0.16817,1.56227,-1.4446
5,9.17748,5.09283,-0.1925,0.98082
6,11.56396,1.33894,2.06445,-0.74979
7,9.15072,5.49832,-0.2178,1.16776
8,8.3481,5.13416,-0.97686,0.99988
9,11.93027,4.64866,2.41088,0.77605


In [25]:
import mglearn
# Reload mglearn's "forge" dataset for the min-max scaling experiment:
# X holds the features and y the class labels.
X, y = mglearn.datasets.make_forge()

In [26]:
from sklearn.model_selection import train_test_split
# Split into train and test parts with a fixed seed for reproducibility;
# test_size is not given, so the library default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [27]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits, so nothing is fitted on the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [28]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier trained on the scaled training features.
clf = KNeighborsClassifier(n_neighbors=3)
# Left as the final expression so the fitted estimator is shown as cell output.
clf.fit(X=X_train_scaled, y=y_train)

In [29]:
from sklearn.metrics import accuracy_score
# Predict on both scaled splits first, then report the share of correct
# labels on each: train accuracy, followed by test accuracy.
y_train_hat = clf.predict(X_train_scaled)
y_test_hat = clf.predict(X_test_scaled)

print('train accuracy: %.5f' % accuracy_score(y_train, y_train_hat))
print('test accuracy: %.5f' % accuracy_score(y_test, y_test_hat))

train accuracy: 0.94737
test accuracy: 0.85714


In [30]:
# Side-by-side view of the first 10 training rows: the two raw feature
# columns followed by the same two columns after scaling.
pd.DataFrame(
    data={
        'X1_train': X_train[:10, 0],
        'X2_train': X_train[:10, 1],
        'X1_train_scaled': X_train_scaled[:10, 0],
        'X2_train_scaled': X_train_scaled[:10, 1],
    }
)

Unnamed: 0,X1_train,X2_train,X1_train_scaled,X2_train_scaled
0,8.9223,-0.63993,0.23502,0.0
1,8.73371,2.49162,0.18706,0.51017
2,9.32298,5.09841,0.33693,0.93485
3,7.99815,4.85251,0.0,0.89479
4,11.03295,-0.16817,0.7718,0.07686
5,9.17748,5.09283,0.29992,0.93394
6,11.56396,1.33894,0.90684,0.32238
7,9.15072,5.49832,0.29312,1.0
8,8.3481,5.13416,0.089,0.94067
9,11.93027,4.64866,1.0,0.86158
