In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

nurses = pd.read_csv("nurses.csv")
nurses.head(5)

Unnamed: 0,JobTitle,BasePay,Year
0,Registered Nurse,174044.89,2012
1,Registered Nurse,129121.08,2012
2,Registered Nurse,134022.0,2012
3,Registered Nurse,134022.0,2012
4,Registered Nurse,133533.6,2012


In [65]:
#Drop categorical Feature
nurses.drop(axis=1, columns=['JobTitle'], inplace=True)
nurses

Unnamed: 0,BasePay,Year
0,174044.89,2012
1,129121.08,2012
2,134022.00,2012
3,134022.00,2012
4,133533.60,2012
...,...,...
9191,4176.00,2018
9192,4337.98,2018
9193,4982.40,2018
9194,4048.80,2018


In [66]:
#Check for null values
nurses['BasePay'].isnull().values.any()

True

In [67]:
#Shape before dropping nulls
nurses.shape

(9196, 2)

In [68]:
#drop null values 11 rows dropped
nurses.dropna()

Unnamed: 0,BasePay,Year
0,174044.89,2012
1,129121.08,2012
2,134022.00,2012
3,134022.00,2012
4,133533.60,2012
...,...,...
9191,4176.00,2018
9192,4337.98,2018
9193,4982.40,2018
9194,4048.80,2018


In [69]:
X_train, X_test, y_train, y_test = train_test_split(nurses.BasePay, nurses.Year, random_state=3, test_size=.20)

In [70]:
X_train.shape

(7356,)

In [71]:
X_test.shape

(1840,)

In [72]:
#Display contents in X_train
X_train

3365    103493.39
1670    112427.78
8700    131910.14
6690    150705.56
2740    112672.51
          ...    
8981     99636.40
6400    158202.15
9160     14116.80
1688    109581.67
5994     93715.28
Name: BasePay, Length: 7356, dtype: float64

In [73]:
#Convert to 2d array
X_train = X_train.values.reshape(-1,1)

In [74]:
#Display conversion
X_train

array([[103493.39],
       [112427.78],
       [131910.14],
       ...,
       [ 14116.8 ],
       [109581.67],
       [ 93715.28]])

In [75]:
#Target shape should be 1d array
y_train.shape

(7356,)

In [76]:
#Targets
y_train

3365    2014
1670    2013
8700    2018
6690    2017
2740    2014
        ... 
8981    2018
6400    2017
9160    2018
1688    2013
5994    2016
Name: Year, Length: 7356, dtype: int64

In [77]:
#Fix infinite or nan values
X_train = np.nan_to_num(X_train)

In [78]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_neighbors=3)

In [79]:
X_test = X_test.values.reshape(-1,1)

In [80]:
X_test = np.nan_to_num(X_test)

In [86]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestCentroid

In [88]:
estimators = {
    'KNeighborsClassifier': knn,
    'SVC': SVC(gamma='scale'),
    'GaussianNB': GaussianNB(),
    'NearestCentroid': NearestCentroid()}


In [89]:
nurse_salaries = nurses.BasePay.values.reshape(-1,1)
nurse_salaries = np.nan_to_num(nurse_salaries)

In [90]:
for estimator_name, estimator_object in estimators.items():
    kfold = KFold(n_splits=10, random_state=11, shuffle=True)
    scores = cross_val_score(estimator=estimator_object,
        X=nurse_salaries, y=nurses.Year, cv=kfold)
    print(f'{estimator_name:>20}: ' +
          f'mean accuracy={scores.mean():.2%}; ' +
          f'standard deviations={scores.std():.2%}')

KNeighborsClassifier: mean accuracy=29.28%; standard deviations=1.41%
                 SVC: mean accuracy=21.42%; standard deviations=1.62%
          GaussianNB: mean accuracy=19.46%; standard deviations=1.71%
     NearestCentroid: mean accuracy=18.46%; standard deviations=1.31%


In [91]:
predicted = knn.predict(X=X_test)

In [92]:
expected = y_test

In [93]:
predicted[:20]

array([2016, 2013, 2016, 2018, 2015, 2013, 2012, 2015, 2012, 2012, 2018,
       2016, 2013, 2015, 2013, 2014, 2018, 2017, 2012, 2015], dtype=int64)

In [94]:
expected[:20]

7938    2018
451     2012
6694    2017
8371    2018
6803    2017
4148    2015
898     2012
4007    2015
6359    2016
4741    2015
702     2012
6549    2017
5680    2016
6136    2016
3052    2014
6947    2017
8107    2018
1238    2012
7417    2017
8485    2018
Name: Year, dtype: int64

In [95]:
#get all of the wrong predictions
wrong = [(p,e) for (p,e) in zip (predicted, expected) if p != e]

In [96]:
#print wrong predictions in tuples  (predicted,expected)
wrong

[(2016, 2018),
 (2013, 2012),
 (2016, 2017),
 (2015, 2017),
 (2013, 2015),
 (2012, 2016),
 (2012, 2015),
 (2018, 2012),
 (2016, 2017),
 (2013, 2016),
 (2015, 2016),
 (2013, 2014),
 (2014, 2017),
 (2017, 2012),
 (2012, 2017),
 (2015, 2018),
 (2013, 2016),
 (2013, 2017),
 (2012, 2014),
 (2012, 2018),
 (2015, 2013),
 (2015, 2017),
 (2012, 2017),
 (2012, 2013),
 (2013, 2018),
 (2013, 2017),
 (2013, 2012),
 (2013, 2018),
 (2015, 2017),
 (2012, 2016),
 (2014, 2012),
 (2012, 2016),
 (2013, 2015),
 (2012, 2013),
 (2015, 2013),
 (2014, 2018),
 (2017, 2018),
 (2013, 2018),
 (2015, 2017),
 (2014, 2012),
 (2018, 2016),
 (2014, 2013),
 (2014, 2016),
 (2013, 2015),
 (2015, 2018),
 (2018, 2014),
 (2018, 2014),
 (2013, 2015),
 (2015, 2017),
 (2013, 2015),
 (2012, 2016),
 (2015, 2012),
 (2012, 2014),
 (2012, 2015),
 (2012, 2014),
 (2014, 2016),
 (2013, 2014),
 (2013, 2015),
 (2013, 2012),
 (2013, 2016),
 (2015, 2017),
 (2013, 2012),
 (2015, 2012),
 (2013, 2014),
 (2018, 2012),
 (2016, 2015),
 (2012, 20

In [97]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_true=expected, y_pred=predicted)

In [98]:
confusion

array([[132,  53,  31,  20,  11,   9,  11],
       [ 64,  88,  33,  18,  12,  17,   9],
       [ 60,  68,  62,  20,  13,  10,  14],
       [ 50,  50,  32,  63,  20,  17,  15],
       [ 65,  50,  29,  34,  56,  26,  23],
       [ 55,  43,  31,  35,  24,  58,  36],
       [ 55,  32,  31,  35,  25,  18,  77]], dtype=int64)

In [99]:
#Determine number of neighbors for highest accuracy
for k in range(1,20, 2):
    kfold = KFold(n_splits=5, random_state=11, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn,
        X=nurse_salaries, y=nurses.Year, cv=kfold)
    print(f'k={k:<2}; mean accuracy={scores.mean():.2%}; ' +
             f'standard deviation={scores.std():.2%}')

k=1 ; mean accuracy=28.97%; standard deviation=0.85%
k=3 ; mean accuracy=29.45%; standard deviation=0.75%
k=5 ; mean accuracy=29.98%; standard deviation=1.24%
k=7 ; mean accuracy=30.75%; standard deviation=1.31%
k=9 ; mean accuracy=30.70%; standard deviation=0.72%
k=11; mean accuracy=30.48%; standard deviation=0.81%
k=13; mean accuracy=31.23%; standard deviation=1.03%
k=15; mean accuracy=30.53%; standard deviation=0.96%
k=17; mean accuracy=30.45%; standard deviation=0.58%
k=19; mean accuracy=30.33%; standard deviation=0.81%
