# HOMEWORK: k-Nearest Neighbors

In [106]:
import os

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 100)

from sklearn import preprocessing, neighbors, grid_search, cross_validation
from sklearn import model_selection

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [12]:
df = pd.read_csv('/Users/edwardlee/Desktop/df-sf-32/DS-SF-32/lessons/lesson-8/dataset-boston.csv')

In [13]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


The Boston dataset concerns itself with housing values in suburbs of Boston.  A description of the dataset is as follows:

- CRIM: per capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sqft
- INDUS: proportion of non-retail business acres per town
- CHAS: Charles River binary/dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- DIS: weighted distances to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: full-value property-tax rate (per ten thousands of dollars)
- PTRATIO: pupil-teacher ratio by town
- B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes (in thousands of dollars)

## Question 1.  
+ Let's first categorize `MEDV` to 4 groups: Bottom 20% as Level 1, next 30% as Level 2, next 30% categorized as Level 3, and the top 20% as Level 4.  
+ Please create a new variable `MEDV_Category` that stores the level number
+ Remember the quantile function
+ Remember how to segment your pandas data frame

In [14]:
print df['MEDV'].quantile(.2)
print df['MEDV'].quantile(.5)
print df['MEDV'].quantile(.8)

15.3
21.2
28.2


In [15]:
q1 = df['MEDV'].quantile(.2)
q2 = df['MEDV'].quantile(.5)
q3 = df['MEDV'].quantile(.8)

def classify_medv(x):
    if x < q1:
        return 0
    elif x >= q1 and x < q2:
        return 1
    elif x >= q2 and x < q3:
        return 2
    elif x >= q3:
        return 3

df['medv_category'] = df['MEDV'].map(classify_medv)

In [16]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV,medv_category
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,2
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,2
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,3
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,3
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,3


### Our goal is to predict `MEDV_Category` based on `RM`, `PTRATIO`, and `LSTAT`

## Question 2.  

+ First normalize `RM`, `PTRATIO`, and `LSTAT`.  
+ By normalizing, we mean to scale each variable between 0 and 1 with the lowest value as 0 and the highest value as 1

+ Check out the documentation for MinMaxScaler()

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
mms = MinMaxScaler()
df['RM'] = mms.fit_transform(df['RM'])
df['PTRATIO'] = mms.fit_transform(df['PTRATIO'])
df['LSTAT'] = mms.fit_transform(df['LSTAT'])



In [25]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV,medv_category
0,0.00632,18.0,2.31,0,0.538,0.577505,65.2,4.09,1,296,0.287234,396.9,0.08968,24.0,2
1,0.02731,0.0,7.07,0,0.469,0.547998,78.9,4.9671,2,242,0.553191,396.9,0.20447,21.6,2
2,0.02729,0.0,7.07,0,0.469,0.694386,61.1,4.9671,2,242,0.553191,392.83,0.063466,34.7,3
3,0.03237,0.0,2.18,0,0.458,0.658555,45.8,6.0622,3,222,0.648936,394.63,0.033389,33.4,3
4,0.06905,0.0,2.18,0,0.458,0.687105,54.2,6.0622,3,222,0.648936,396.9,0.099338,36.2,3


## Question 3.  

+ Run a k-NN classifier with 5 nearest neighbors and report your misclassification error; set weights to uniform
+ Calculate your misclassification error on the training set

In [42]:
df.shape

(506, 15)

In [26]:
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [27]:
X = df[['RM', 'PTRATIO', 'LSTAT']]
y = df['medv_category'].values

In [95]:
from sklearn.cross_validation import train_test_split

trainX, testX, trainY, testY = train_test_split(X, y, stratify=y, train_size=.80)
print trainX.shape, testX.shape
print trainY.shape, testY.shape

(404, 3) (102, 3)
(404,) (102,)


In [96]:
model = knn.fit(trainX, trainY)

In [97]:
y_predict = model.predict(testX)

In [98]:
model.score(testX, testY)

0.73529411764705888

In [99]:
from sklearn.metrics import confusion_matrix, classification_report

cnf_mtx = confusion_matrix(testY, y_predict)
print cnf_mtx

print classification_report(y_predict, testY)

[[16  4  0  0]
 [ 1 19 10  0]
 [ 0  7 24  0]
 [ 0  1  4 16]]
             precision    recall  f1-score   support

          0       0.80      0.94      0.86        17
          1       0.63      0.61      0.62        31
          2       0.77      0.63      0.70        38
          3       0.76      1.00      0.86        16

avg / total       0.73      0.74      0.73       102



In [100]:
from sklearn.metrics import accuracy_score

print 'Error Rate', 1 - accuracy_score(testY, y_predict)

Error Rate 0.264705882353


## Question 4. 
+ Is this error reliable? 
+ What could we do to make it better?

In [101]:
y_predict_full = model.predict(X)
model.score(X, y)

0.76877470355731226

In [103]:
cnf_mtx = confusion_matrix(y, y_predict_full)
print cnf_mtx

print classification_report(y_predict_full, y)

[[ 81  20   0   0]
 [ 21 102  27   0]
 [  2  24 120   7]
 [  0   5  11  86]]
             precision    recall  f1-score   support

          0       0.80      0.78      0.79       104
          1       0.68      0.68      0.68       151
          2       0.78      0.76      0.77       158
          3       0.84      0.92      0.88        93

avg / total       0.77      0.77      0.77       506



In [102]:
print 'Error Rate', 1 - accuracy_score(y, y_predict_full)

Error Rate 0.231225296443


<span style='font-size:1.5em; color:blue'>Based off of these 3 predictors, the algorithm on the entire dataset shows a 23% error rate, which isn't great. We will need to perform grid search to optimize our algorithm.</span>

## Question 5.  
+ Now use 10-fold cross-validation to choose the most efficient `k`

In [186]:
params = {
    'n_neighbors':range(2,30),
    'weights':['uniform', 'distance']
}
gs = grid_search.GridSearchCV(knn, params, cv=10, verbose=1)

In [189]:
gs.fit(trainX, trainY)

Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=1)]: Done 560 out of 560 | elapsed:    2.7s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

## Question 6.  

+ Explain your findings
+ What were your best parameters?
+ What was the best k?
+ What was the best model?

In [190]:
print 'best estimator: ', gs.best_estimator_
print 'best param: ', gs.best_params_
print 'best score: ', gs.best_score_

best estimator:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=18, p=2,
           weights='distance')
best param:  {'n_neighbors': 18, 'weights': 'distance'}
best score:  0.712871287129


## Question 7.  

+ Train your model with the optimal `k` you found above 
+ (don't worry if it changes from time to time - if that is the case use the one that is usually the best)

In [206]:
knn_best = neighbors.KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=18, p=2,
           weights='distance')

In [211]:
model_best = knn_best.fit(trainX, trainY)
predict_best = model_best.predict(testX)
print 'error: ', 1 - model_best.score(testX, testY)

error:  0.28431372549


In [213]:
predict_best_full = model_best.predict(X)
print 'error: ', 1 - model_best.score(X, y)

error:  0.0573122529644


In [214]:
print confusion_matrix(testY, predict_best)
print '=================================================='
print confusion_matrix(y, predict_best_full)

[[16  4  0  0]
 [ 2 17 11  0]
 [ 0  7 24  0]
 [ 0  1  4 16]]
[[ 97   4   0   0]
 [  2 137  11   0]
 [  0   7 146   0]
 [  0   1   4  97]]


In [215]:
print classification_report(testY, predict_best)
print '=================================================='
print classification_report(y, predict_best_full)

             precision    recall  f1-score   support

          0       0.89      0.80      0.84        20
          1       0.59      0.57      0.58        30
          2       0.62      0.77      0.69        31
          3       1.00      0.76      0.86        21

avg / total       0.74      0.72      0.72       102

             precision    recall  f1-score   support

          0       0.98      0.96      0.97       101
          1       0.92      0.91      0.92       150
          2       0.91      0.95      0.93       153
          3       1.00      0.95      0.97       102

avg / total       0.94      0.94      0.94       506



<span style='font-size:1.5em; color:blue'>The grid search performed much better with a 5.7% error rate on the full dataset</span>

## Question 8.  

+ After training your model with that `k`, 
+ use it to *predict* the class of a neighborhood with `RM = 2`, `PRATIO = 19`, and `LSTAT = 3.5`
+ If you are confused, check out the sklearn documentation for KNN

In [218]:
X.columns.tolist()

['RM', 'PTRATIO', 'LSTAT']

In [209]:
model_best.predict([2, 19, 3.5])



array([1])

- RM: average number of rooms per dwelling
- PTRATIO: pupil-teacher ratio by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes (in thousands of dollars)

<span style='font-size:1.5em; color:blue'>With 2 dwelling rooms, a 19:1 pupil:teacher ratio and 3.5% lower status predicts a quantile between 20-50% in median income.</span>