# KNN Classification With Train Test Split
-----------------

## Step 1: Import Required Modules

In [4]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Step 2: Load and understand Data

In [2]:
# Load the Iris dataset bundled with scikit-learn; its "data" and "target"
# entries are pulled out in the next two cells.
iris = datasets.load_iris()

In [3]:
# Inputs handed to train_test_split / GridSearchCV in the later steps
X = iris["data"]

In [4]:
# Class labels; also used as the `stratify` argument when splitting
y = iris["target"]

## Step 3: Split Data for training and testing

In [5]:
# Hold out part of the data so the model is evaluated on rows it never saw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,    # 30% for testing -> train: 105 rows, test: 45 rows
    random_state=21,  # fixed seed so the same split is reproduced on every run
    stratify=y,       # keep the input class ratio (50:50:50) in train (35:35:35) and test (15:15:15)
)

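1. Did you understand stratify? Ans: `train_test_split` shuffles the rows (like shuffling a deck of playing cards) before splitting; `stratify=y` additionally makes sure every class appears in the training set and in the test set in the same proportion as in the full data, so the model is trained and tested equally on each class.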
2. Did you understand random_state? 
3. What is Balanced Data ?

## Step 4: Fit The Model

In [6]:
# k = 8 is a hand-picked starting value; Step 7 searches for a better k with GridSearchCV.
knn = KNeighborsClassifier(n_neighbors=8)

In [7]:
# Fit on the training split only; the estimator's repr is shown as the cell output.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                     weights='uniform')

## Step 5: Predict labels of test data

In [8]:
# Predict a class label for every row of the held-out test split.
y_pred = knn.predict(X_test)

In [9]:
# Build the message first (header line, then the predicted labels), then print it.
predictions_text = "Test set predictions:\n {}".format(y_pred)
print(predictions_text)

Test set predictions:
 [2 1 2 2 1 0 1 0 0 1 0 2 0 2 2 0 0 0 1 0 2 2 2 0 1 1 1 0 0 1 2 2 0 0 1 2 2
 1 1 2 1 1 0 2 1]


In [10]:
# True labels of the test split, shown for a side-by-side comparison with y_pred above.
y_test

array([2, 2, 2, 2, 1, 0, 1, 0, 0, 1, 0, 2, 0, 1, 2, 0, 0, 0, 1, 0, 2, 2,
       2, 0, 1, 1, 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 1, 0, 2,
       1])

In [11]:
# Count how many test rows were predicted as each class label (index = label).
np.bincount(y_pred)

array([15, 15, 15], dtype=int64)

## Step 6: Accuracy

In [12]:
# Compare the true test labels with the predictions made in Step 5.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9555555555555556

> **or**

In [13]:
# Alternative: let the classifier predict on X_test and score itself in one call;
# the printed value matches the accuracy_score result above.
print(knn.score(X_test, y_test))

0.9555555555555556


## Step 7: Hyper Parameter Tuning With GridSearchCV 
### GridSearchCV searches for the best `n_neighbors` (k) value

In [5]:
# Observe all parameters: IPython's trailing `?` displays the signature and docstring
KNeighborsClassifier?

In [1]:
from sklearn.model_selection import GridSearchCV

In [2]:
# IPython's trailing `??` displays the source code along with the docstring
GridSearchCV??

> In the exercise above we simply chose **n_neighbors=8**. How do you know which value is best? Is there a systematic way to find it?

In [19]:
# Candidate values of k to try: 1 through 99 (np.arange excludes the stop value)
param_grid = dict(n_neighbors=np.arange(1, 100))

In [20]:
# Cross-validated search (cv=5) over every candidate in param_grid, using knn as the base estimator
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

In [21]:
# Run the cross-validated search over every candidate in param_grid.
# NOTE(review): this fits on the full X, y, which includes the rows held out as
# X_test in Step 3 — consider fitting on X_train, y_train if the test split is
# meant to stay unseen during tuning.
knn_cv.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=8, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
        

In [None]:
# NOTE(review): repeats the `GridSearchCV??` source lookup from earlier in Step 7;
# looks like a leftover scratch cell — consider removing it.
GridSearchCV??

In [23]:
# Parameter setting from param_grid that scored best in the search
knn_cv.best_params_

{'n_neighbors': 6}

In [24]:
# Score achieved by that best parameter setting during the cross-validated search
knn_cv.best_score_

0.98

**Exercise 1: In the code above, replace `param_grid` with the following and observe the result**

```python 
param_grid = {'n_neighbors': np.arange(1, 50),
              'weights':['uniform','distance'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'] 
             }
```

**Exercise 2: Explore RandomizedSearchCV; it is similar to GridSearchCV**

In [None]:
# Starting point for Exercise 2: import the class and read its docstring via IPython's `?`
from sklearn.model_selection import RandomizedSearchCV
RandomizedSearchCV?

> **Some Questions**
1. What is an estimator?
2. When should you choose `brute`, `kd_tree` or `ball_tree` for the `algorithm` parameter?

In [None]:
# Scratch cell: IPython `??` shows the source of KNeighborsClassifier
# NOTE(review): not part of the walkthrough — consider removing before sharing.
KNeighborsClassifier??

In [None]:
# Scratch cell: IPython `??` shows the source of train_test_split
# NOTE(review): not part of the walkthrough — consider removing before sharing.
train_test_split??

In [None]:
# Scratch cell: IPython `??` introspection of the knn_cv grid-search object
# NOTE(review): not part of the walkthrough — consider removing before sharing.
knn_cv??

In [None]:
# NOTE(review): another repeat of the `GridSearchCV??` lookup from Step 7;
# leftover scratch cell — consider removing before sharing.
GridSearchCV??

In [1]:

from sklearn.model_selection import GridSearchCV

In [3]:
# NOTE(review): trailing repeat of the `GridSearchCV??` lookup (its import is
# re-run in the cell just above); leftover scratch cell — consider removing.
GridSearchCV??