## Preliminaries

In [1]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score


from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import svm




## Load Preprocessed data and target

In [52]:
# Training features (comma-separated text, already preprocessed)
X_train = np.loadtxt(fname='data_preproc_train.txt', delimiter=',')
# Training labels
y_train = np.loadtxt(fname='target_train.txt', delimiter=',')
# Test features, read the same way as the training features
X_test = np.loadtxt(fname='data_preproc_test.txt', delimiter=',')

## Create K-Nearest Neighbors Classifier

K-nearest neighbors classifier (KNN) is a simple and powerful classification learner.

KNN has three basic parts:

yi: The class of an observation (what we are trying to predict in the test data).

Xi: The predictors/IVs/attributes of an observation.

K: A positive integer specified by the researcher. K denotes the number of observations closest to a particular observation that define its “neighborhood”. For example, K=2 means that each observation has a neighborhood comprising the two other observations closest to it.

Imagine we have an observation where we know its independent variables Xtest but do not know its class ytest. The KNN learner finds the K other observations that are closest to Xtest and uses their known classes to assign a class to ytest.

### Train The Learner

This is our big moment. We train a KNN learner in which an observation’s neighborhood is its three closest neighbors. weights = 'uniform' can be thought of as the voting system used: uniform means that all neighbors get an equally weighted “vote” about an observation’s class, while weights = 'distance' would tell the learner to weight each neighbor’s “vote” by the inverse of its distance from the observation we are classifying.

### Train Model

In [53]:
# Build a k-nearest-neighbors classifier with k=3 and equally weighted votes
clf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
# Fit on the training data and keep the value returned by fit() as `model`
model = clf.fit(X=X_train, y=y_train)

### Predict Classes For The Test Data

In [54]:
# Predict a class label for every row of the test feature matrix
model.predict(X=X_test)

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       1., 1., 1., 0., 0.

### View Predicted Probabilities

In [55]:
# View the predicted class probabilities for the test rows.
# The probability matrix is computed once and reused; a bare predict_proba()
# call that is not the cell's last expression has its result discarded.
test_probabilities = model.predict_proba(X_test)

# Shape of the predicted label vector (one label per test row)
print(np.shape(model.predict(X_test)))
# One row per test observation, one column per class
print(test_probabilities)

(332,)
[[0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [1.         0.        ]
 [1.         0.        ]
 [0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [1.         0.        ]
 [0.33333333 0.66666667]
 [1.         0.        ]
 [0.66666667 0.33333333]
 [0.         1.        ]
 [0.66666667 0.33333333]
 [0.         1.        ]
 [0.         1.        ]
 [1.         0.        ]
 [0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [0.33333333 0.66666667]
 [0.         1.        ]
 [0.33333333 0.66666667]
 [0.         1.        ]
 [1.         0.        ]
 [0.         1.        ]
 [0.33333333 0.66666667]
 [0.66666667 0.33333333]
 [1.         0.        ]
 [1.         0.        ]
 [0.66666667 0.33333333]
 [0.33333333 0.66666667]
 [0.66666667 0.33333333]
 [0.66666667 0.33333333]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.         1.        ]
 [0.         1.        ]
 [1.         0.        ]
 [0.66666667 0.333

### View The Model’s Score
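How good is our trained model on its own training data? Note that this score is computed on the same observations the model was fit on, so it is an optimistic estimate; the cross-validation further below gives a fairer picture.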

In [56]:
# Accuracy on the training set, converted from a fraction to a percentage
train_accuracy_pct = model.score(X_train, y_train) * 100
print("Our model is %.2f%% accurate!" % (train_accuracy_pct,))

Our model is 86.69% accurate!


### Create Pipeline

In [57]:
# Reuse the KNN classifier configured earlier (`knn` is the same object as `clf`)
knn = clf

# Feature standardizer that forms the first pipeline step
standardizer = StandardScaler()

# Chain the two steps: standardize the features, then classify with KNN
pipeline = make_pipeline(standardizer, knn)

### Create k-Fold Cross-Validation

In [58]:
# 10-fold splitter with shuffling enabled and a fixed random_state
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

### Conduct k-Fold Cross-Validation

In [59]:
# Score the pipeline with k-fold cross-validation
cv_results = cross_val_score(
    estimator=pipeline,   # standardizer + KNN pipeline
    X=X_train,            # feature matrix
    y=y_train,            # target vector
    scoring="accuracy",   # evaluation metric
    cv=kf,                # cross-validation splitter
    n_jobs=-1,            # use all CPU cores
)

In [60]:
# Display the array of scores returned by cross_val_score (accuracy was requested)
cv_results

array([0.86111111, 0.83333333, 0.81944444, 0.76388889, 0.66197183,
       0.83098592, 0.77464789, 0.73239437, 0.8028169 , 0.77464789])

### Calculate Mean Performance Score

In [61]:
# Average the cross-validation scores into a single summary figure
np.mean(cv_results)

0.7855242566510172