# Top 10 algorithms & methods used by Data Scientists.

<img src="images/top10algorithms.png">
source: kdnuggets.com

# K-nearest Neighbors (knn)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier

#for validating your classification model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Loading data

In [None]:
# Read the training set into a DataFrame and preview its first rows.
df = pd.read_csv("data/heartattack_train.csv")
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(df.head())

# Data wrangling & ETL: Data cleaning & transformation

In [None]:
# Mapping / replacing: recode the target column's 'No'/'Yes' labels as the
# strings '0'/'1' (only values in that one column are touched).
target = '2nd_Heart_Attack'
for label, code in (('No', '0'), ('Yes', '1')):
    df = df.replace({target: label}, {target: code})

In [None]:
# Convert the target column from object (string) dtype to integers,
# then list every column's dtype to confirm the change.
df['2nd_Heart_Attack'] = df['2nd_Heart_Attack'].astype(int)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(df.dtypes)

# Exploratory data analysis

In [None]:
# Correlation analysis: pairwise Pearson correlation between the columns
# (Pearson is the pandas default, spelled out here for clarity).
df.corr(method='pearson')

In [None]:
# Draw the correlation matrix as an annotated heatmap on an 8x8 inch figure.
corr_matrix = df.corr()
plt.figure(figsize=(8, 8))
sns.heatmap(corr_matrix, annot=True)

# Model Building & Validation

# k-Nearest Neighbor (KNN) 
This algorithm is quite easy to understand. Let's suppose the training data contains two $y$ values: class A and class B. Every training data point belongs to either **class A (orange) or class B (blue)**. 

<img src="http://bdewilde.github.io/assets/images/2012-10-26-knn-concept.png">

When an unknown data point (**star**) is given, the KNN algorithm searches the feature space for the k training data points that are closest to it and assigns the class held by most of those neighbors. When k=3, the star's nearest neighbors are one orange circle and two blue circles, and thus it is classified as **blue**. At k=6, the star will be classified as **orange**.

In [None]:
# declare X variables and y variable




In [None]:
# evaluate the model by splitting into train and test sets & develop knn model (name it as knn)




In [None]:
# Model evaluation without validation





# Appendix 1: 10 fold cross validation

In [None]:
# evaluate the knn model using 10-fold cross-validation




# Appendix 2: Search for the optimal k value (GridSearch)

In [None]:
# Manual search for a good K: for every candidate k, fit a KNN classifier
# and record its mean accuracy over 10 cross-validation folds.
k_range = range(1, 10)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_accuracies = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    scores.append(fold_accuracies.mean())

# Plot the mean 10-fold CV accuracy (y-axis) against the k value (x-axis).
plt.figure()
plt.plot(k_range, scores)
plt.xlabel('k value')
plt.ylabel('accuracy')

In [None]:
# Automatic grid search for an optimal value of K.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20.
# Prefer its replacement and fall back only on older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

knn = KNeighborsClassifier()
# A concrete list keeps the parameter grid a plain sequence on Python 2 and 3.
k_range = list(range(1, 10))
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

# Check the results of the grid search: mean CV accuracy for each candidate k,
# in the same order as k_range.
if hasattr(grid, 'cv_results_'):
    grid_mean_scores = list(grid.cv_results_['mean_test_score'])
else:
    # Older scikit-learn only exposes grid_scores_, whose entries are
    # (parameters, mean score, per-fold scores) tuples.
    grid_mean_scores = [result[1] for result in grid.grid_scores_]
plt.figure()
plt.plot(k_range, grid_mean_scores)
plt.show()

In [None]:
# Report the winning configuration found by the grid search: its mean CV
# score, the chosen parameter values, and the corresponding estimator.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# develop a new knn model with k = 3




In [None]:
# Model evaluation without validation





# Appendix 3: Model Evaluation with ROC

In [None]:
# Generate ROC chart 







# Appendix 4. Comparison of Classification Algorithms

http://scikit-learn.org/stable/_images/sphx_glr_plot_classifier_comparison_001.png