In [91]:
import sys
sys.path.append("../")
from backpain_helper import BackpainHelper
from sklearn import datasets, neighbors, metrics,grid_search, model_selection,cross_validation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.io import sql
import sqlite3
%matplotlib inline


In [92]:
bh = BackpainHelper()
df = bh.get_spine_data()

columns = ['pelvic_incidence', 'pelvic_tilt','lumbar_lordosis_angle','sacral_slope','pelvic_radius','degree_spondylolisthesis','pelvic_slope','direct_tilt','thoracic_slope','cervical_tilt','sacrum_angle','scoliosis_slope']

### Best Score from range 1 - 50 of neighbors

In [136]:
x = [i for i in range(1, 50)]
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid={'n_neighbors': x, 'weights':['uniform', 'distance']},
    cv=kf
)
gs.fit(df[columns], df.classification)
print gs.best_estimator_
print gs.best_score_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance')
0.841935483871


In [137]:
# from sklearn.tree import export_graphviz
# from os import system 
# def build_tree_image(model):
#     dotfile = open("tree.dot", 'w')
#     export_graphviz(model,
#                               out_file = dotfile,
#                               feature_names = columns)
#     dotfile.close()
#     system("dot -Tpng tree.dot -o tree.png")
    

model = DecisionTreeClassifier(
                max_depth = 16,
                min_samples_leaf = 5)

model.fit(df[columns], df.classification)
build_tree_image(model)

### Random Forest with 5 neighbors and distance as a weight

In [140]:
model = RandomForestClassifier(n_estimators = 20)
    
model.fit(df[columns], df.classification)

kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid={'n_neighbors': [5], 'weights':['distance']},
    cv=kf
)

gs.fit(df[columns], df.classification)
# print gs.grid_scores_
print gs.best_score_

0.81935483871


## Logistic Regression with logspace -10 -> 10, 21 points

In [142]:
logistic = LogisticRegression()
logistic.fit(df[columns], df.classification)

alphas = np.logspace(-10,10,21)
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={'C': alphas},
    cv=kf
)
gs.fit(df[columns], df.classification)
print gs.grid_scores_
print gs.best_estimator_
print gs.best_score_

[mean: 0.67742, std: 0.07967, params: {'C': 1e-10}, mean: 0.67742, std: 0.07967, params: {'C': 1.0000000000000001e-09}, mean: 0.67742, std: 0.07967, params: {'C': 1e-08}, mean: 0.67742, std: 0.07967, params: {'C': 9.9999999999999995e-08}, mean: 0.67742, std: 0.07967, params: {'C': 9.9999999999999995e-07}, mean: 0.67419, std: 0.08122, params: {'C': 1.0000000000000001e-05}, mean: 0.75484, std: 0.05141, params: {'C': 0.0001}, mean: 0.80645, std: 0.04997, params: {'C': 0.001}, mean: 0.82903, std: 0.04630, params: {'C': 0.01}, mean: 0.83226, std: 0.04630, params: {'C': 0.10000000000000001}, mean: 0.83871, std: 0.04785, params: {'C': 1.0}, mean: 0.83548, std: 0.04719, params: {'C': 10.0}, mean: 0.84516, std: 0.03321, params: {'C': 100.0}, mean: 0.84839, std: 0.03474, params: {'C': 1000.0}, mean: 0.84516, std: 0.03898, params: {'C': 10000.0}, mean: 0.84516, std: 0.03321, params: {'C': 100000.0}, mean: 0.84516, std: 0.03898, params: {'C': 1000000.0}, mean: 0.84516, std: 0.03321, params: {'C': 

### KNN Classifier using nested grid search

In [143]:
df = bh.get_spine_data()
X_data = df[columns]
y_data = df.classification
model = neighbors.KNeighborsClassifier()
param_grid = {'n_neighbors': [5], 'weights': ['uniform']}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.82285048285


### Random Forest Classifier using nested grid search

In [64]:
model = RandomForestClassifier(n_estimators = 20)
param_grid = {}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.812977022977


### Logistic Regression using nested grid search and log space

In [114]:
model = LogisticRegression()
alphas = np.logspace(-10,10,21)
param_grid = {'C':alphas}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.836576756577


### Standard Scaler Logistic Regression Pipeline using grid search

In [135]:
scaler = StandardScaler()
logistic_model = LogisticRegression()
modeling_pipe = Pipeline([('scale',scaler),('model',logistic_model)])
modeling_pipe.set_params(model__C = 1)
print bh.nested_cross_val(modeling_pipe, X_data, y_data, {}, 4, 50).mean()

0.83801032301


In [176]:
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=modeling_pipe,
    param_grid={},
    cv=kf
)
gs.fit(df[columns], df.classification)
# print gs.grid_scores_
# print gs.best_score_
# print gs.best_estimator_

print gs.get_params()

features = df[columns]
feature_importances = np.absolute(gs.best_estimator_.model.coef_)[0]
print feature_importances
features_df = pd.DataFrame({'Features': columns, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)
features_df.head(12)

{'estimator__scale__with_std': True, 'n_jobs': 1, 'verbose': 0, 'estimator__scale': StandardScaler(copy=True, with_mean=True, with_std=True), 'estimator__model__warm_start': False, 'estimator__steps': [('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))], 'param_grid': {}, 'cv': sklearn.cross_validation.KFold(n=310, n_folds=5, shuffle=True, random_state=None), 'scoring': None, 'estimator__model__penalty': 'l2', 'estimator__model__intercept_scaling': 1, 'estimator__model__random_state': None, 'estimator__model__class_weight': None, 'estimator__model__tol': 0.0001, 'fit_params': {}, 'estimator__scale__with_mean': True, 'refit': True, 'estimator__model__solver': 'liblinear', 'estimator__scale__copy': Tr

AttributeError: 'Pipeline' object has no attribute 'model'

In [132]:
pca = PCA(n_components = .95, svd_solver = 'full')
X_digits_transf = pca.fit_transform(X_train)
print X_digits_transf.shape
print X_digits_transf[1,:]

(207, 7)
[-20.58740471 -19.55593929  -1.11701999 -13.92992031  12.60374062
  -0.57947982   5.79299672]
