In [1]:
import sys
sys.path.append("../")
from backpain_helper import BackpainHelper
from sklearn import datasets, neighbors, metrics,grid_search, model_selection,cross_validation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.io import sql
import sqlite3
%matplotlib inline




## Finding the best model

In this notebook, I ran my data through three models KNN, Random Trees, Logistic Regression, and Scaler Logistic Regression with normal cross validation and nested cross validation.  The best fit model was Scaler Logistic Regression with a best fit of 0.839.  The features with the most influence are degree_spondylolisthesis, pelvic_radius, sacral_slope, and pelvic_tilt.  The chart is at the end of the notebook.

I created a helper file to make things resuable and clearer to read in the notebooks.  You can find that here: [backpain_helper.py](https://github.com/nullpointer0x00/ds-sfb/blob/master/backpain/backpain_helper.py)

Also, I was able to find the proper way to extract the coef from a model inside a pipeline.  It was easier to read the properties once I put it in the PyCharms debugger.  

```
gs.best_estimator_.named_steps['model'].coef_
```

### Loading Dataframe from Helper file(this reads from a csv file and loads it into a sqlite database)

In [2]:
bh = BackpainHelper()
df = bh.get_spine_data()

columns = ['pelvic_incidence', 'pelvic_tilt','lumbar_lordosis_angle','sacral_slope','pelvic_radius','degree_spondylolisthesis','pelvic_slope','direct_tilt','thoracic_slope','cervical_tilt','sacrum_angle','scoliosis_slope']

  chunksize=chunksize, dtype=dtype)


### Best Score from range 1 - 50 of neighbors

In [3]:
x = [i for i in range(1, 50)]
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid={'n_neighbors': x, 'weights':['uniform', 'distance']},
    cv=kf
)
gs.fit(df[columns], df.classification)
print gs.best_estimator_
print gs.best_score_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
0.845161290323


### Random Forest with 5 neighbors and distance as a weight

In [4]:
model = RandomForestClassifier(n_estimators = 20)
    
model.fit(df[columns], df.classification)

kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid={'n_neighbors': [5], 'weights':['distance']},
    cv=kf
)

gs.fit(df[columns], df.classification)
# print gs.grid_scores_
print gs.best_score_

0.825806451613


## Logistic Regression with logspace -10 -> 10, 21 points

In [5]:
logistic = LogisticRegression()
logistic.fit(df[columns], df.classification)

alphas = np.logspace(-10,10,21)
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={'C': alphas},
    cv=kf
)
gs.fit(df[columns], df.classification)
print gs.grid_scores_
print gs.best_estimator_
print gs.best_score_

[mean: 0.67742, std: 0.04206, params: {'C': 1e-10}, mean: 0.67742, std: 0.04206, params: {'C': 1.0000000000000001e-09}, mean: 0.67742, std: 0.04206, params: {'C': 1e-08}, mean: 0.67742, std: 0.04206, params: {'C': 9.9999999999999995e-08}, mean: 0.67742, std: 0.04206, params: {'C': 9.9999999999999995e-07}, mean: 0.67419, std: 0.04493, params: {'C': 1.0000000000000001e-05}, mean: 0.76129, std: 0.06321, params: {'C': 0.0001}, mean: 0.80323, std: 0.02581, params: {'C': 0.001}, mean: 0.82258, std: 0.03817, params: {'C': 0.01}, mean: 0.83226, std: 0.03621, params: {'C': 0.10000000000000001}, mean: 0.83226, std: 0.03621, params: {'C': 1.0}, mean: 0.83548, std: 0.03290, params: {'C': 10.0}, mean: 0.84516, std: 0.02621, params: {'C': 100.0}, mean: 0.84516, std: 0.02621, params: {'C': 1000.0}, mean: 0.84194, std: 0.03128, params: {'C': 10000.0}, mean: 0.84516, std: 0.02621, params: {'C': 100000.0}, mean: 0.84516, std: 0.02621, params: {'C': 1000000.0}, mean: 0.84516, std: 0.02621, params: {'C': 

### KNN Classifier using nested grid search

In [6]:
df = bh.get_spine_data()
X_data = df[columns]
y_data = df.classification
model = neighbors.KNeighborsClassifier()
param_grid = {'n_neighbors': [5], 'weights': ['uniform']}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.82285048285


### Random Forest Classifier using nested grid search

In [7]:
model = RandomForestClassifier(n_estimators = 20)
param_grid = {}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.815576090576


### Logistic Regression using nested grid search and log space

In [8]:
model = LogisticRegression()
alphas = np.logspace(-10,10,21)
param_grid = {'C':alphas}
print bh.nested_cross_val(model, X_data, y_data, param_grid, 4, 50).mean()

0.836576756577


### Standard Scaler Logistic Regression Pipeline using grid search

In [10]:
scaler = StandardScaler()
logistic_model = LogisticRegression()
modeling_pipe = Pipeline([('scale',scaler),('model',logistic_model)])
modeling_pipe.set_params(model__C = 100)
print bh.nested_cross_val(modeling_pipe, X_data, y_data, {}, 4, 50).mean()

0.839094239094


### Pipeline(Scaler/Logistic Regression) is the best estimator, Determining the important features 

In [11]:
kf = cross_validation.KFold(len(df), n_folds = 5, shuffle=True)
gs = grid_search.GridSearchCV(
    estimator=modeling_pipe,
    param_grid={},
    cv=kf
)
gs.fit(df[columns], df.classification)
features = df[columns]
feature_importances = np.absolute(gs.best_estimator_.named_steps['model'].coef_)[0]
print feature_importances
features_df = pd.DataFrame({'Features': columns, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)
features_df.head(12)

[ 0.26409788  0.95837093  0.40408272  1.05369244  1.39278081  6.34164027
  0.06066534  0.11645368  0.17568931  0.14291768  0.05918608  0.10179986]


Unnamed: 0,Features,Importance Score
5,degree_spondylolisthesis,6.34164
4,pelvic_radius,1.392781
3,sacral_slope,1.053692
1,pelvic_tilt,0.958371
2,lumbar_lordosis_angle,0.404083
0,pelvic_incidence,0.264098
8,thoracic_slope,0.175689
9,cervical_tilt,0.142918
7,direct_tilt,0.116454
11,scoliosis_slope,0.1018


In [17]:
pca = PCA(n_components = .95, svd_solver = 'full')
X_digits_transf = pca.fit_transform(df[columns])
print X_digits_transf.shape
print X_digits_transf[1,:]

(310, 7)
[-37.41475462  18.98494214  12.7619265   -8.23505044  10.47546277
  -2.19129375   4.29116715]
