In [None]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.cross_validation import train_test_split, cross_val_predict
from breast_cancer_functions import pick_best_features, how_many_features_do_we_want
%matplotlib inline

In [None]:
# Load the whole `cancer` table into a dataframe. The connection is only needed
# for this one read, so it is closed right away -- even if the query fails.
conn = sqlite3.connect('breast_cancer.db')
try:
    df = pd.read_sql('''SELECT *
                    FROM cancer''', conn)
finally:
    conn.close()

In [None]:
# feature matrix: every column from the third one onward, plus a constant column of 1s
# (assumes the first two columns are not features -- TODO confirm against the table schema)
all_ = df.columns[2:].tolist()
X = df[all_].assign(const=1)

# binary target built from the diagnosis column: 'M' -> 1, anything else -> 0
y = [int(diag == 'M') for diag in df.diagnosis]

In [None]:
# rank the features by usefulness; every column of X (const included) is put up for ranking
# (the ranking logic lives in breast_cancer_functions, see that file for details)
num_features_to_check = len(X.columns)
features_ranking = pick_best_features(X, y, num_features_to_check)

In [None]:
# see how many features I should use; check my functions file for a more in-depth explanation
# `results` is printed in the next cell, `features` is sliced later to pick the final columns
results, features = how_many_features_do_we_want(features_ranking, X, y)

In [None]:
# print every entry of `results`, one "key value" pair per line
# (presumably a dict; iteration order is then arbitrary -- TODO confirm against the functions file)
for key, value in results.iteritems():
    print key, value

In the case of diagnosing breast cancer, false positives are more acceptable than false negatives. If we incorrectly tell someone she has breast cancer, we can test again to see if we got it wrong; she's scared for a bit, but we don't send her away with cancer. If we incorrectly tell someone she doesn't have breast cancer and she truly does, she walks out thinking she is in the clear while the cancer may be getting worse, which is not okay.

With that said, looking at the results from testing LinearRegression models with different numbers of features, it looks like performance really stops improving at about the top 13 features. After that there is improvement, but not much, and I haven't even done a train/test split or cross validation yet; I'm not modeling anything yet, just figuring out which features I want to use. Basically it's EDA without graphing anything.

So I am going to take the first 13 spots in features: `features[:13]`

In [None]:
# this is the X I will be working with: only the first 13 entries of `features`
X = X[features[:13]]

# split into training (80%) and testing (20%); the fixed random_state keeps the split,
# and every score computed from it, identical on each run of the notebook
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# cross validated predictions (5 folds) from a default logistic regression
model = LogisticRegression()
predicted = cross_val_predict(model, X_train, y_train, cv=5)

# confusion matrix cells, counted with boolean masks over (actual, predicted) labels
actual = np.array(y_train)
said_yes = predicted == 1
said_no = predicted == 0
tp = int(np.sum((actual == 1) & said_yes))
tn = int(np.sum((actual == 0) & said_no))
fp = int(np.sum((actual == 0) & said_yes))
fn = int(np.sum((actual == 1) & said_no))

tp, tn, fp, fn

So this wasn't a fluke, these 13 features are the ones I will use moving forward. 

features = 'concavity_worst',
   'concavity_mean',
   'const',
   'radius_worst',
   'radius_mean',
   'compactness_worst',
   'concave_points_worst',
   'concave_points_mean',
   'perimeter_se',
   'perimeter_worst',
   'area_se',
   'texture_se',
   'texture_worst'

I'm going to perform grid search on every classification model I can think of and compare them tomorrow.

My first run of grid search through a bunch of different classification models will be, for lack of a better word, 'shallow'. I will intentionally leave out some hyperparameters that could help a model perform better, in the name of speed. The plan is to narrow down which model I want to use, then grid search that one with all of the hyperparameters I can think of and really dial it in.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC

# one default-configured instance of every classifier in the comparison, keyed by name
models = dict(
    LogisticRegression=LogisticRegression(),
    RandomForestClassifier=RandomForestClassifier(),
    ExtraTreesClassifier=ExtraTreesClassifier(),
    AdaBoostClassifier=AdaBoostClassifier(),
    GradientBoostingClassifier=GradientBoostingClassifier(),
    LinearSVC=LinearSVC(),
    SVC=SVC(),
)

# the parameter grid searched for each classifier; the keys mirror the ones in `models`
parameters = dict(
    LogisticRegression=dict(C=[0.01, 0.1, 1.0, 10.0]),
    RandomForestClassifier=dict(n_estimators=[16, 32, 64, 128]),
    ExtraTreesClassifier=dict(n_estimators=[16, 32, 64, 128]),
    AdaBoostClassifier=dict(n_estimators=[16, 32, 64, 128],
                            learning_rate=[0.8, 1.0, 1.2]),
    GradientBoostingClassifier=dict(n_estimators=[64, 128, 150],
                                    learning_rate=[0.08, 0.1, 0.2]),
    LinearSVC=dict(C=[0.5, 1.0, 10.0, 100.0]),
    SVC=dict(C=[0.5, 1.0, 10.0, 100.0]),
)

In [None]:
from sklearn.grid_search import GridSearchCV

class Grid_Search_All(object):
    '''
    Runs one GridSearchCV per estimator and gathers the cross validation
    scores of every parameter combination into a single summary table.

    models = dict mapping a name to an estimator
    params = dict mapping the same names to the parameter grid that is
             handed to GridSearchCV for that estimator

    Raises ValueError if an estimator in models has no entry in params.
    '''
    def __init__(self, models, params):
        missing_params = set(models.keys()) - set(params.keys())
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % list(missing_params))

        self.models = models
        self.params = params
        # snapshot of the names, so the set of models to run is fixed at construction
        self.keys = list(models.keys())
        self.gridsearches = {}

    def fit(self, X, y, cv=5, pre_dispatch=4, refit=False, n_jobs=1):
        '''
        Fits all of the models with all of the parameter options
        using cross validation. The fitted GridSearchCV objects are
        stored in self.gridsearches under the model's name.

        cv = crossvalidation, default is 5
        pre_dispatch = how many jobs get dispatched at once during
                       parallel execution, default is 4; it only has an
                       effect when n_jobs is greater than 1
        refit = whether or not it will fit all data to best model from
                crossvalidation, default is False because I don't need
                it so it would waste time
        n_jobs = number of jobs run in parallel, default is 1 (no
                 parallelism); set it to the number of cores to use
        '''
        for model_name in self.keys:
            # single-argument print(...) behaves the same as the print statement
            print("Running GridSearchCV for {}'s.".format(model_name))

            grid_search = GridSearchCV(self.models[model_name],
                                       self.params[model_name],
                                       cv=cv, n_jobs=n_jobs,
                                       pre_dispatch=pre_dispatch, refit=refit)
            grid_search.fit(X, y)

            self.gridsearches[model_name] = grid_search

    def score_summary(self, sort_by='mean_score'):
        '''
        This builds and returns a pandas dataframe of the summary of all the
        different fits of the models and orders them by best performing
        in a category that you tell it to.

        sort_by = column to order by (descending), default is 'mean_score'

        The first columns are estimator, min_score, mean_score, max_score
        and std_score (taken over the cross validation folds); they are
        followed by one column per hyperparameter, which is NaN for the
        models that do not have that hyperparameter in their grid.

        Raises KeyError if a model has not been run through fit() yet.
        '''
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        records = []
        for name in self.keys:
            for gsc in self.gridsearches[name].grid_scores_:
                scores = gsc.cv_validation_scores
                record = dict(gsc.parameters)
                # summary fields are written last so they win over a hyperparameter
                # that happens to share one of their names
                record.update({'estimator': name,
                               'min_score': np.min(scores),
                               'max_score': np.max(scores),
                               'mean_score': np.mean(scores),
                               'std_score': np.std(scores)})
                records.append(record)

        if not records:
            # nothing was searched: hand back an empty table with the fixed columns
            return pd.DataFrame(columns=columns)

        summary = pd.DataFrame(records)

        # DataFrame.sort was replaced by sort_values in pandas 0.17 and later removed;
        # use whichever one this pandas version provides
        sort_frame = getattr(summary, 'sort_values', None)
        if sort_frame is None:
            sort_frame = summary.sort
        summary = sort_frame([sort_by], ascending=False)

        columns = columns + [col for col in summary.columns if col not in columns]
        return summary[columns]

In [None]:
# pair every estimator with its parameter grid; raises ValueError if a model has no grid
first_grid_search = Grid_Search_All(models,parameters)

In [None]:
# grid search every model on the training split only (fit defaults: cv=5, refit=False)
first_grid_search.fit(X_train, y_train)

In [None]:
# one row per model/parameter combination, ordered by mean_score (the default), best first
first_grid_search.score_summary()