In [1]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.cross_validation import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler

from breast_cancer_functions import pick_best_features, how_many_features_do_we_want, Grid_Search_All

%matplotlib inline

In [2]:
conn = sqlite3.connect('breast_cancer.db')
c = conn.cursor()

df = pd.read_sql('''SELECT *
                    FROM cancer''', conn)

# this gets run when I'm done working for the session
conn.close()

In [3]:
# this is the X I will use
all_ = list(df.columns[2:])
X = df[all_]
X = X.assign(const=1)

# make the y out of the diagnosis column, this can be used for all of the dataframes
y = [1 if diag == 'M' else 0 for diag in df.diagnosis]

After running gridsearch on a bunch of different classifiers and expecting models like the SVM to perform better than little ol LogisticRegression and having it not, I decided to Scale the data around 0. This should help the SVM perform better.

In [4]:
# this scales the data around 0 so no one feature takes over
X[X.columns] = StandardScaler().fit_transform(X)

In [5]:
# check features for how useful they are, check my functions file for more indepth explanation
num_features_to_check = X.shape[1]
features_ranking = pick_best_features(X, y, num_features_to_check)

In [6]:
# see how many features I should use, check my functions file for more indepth explanation
results, features = how_many_features_do_we_want(features_ranking, X, y)

In [7]:
for key, value in results.iteritems():
    print key, value

1 [176, 348, 9, 36]
2 [196, 349, 8, 16]
3 [198, 344, 13, 14]
4 [195, 343, 14, 17]
5 [202, 348, 9, 10]
6 [201, 348, 9, 11]
7 [201, 350, 7, 11]
8 [201, 350, 7, 11]
9 [202, 351, 6, 10]
10 [203, 354, 3, 9]
11 [203, 354, 3, 9]
12 [203, 354, 3, 9]
13 [203, 354, 3, 9]
14 [203, 354, 3, 9]
15 [205, 354, 3, 7]
16 [206, 354, 3, 6]
17 [207, 354, 3, 5]
18 [207, 355, 2, 5]
19 [207, 354, 3, 5]
20 [207, 355, 2, 5]
21 [207, 355, 2, 5]
22 [207, 355, 2, 5]
23 [207, 355, 2, 5]
24 [207, 355, 2, 5]
25 [207, 355, 2, 5]
26 [207, 355, 2, 5]
27 [207, 355, 2, 5]
28 [207, 355, 2, 5]
29 [207, 355, 2, 5]
30 [207, 355, 2, 5]
31 [207, 355, 2, 5]


In the case of diagnosing breast cancer false positives are more acceptable than false negatives. If we incorrectly tell someone she has breast cancer we can test again to see if we got it wrong, she's scared for a bit but we don't send her away with cancer. If we incorrectly tell someone she doesn't have breast cancer and she trully does, she walks out thinking she is in the clear while the cancer may be getting worse, not okay.

With that said, looking at the results from testing LinearRegression models with different number of features it looks like it really stops improving at about the top 16 features. After that there is improvement but not much and I haven't even done a train test split or cross validation yet; I'm not modeling anything yet, just figureing out what features I want to use. Basically it's EDA without graphing anything.

So I am going to take the first 13 spots in features: `features[:16]`

In [367]:
# this is the X I will be working with
X = X[features[:16]]

# split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [127]:
# model with cross validation
model = LogisticRegression()
predicted = cross_val_predict(model, X_train, y_train, cv=5)

tp, tn, fp, fn = 0, 0, 0, 0
for num in zip(np.array(y_train),predicted):
    if num == (1,1):
        tp += 1
    elif num == (1,0):
        fn += 1
    elif num == (0,1):
        fp += 1
    elif num == (0,0):
        tn += 1
        
tp, tn, fp, fn

(161, 283, 4, 7)

In [163]:
model.fit(X_train,y_train)
pre = model.predict(X_test)

# tp, tn, fp, fn = 0, 0, 0, 0
# for num in zip(np.array(y_test),pre):
#     if num == (1,1):
#         tp += 1
#     elif num == (1,0):
#         fn += 1
#     elif num == (0,1):
#         fp += 1
#     elif num == (0,0):
#         tn += 1
        
# print (tp, tn, fp, fn)
from sklearn.metrics import precision_score, recall_score, f1_score

print 'Precision: {}'.format(precision_score(y_test, pre, average='binary'))
print 'Recall: {}'.format(recall_score(y_test, pre, average='binary'))
print 'F1 score: {}'.format(f1_score(y_test, pre))

Precision: 0.979166666667
Recall: 1.0
F1 score: 0.989473684211


Precision: tp / (tp + fp)

Recall: tp / (tp + fn)

F1 = 2(Precision * Recall) / (Precision + Recall)

So this wasn't a fluke, these 16 features are the ones I will use moving forward. 

features = 'area_worst',
 'concave_points_worst',
 'radius_worst',
 'radius_se',
 'texture_worst',
 'perimeter_worst',
 'concave_points_mean',
 'area_se',
 'concavity_worst',
 'compactness_se',
 'smoothness_worst',
 'area_mean',
 'perimeter_se',
 'compactness_mean',
 'symmetry_worst',
 'radius_mean'

I'm going to perform grid search on every classification model I can think of and compare them tomorrow.

My first run through of grid search through a bunch of different classification models will be, what's a good way to put it, 'shallow'. I will intentionally leave out some hyperparameters that can help a model perform better in the name of speed. Narrow down what model I want to use and then grid search that one with all of the hyperparameters I can think of and really dial it in.

In [31]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier

models = {'LogisticRegression':LogisticRegression(),
          'RandomForestClassifier':RandomForestClassifier(),
          'ExtraTreesClassifier':ExtraTreesClassifier(),
          'AdaBoostClassifier':AdaBoostClassifier(),
          'GradientBoostingClassifier':GradientBoostingClassifier(),
          'LinearSVC':LinearSVC(),
          'SVC':SVC(),
          'BaggingClassifier':BaggingClassifier(),
          'SGDClassifier':SGDClassifier()
         }

parameters = {'LogisticRegression':{'C':[0.01, 0.1, 1.0, 10.0, 100.0,1000.0]},
              'RandomForestClassifier':{'n_estimators':[16,32,64,128,256]},
              'ExtraTreesClassifier':{'n_estimators':[16,32,64,128,256]},
              'AdaBoostClassifier':{'n_estimators':[16,32,64,128, 150],'learning_rate':[0.5,0.8,1.0,1.2,1.5]},
              'GradientBoostingClassifier':{'n_estimators':[64,128,150,200],'learning_rate':[0.01,0.08,0.1,0.2,0.4]},
              'LinearSVC':{'C':[0.01, 0.05, 0.1, 0.5, 1.0, 10.0, 100.0,1000.0,10000.0]},
              'SVC':{'C':[0.5, 1.0, 10.0, 100.0,1000.0,10000.0]},
              'BaggingClassifier':{'n_estimators':[5, 10, 15, 20, 25, 30, 100, 150, 200]},
              'SGDClassifier':{'alpha':[0.000001, 0.00001, 0.0001, 0.001, 0.01]}
             }

In [32]:
first_grid_search = Grid_Search_All(models,parameters)
first_grid_search.fit(X_train, y_train)

Running GridSearchCV for LinearSVC's.
Running GridSearchCV for LogisticRegression's.
Running GridSearchCV for SGDClassifier's.
Running GridSearchCV for ExtraTreesClassifier's.
Running GridSearchCV for RandomForestClassifier's.
Running GridSearchCV for AdaBoostClassifier's.
Running GridSearchCV for SVC's.
Running GridSearchCV for GradientBoostingClassifier's.
Running GridSearchCV for BaggingClassifier's.


In [33]:
first_grid_search.score_summary()

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,alpha,learning_rate,n_estimators
6,LinearSVC,0.967391,0.984663,1,0.0110851,100,,,
7,LinearSVC,0.967391,0.984663,1,0.0110851,1000,,,
56,SVC,0.967391,0.984663,1,0.0110851,1,,,
14,LogisticRegression,0.978022,0.984639,1,0.00874383,1000,,,
12,LogisticRegression,0.978022,0.984639,1,0.00874383,10,,,
2,LinearSVC,0.978022,0.984639,1,0.00874383,0.1,,,
4,LinearSVC,0.978022,0.984639,1,0.00874383,1,,,
1,LinearSVC,0.978022,0.984639,1,0.00874383,0.05,,,
13,LogisticRegression,0.978022,0.984639,1,0.00874383,100,,,
11,LogisticRegression,0.978022,0.984639,1,0.00874383,1,,,


## It's looking like I'm going with LinearSVC(C=0.05)

### After running grid search on all of the classifiers above LinearSVC was the winner but what the penalty parameter 'C' should be still needs to be grid searched more.

#### A few notes, this is a small data set, less than 600. Because of this train test splits of the data make an obvious difference, there can be a large variance between different runs of train test splits, LinearSVC was not always the clear winner, it was the winner on average. In fact if I had gone with the first run though of grid search I would have been using LogisticRegression.

#### Because of this I need to think about how I want to implement my final model, as of right now I think it will be LinearSVC and then I will run a bunch of different random train test splits and then average the results...I'll keep thinking on it.

In [349]:
from sklearn.svm import LinearSVC

models2 = {'LinearSVC':LinearSVC()}

parameters2 = {'LinearSVC':{'C':[0.01, 0.05, 0.1],
                            'tol':[0.00001, 0.0001, 0.001, 0.01],
                            'max_iter':[500, 750, 1000, 1250, 1500]
                           }
              }
# 'C' default=1.0
# 'tol' default=0.0001
# 'max_iter' default=1000

In [368]:
second_grid_search = Grid_Search_All(models2,parameters2)
second_grid_search.fit(X_train, y_train)

Running GridSearchCV for LinearSVC's.


In [369]:
second_grid_search.score_summary()

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,max_iter,tol
0,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,500,1e-05
11,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1000,0.01
19,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1500,0.01
18,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1500,0.001
17,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1500,0.0001
16,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1500,1e-05
15,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1250,0.01
1,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,500,0.0001
13,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1250,0.0001
12,LinearSVC,0.966667,0.977973,0.98913,0.00993826,0.01,1250,1e-05
