In [None]:
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and only fall back to the old module on legacy installs.
try:
    from sklearn.model_selection import train_test_split, cross_val_predict
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_predict

from breast_cancer_functions import pick_best_features, how_many_features_do_we_want, Grid_Search_All

%matplotlib inline

In [None]:
# Read the entire `cancer` table from the local SQLite file into a DataFrame.
conn = sqlite3.connect('breast_cancer.db')
try:
    df = pd.read_sql('''SELECT *
                    FROM cancer''', conn)
finally:
    # release the database handle even if the query raises
    conn.close()

In [None]:
# Feature matrix: every column of df from the third one onward, plus a
# constant column named `const`.
# NOTE(review): presumably the two skipped columns are an id and the
# diagnosis label -- confirm against the table schema.
all_ = df.columns[2:].tolist()
X = df[all_].assign(const=1)

# Binary target shared by every later model: 1 where diagnosis is 'M', else 0
y = (df.diagnosis == 'M').astype(int).tolist()

After running grid search on a bunch of different classifiers, I expected models like the SVM to perform better than little ol' LogisticRegression, but they did not, so I decided to scale the data around 0. This should help the SVM perform better.

In [None]:
# Standardize the features (zero mean, unit variance) so no one feature takes
# over because of its scale.
# The `const` column is deliberately excluded: standardizing a constant column
# maps every value to 0, which would silently wipe out the intercept term.
# NOTE(review): the scaler is fit on every row, i.e. before any train/test
# split, so held-out rows influence the scaling -- confirm this is acceptable.
scaled_columns = [column for column in X.columns if column != 'const']
X[scaled_columns] = StandardScaler().fit_transform(X[scaled_columns])

In [None]:
# Ask the project helper to rank the features, considering all columns of X.
# The ranking logic itself is defined in breast_cancer_functions.
num_features_to_check = len(X.columns)
features_ranking = pick_best_features(X, y, num_features_to_check)

In [None]:
# Evaluate how the number of features used affects the outcome; the helper
# (defined in breast_cancer_functions) hands back two values, unpacked below.
feature_count_report = how_many_features_do_we_want(features_ranking, X, y)
results, features = feature_count_report

In [None]:
# Print one "key value" line per entry of `results`.
# Written so it runs unchanged on Python 2 and Python 3 (no print statement,
# no dict.iteritems).
# NOTE(review): assumes `results` is dict-like; its exact contents come from
# how_many_features_do_we_want.
for key, value in results.items():
    print('{} {}'.format(key, value))

In the case of diagnosing breast cancer, false positives are more acceptable than false negatives. If we incorrectly tell someone she has breast cancer, we can test again to see if we got it wrong; she's scared for a bit, but we don't send her away with cancer. If we incorrectly tell someone she doesn't have breast cancer and she truly does, she walks out thinking she is in the clear while the cancer may be getting worse, which is not okay.

With that said, looking at the results from testing LinearRegression models with different numbers of features, it looks like it really stops improving at about the top 16 features. After that there is improvement, but not much, and I haven't even done a train test split or cross validation yet; I'm not modeling anything yet, just figuring out what features I want to use. Basically it's EDA without graphing anything.

So I am going to take the first 16 spots in features: `features[:16]`

In [None]:
# Keep only the first 16 entries of `features` as model inputs.
X = X[features[:16]]

# Hold out 20% of the rows for testing. The split is seeded so that
# re-running the notebook reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import confusion_matrix

# 5-fold cross-validated predictions of a default LogisticRegression on the
# training split only.
model = LogisticRegression()
predicted = cross_val_predict(model, X_train, y_train, cv=5)

# With labels=[0, 1] the matrix is [[tn, fp], [fn, tp]], so ravel() yields the
# four counts in that order; tolist() turns them into plain Python ints.
tn, fp, fn, tp = confusion_matrix(y_train, predicted, labels=[0, 1]).ravel().tolist()

tp, tn, fp, fn

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Fit on the full training split, then score the held-out test split.
model.fit(X_train, y_train)
pre = model.predict(X_test)

# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print('Precision: {}'.format(precision_score(y_test, pre, average='binary')))
print('Recall: {}'.format(recall_score(y_test, pre, average='binary')))
print('F1 score: {}'.format(f1_score(y_test, pre)))

Precision: tp / (tp + fp)

Recall: tp / (tp + fn)

F1 = 2(Precision * Recall) / (Precision + Recall)

So this wasn't a fluke, these 16 features are the ones I will use moving forward. 

features = 'area_worst',
 'concave_points_worst',
 'radius_worst',
 'radius_se',
 'texture_worst',
 'perimeter_worst',
 'concave_points_mean',
 'area_se',
 'concavity_worst',
 'compactness_se',
 'smoothness_worst',
 'area_mean',
 'perimeter_se',
 'compactness_mean',
 'symmetry_worst',
 'radius_mean'

I'm going to perform grid search on every classification model I can think of and compare them tomorrow.

My first run through of grid search through a bunch of different classification models will be, what's a good way to put it, 'shallow'. I will intentionally leave out some hyperparameters that can help a model perform better in the name of speed. Narrow down what model I want to use and then grid search that one with all of the hyperparameters I can think of and really dial it in.

In [None]:
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC

# Tree counts tried for both forest-style ensembles: 16, 32, 64, 128, 256
forest_sizes = [2 ** exponent for exponent in range(4, 9)]

# One default-constructed estimator per candidate classifier, keyed by class name.
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LinearSVC': LinearSVC(),
    'SVC': SVC(),
    'BaggingClassifier': BaggingClassifier(),
    'SGDClassifier': SGDClassifier(),
}

# Hyperparameter grid for each estimator above; keys match those of `models`.
parameters = {
    'LogisticRegression': {'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
    'RandomForestClassifier': {'n_estimators': list(forest_sizes)},
    'ExtraTreesClassifier': {'n_estimators': list(forest_sizes)},
    'AdaBoostClassifier': {
        'n_estimators': [16, 32, 64, 128, 150],
        'learning_rate': [0.5, 0.8, 1.0, 1.2, 1.5],
    },
    'GradientBoostingClassifier': {
        'n_estimators': [64, 128, 150, 200],
        'learning_rate': [0.01, 0.08, 0.1, 0.2, 0.4],
    },
    'LinearSVC': {'C': [0.01, 0.05, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
    'SVC': {'C': [0.5, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
    'BaggingClassifier': {'n_estimators': [5, 10, 15, 20, 25, 30, 100, 150, 200]},
    'SGDClassifier': {'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01]},
}

In [None]:
# Build the search helper from the `models` / `parameters` dictionaries and fit
# it on the training split only.
# NOTE(review): Grid_Search_All is defined in breast_cancer_functions; how it
# searches and scores is not visible from this notebook.
first_grid_search = Grid_Search_All(models,parameters)
first_grid_search.fit(X_train, y_train)

In [None]:
# Show the summary of the first search as the cell output; its layout is
# determined by Grid_Search_All.score_summary in breast_cancer_functions.
first_grid_search.score_summary()

## It's looking like I'm going with LinearSVC(C=0.05)

### After running grid search on all of the classifiers above LinearSVC was the winner but what the penalty parameter 'C' should be still needs to be grid searched more.

#### A few notes: this is a small data set, fewer than 600 samples. Because of this, train test splits of the data make an obvious difference; there can be a large variance between different runs of train test splits. LinearSVC was not always the clear winner; it was the winner on average. In fact, if I had gone with the first run-through of grid search I would have been using LogisticRegression.

#### Because of this I need to think about how I want to implement my final model, as of right now I think it will be LinearSVC and then I will run a bunch of different random train test splits and then average the results...I'll keep thinking on it.

In [None]:
from sklearn.svm import LinearSVC

# Second pass: only LinearSVC is searched.
models2 = {'LinearSVC': LinearSVC()}

# Grid kept for the second pass: a single value of C.
parameters2 = {'LinearSVC': {'C': [0.01]}}

# For the record, the wider grid that was previously written here
# (kept commented out, not executed):
#   'C':        [0.01, 0.05, 0.1]
#   'tol':      [0.00001, 0.0001, 0.001, 0.01]
#   'max_iter': [500, 750, 1000, 1250, 1500]
#   'loss':     ['hinge', 'squared_hinge']

In [None]:
# Same procedure as the first search, restricted to the LinearSVC-only
# `models2` / `parameters2` dictionaries and fit on the training split.
# NOTE(review): Grid_Search_All is defined in breast_cancer_functions; its
# scoring is not visible from this notebook.
second_grid_search = Grid_Search_All(models2,parameters2)
second_grid_search.fit(X_train, y_train)

In [None]:
# Show the summary of the second search as the cell output; its layout is
# determined by Grid_Search_All.score_summary in breast_cancer_functions.
second_grid_search.score_summary()

#### So I started grid search for LinearSVC with a bunch of hyperparameters and was able to narrow it down to only needing one, 'C' = 0.01 gives the most consistently high results

## Starting final modeling

In [None]:
from sklearn.svm import LinearSVC

# Keep only the first 16 entries of `features` as model inputs
# (a no-op if X was already reduced to them).
X = X[features[:16]]

# Hold out 20% of the rows for testing; seeded so the reported metrics can be
# reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Final candidate: LinearSVC with C=0.01.
model = LinearSVC(C=0.01).fit(X_train, y_train)
predict = model.predict(X_test)

### Remember, I'm most concerned with False Negatives and Recall because Recall is about the false negatives.

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Score the LinearSVC class predictions on the held-out split.
# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print('Precision: {}'.format(precision_score(y_test, predict, average='binary')))
print('Recall: {}'.format(recall_score(y_test, predict, average='binary')))
print('F1 score: {}'.format(f1_score(y_test, predict)))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion counts for the default-threshold predictions on the test split.
# With labels=[0, 1] the matrix is [[tn, fp], [fn, tp]]; tolist() gives plain ints.
tn, fp, fn, tp = confusion_matrix(y_test, predict, labels=[0, 1]).ravel().tolist()

# Print the counts as one tuple; the extra parentheses keep the output the same
# on Python 2 and Python 3.
print((tp, tn, fp, fn))

### So...okay cool this works well. But I want to set my threshold at a different spot so that I skew the results away from False Negatives, only one problem, LinearSVC doesn't have a predict_proba method like LogisticRegression does, it only predicts the class, now what?

### Turns out that if you take a look at the code for sklearn on GitHub, LinearSVC's predict method calls self.decision_function(X) to make the class prediction. That function returns an array of scores; predict assigns a 1 wherever the score is greater than 0 and a 0 for everything else.

### So all I have to do is manually change the threshold from a 0 to a number less than zero and the False Negative rate should fall...Neat.

In [None]:
# Raw decision-function scores for the test rows.
scores = model.decision_function(X_test)
# Classify as 1 whenever the score exceeds -0.1 instead of the default 0.
# The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24.
indices = (scores > -0.1).astype(int)

In [None]:
# Metrics for the manually thresholded predictions.
# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print('Precision: {}'.format(precision_score(y_test, indices, average='binary')))
print('Recall: {}'.format(recall_score(y_test, indices, average='binary')))
print('F1 score: {}'.format(f1_score(y_test, indices)))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion counts for the manually thresholded predictions on the test split.
# With labels=[0, 1] the matrix is [[tn, fp], [fn, tp]]; tolist() gives plain ints.
tn, fp, fn, tp = confusion_matrix(y_test, indices, labels=[0, 1]).ravel().tolist()

# Print the counts as one tuple; the extra parentheses keep the output the same
# on Python 2 and Python 3.
print((tp, tn, fp, fn))

### Another grid search is in order to find the correct threshold for my tastes.

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score

X = X[features[:16]]

# Candidate cut-offs on the decision function: 0, -0.01, ..., -1.99
thresholds = np.arange(0, 2, 0.01) * -1
n_repeats = 100

# Neither the split nor the fitted model depends on the threshold, so fit once
# per repeat and reuse the held-out scores for every threshold. This avoids
# refitting 100 models for each of the 200 thresholds, and every threshold is
# judged on the same set of splits.
held_out = []
for seed in range(n_repeats):
    # split into training and testing; seeding by repeat index keeps the
    # sweep reproducible while still varying the split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # model
    model = LinearSVC(C=0.01).fit(X_train, y_train)
    held_out.append((y_test, model.decision_function(X_test)))

precision, recall, f1 = [], [], []
for threshold in thresholds:
    run_precision, run_recall, run_f1 = [], [], []
    for truth, scores in held_out:
        # builtin `int` replaces `np.int`, an alias removed in NumPy 1.24
        prediction = (scores > threshold).astype(int)

        run_precision.append(precision_score(truth, prediction, average='binary'))
        run_recall.append(recall_score(truth, prediction, average='binary'))
        run_f1.append(f1_score(truth, prediction))

    # average each metric over the repeats for this threshold
    precision.append(np.mean(run_precision))
    recall.append(np.mean(run_recall))
    f1.append(np.mean(run_f1))

# One row per threshold, in the same order as `thresholds`.
d = {'precision': precision, 'recall': recall, 'f1_score': f1}
results_df = pd.DataFrame(data=d)

In [None]:
# Plot the averaged metrics from results_df against the threshold magnitude.
# The x values are positive: 0.13 on this axis corresponds to an applied
# decision threshold of -0.13.
threshold_magnitudes = np.arange(0,2,0.01)

plt.figure(figsize=(10,5))
plt.plot(threshold_magnitudes, results_df.f1_score, label='F1 Score')
plt.plot(threshold_magnitudes, results_df.precision, label='Precision')
plt.plot(threshold_magnitudes, results_df.recall, label='Recall')
plt.xlabel('Threshold magnitude (applied threshold is the negative of this value)')
plt.ylabel('Mean score')
plt.legend()
# zoom in on the region where the curves differ
plt.xlim(0,0.6)
plt.ylim(0.8,)
plt.show()

### So -0.13 is the threshold I'm going to use.

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

X = X[features[:16]]

# Cut-off applied to the decision function (the default would be 0).
decision_threshold = -0.13
n_repeats = 100

TP, TN, FP, FN = [],[],[],[]
precision, recall, f1 = [],[],[]
for seed in range(n_repeats):
    # split into training and testing; seeding by repeat index keeps the runs
    # reproducible while still varying the split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # model
    model = LinearSVC(C=0.01).fit(X_train,y_train)
    scores = model.decision_function(X_test)
    # builtin `int` replaces `np.int`, an alias removed in NumPy 1.24
    prediction = (scores > decision_threshold).astype(int)

    # With labels=[0, 1] the matrix is [[tn, fp], [fn, tp]]; tolist() gives plain ints.
    tn, fp, fn, tp = confusion_matrix(y_test, prediction, labels=[0, 1]).ravel().tolist()

    TP.append(tp)
    TN.append(tn)
    FP.append(fp)
    FN.append(fn)
    precision.append(precision_score(y_test, prediction, average='binary'))
    recall.append(recall_score(y_test, prediction, average='binary'))
    f1.append(f1_score(y_test, prediction))

# One row per repeat: the four confusion counts plus the three metrics.
d = {'true_positive':TP,'true_negative':TN,'false_positive':FP,'false_negative':FN,
     'precision':precision,'recall':recall,'f1_score':f1}
outcome_df = pd.DataFrame(data=d)

In [None]:
# Summary of the repeated evaluation stored in outcome_df.
# `y_test` here is whatever the most recent split left behind; every split uses
# test_size=0.2, so its length is the per-run test-set size.
# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print('   Number of test cases: {}'.format(len(y_test)))
print('Average False Positives: {}'.format(outcome_df.false_positive.mean()))
print('Average False Negatives: {}'.format(outcome_df.false_negative.mean()))
print('Maximum False Negatives: {}'.format(outcome_df.false_negative.max()))
print('      Average Precision: {}'.format(outcome_df.precision.mean()))
print('         Average Recall: {}'.format(outcome_df.recall.mean()))
print('      Average F-1 Score: {}'.format(outcome_df.f1_score.mean()))

# Per-run confusion counts; the x position is the row index of outcome_df.
plt.figure(figsize=(10,5))
plt.plot(outcome_df.true_positive, label='True Positive')
plt.plot(outcome_df.true_negative, label='True Negative')
plt.plot(outcome_df.false_positive, label='False Positive')
plt.plot(outcome_df.false_negative, label='False Negative')
plt.xlabel('Run')
plt.ylabel('Number of test cases')
plt.legend()
plt.show()

### Features for this data set are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.

### The false negative rate of fine needle aspirations of lumps that can be felt is about 2-4%.

### My model has a false negative rate of about 1.1%.

# Now all that's left to do if I was going to put this into production would be to train the model on the entire data set and then have at it.