In [None]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
%matplotlib inline

In [None]:
# Load the entire `cancer` table from the local SQLite file into a DataFrame.
# The connection is only needed for this single query, so it is closed as soon
# as the read finishes -- the try/finally guarantees that happens even when the
# query raises, instead of leaving the database file open.
conn = sqlite3.connect('breast_cancer.db')
try:
    df = pd.read_sql('''SELECT *
                    FROM cancer''', conn)
finally:
    conn.close()

# quick look at the first two rows to sanity-check the load
df.head(2)

In [None]:
# Feature matrix: every column of df from the third one onward, plus a
# constant column named 'const'.
# NOTE(review): this assumes the first two columns are an identifier and
# `diagnosis` -- confirm against the table schema.
all_ = list(df.columns[2:])

# .assign() returns a new DataFrame, so 'const' is added to an independent
# frame rather than written into a selection taken from df (which is what
# raised SettingWithCopyWarning with the previous X.loc[:, 'const'] = 1).
X = df[all_].assign(const=1)

# Binary target built from the diagnosis column: 1 where diagnosis == 'M',
# 0 for everything else. It depends only on df, so it can be reused with any
# feature subset taken from the same rows.
y = [1 if diag == 'M' else 0 for diag in df.diagnosis]

In [None]:
# Estimator that RFE repeatedly refits to decide which features to drop.
model = LogisticRegression()

# Recursive feature elimination: keep the 5 best-ranked features.
# n_features_to_select is passed by keyword because current scikit-learn
# releases accept it as a keyword-only argument; the keyword form also works
# on older releases.
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X, y)

# One rank per column of X, in column order; per the scikit-learn docs the
# selected features are the ones with rank 1.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(rfe.ranking_)

In [None]:
# In-sample predictions: label the same rows of X that rfe was fit on.
# NOTE(review): no held-out data is involved here, so any counts computed from
# `predicted` presumably overstate performance on unseen patients -- consider
# evaluating on a separate test split.
predicted = rfe.predict(X)

In [None]:
def confusion_counts(actual, predicted_labels):
    """Count the four confusion-matrix cells for binary 0/1 labels.

    Parameters
    ----------
    actual : sequence of int
        True labels, where 1 is the positive class and 0 the negative class.
    predicted_labels : sequence of int
        Predicted labels (0 or 1), aligned element-for-element with `actual`.

    Returns
    -------
    tuple of int
        (tp, tn, fp, fn) as plain Python ints.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(actual)
    predicted_labels = np.asarray(predicted_labels)
    if actual.shape != predicted_labels.shape:
        raise ValueError('actual and predicted_labels must have the same shape')

    actual_pos = actual == 1
    actual_neg = actual == 0
    pred_pos = predicted_labels == 1
    pred_neg = predicted_labels == 0

    # Only exact 0/1 pairs are counted; any other label value is ignored.
    tp = int(np.sum(actual_pos & pred_pos))
    tn = int(np.sum(actual_neg & pred_neg))
    fp = int(np.sum(actual_neg & pred_pos))
    fn = int(np.sum(actual_pos & pred_neg))
    return tp, tn, fp, fn


tp, tn, fp, fn = confusion_counts(y, predicted)

# print(...) with a single argument behaves the same on Python 2 and 3.
print('Threshold = .5')
print('True Positive: {}'.format(tp))
print('True Negative: {}'.format(tn))
print('False Positive: {}'.format(fp))
print('False Negative: {}'.format(fn))

In the case of diagnosing breast cancer, false positives are more acceptable than false negatives. If we incorrectly tell someone she has breast cancer, we can test again to see whether we got it wrong; she is scared for a while, but we don't send her away with undetected cancer. If we incorrectly tell someone she doesn't have breast cancer when she truly does, she walks out thinking she is in the clear while the cancer may be getting worse, which is not acceptable. To reduce false negatives, I'm going to try threshold values below 0.5.