In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, precision_score, recall_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style = 'ticks', font_scale = 1.8)
from sklearn.metrics import confusion_matrix, accuracy_score,\
precision_score, recall_score
from sklearn import tree, svm, ensemble
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from IPython.display import Image
import pydotplus 

  from pandas.core import datetools


In [3]:
# Read cancer_dataset.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv('cancer_dataset.csv')

In [15]:
def forward_selector_glm(data, response, threshold = 0.2):
    """Fit a binomial GLM whose predictors are chosen by forward selection.

    Selection starts with no predictors.  Each round fits one candidate
    model per unused column (the predictors selected so far plus that
    column, with an intercept) and picks the candidate with the lowest
    deviance.  The candidate is kept only if its deviance, inflated by
    ``threshold``, is still below the deviance of the current model;
    otherwise selection stops.  The first candidate is compared against an
    infinite starting score, so it is accepted whenever its deviance is
    finite.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    threshold: float, required relative improvement; a candidate is kept
               when ``deviance + deviance * threshold < current deviance``

    Returns:
    --------
    model: fitted statsmodels generalised linear model (Binomial family)
           with an intercept, built from the selected predictors
           (intercept-only when nothing was selected)

    selected: list of the selected column names, in the order they were added

    Raises:
    -------
    KeyError: if ``response`` is not one of ``data.columns``

    Notes:
    ------
    Column names are pasted verbatim into a formula string, so they
    presumably need to be valid formula terms -- TODO confirm for columns
    containing spaces or operators.

    Modified from: http://planspace.org/20150423-forward_selection_with_statsmodels/
    """
    def formula_for(predictors):
        # The intercept term '1' is joined in as the last term, so an empty
        # predictor list still gives a well-formed "response ~ 1".
        return "{} ~ {}".format(response, ' + '.join(predictors + ['1']))

    def fit(predictors):
        return smf.glm(formula_for(predictors), data,
                       family = sm.families.Binomial()).fit()

    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    # Start from +inf rather than a fixed number so the first comparison
    # does not depend on the scale of the deviance.
    current_score = float('inf')
    while remaining:
        # Tuples compare by deviance first, then by column name, which makes
        # the pick independent of set iteration order.
        best_new_score, best_candidate = min(
            (fit(selected + [candidate]).deviance, candidate)
            for candidate in remaining
        )
        improved = (best_new_score + best_new_score*threshold) < current_score  # Thresholding
        if not improved:
            # Explicit exit: also covers a candidate whose deviance equals
            # the current one, which must not be retried forever.
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    return fit(selected), selected


In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import tree 
# Search over 20 train/test splits: for each split, pick predictors by forward
# selection on the training part, cross-validate a shallow decision tree on
# the selected columns, and remember the best mean score per metric together
# with the predictor list that produced it.
METRICS = ('accuracy', 'precision', 'recall', 'f1')
best_scores = dict((metric, 0) for metric in METRICS)
best_params = dict((metric, []) for metric in METRICS)
best_scores_list = []

# `data` and `new_data` are deliberately bound to the same frame with the
# `diagnosis` column removed, so code reading either name sees identical
# columns.  Dropping `diagnosis` keeps it out of the selector's candidate
# columns; `diagnosis_bin` is the response.
data = pd.read_csv('cancer_dataset.csv').drop('diagnosis', axis=1)
new_data = data

for state in range(0, 20):
    train, test = train_test_split(new_data, test_size = 0.2, random_state = state)
    training, tlist = forward_selector_glm(train, 'diagnosis_bin')
    x_train = np.array(train[tlist])
    y_train = np.array(train['diagnosis_bin'])
    x_test = np.array(test[tlist])
    y_test = np.array(test['diagnosis_bin'])
    # Seeding the tree with the split's seed makes split tie-breaking
    # repeatable and identical across the four metrics.
    # NOTE(review): `min_impurity_split` is kept as written; check that the
    # installed scikit-learn version still accepts it.
    cancertree = tree.DecisionTreeClassifier(max_depth=2, min_impurity_split=0.2,
                                             random_state=state)
    # NOTE(review): the tree is cross-validated on the 20% hold-out only;
    # x_train / y_train are built above but never fitted here -- confirm
    # this is the intended evaluation.
    for metric in METRICS:
        mean_score = np.mean(
            cross_val_score(cancertree, x_test, y_test, cv=10, scoring=metric))
        if mean_score > best_scores[metric]:
            best_scores[metric] = mean_score
            best_params[metric] = tlist

# Publish the results under the flat names the reporting cell reads.
best_accuracy_scores = best_scores['accuracy']
best_accuracy_params = best_params['accuracy']
best_precision_scores = best_scores['precision']
best_precision_params = best_params['precision']
best_recall_scores = best_scores['recall']
best_recall_params = best_params['recall']
best_f1_scores = best_scores['f1']
best_f1_params = best_params['f1']

print("Complete")

Complete


In [21]:
# Report the best mean score and the matching predictor list for each metric.
# Single-argument print(...) with %s formatting emits the same text under
# Python 2 and Python 3.
for label, value in [
    ('best_accuracy_scores', best_accuracy_scores),
    ('best_accuracy_params', best_accuracy_params),
    ('best_precision_scores', best_precision_scores),
    ('best_precision_params', best_precision_params),
    ('best_recall_scores', best_recall_scores),
    ('best_recall_params', best_recall_params),
    ('best_f1_scores', best_f1_scores),
    ('best_f1_params', best_f1_params),
]:
    print('%s: %s' % (label, value))

best_accuracy_scores: 0.939242424242
best_accuracy_params: ['perimeter_worst', 'smoothness_worst', 'texture_mean']
best_precision_scores: 0.975
best_precision_params: ['perimeter_worst', 'smoothness_worst', 'texture_worst', 'radius_se']
best_recall_scores: 0.925
best_recall_params: ['perimeter_worst', 'smoothness_worst', 'texture_mean']
best_f1_scores: 0.927105672106
best_f1_params: ['perimeter_worst', 'smoothness_worst', 'texture_worst', 'area_se']
