# Big Data Final Assessment

Work on the *cancer_dataset.csv* file to produce predictive results on the state of the tumor.

Tasks to complete:

*See OneNote*

In [2]:
# Import all necessary modules

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.metrics import confusion_matrix, accuracy_score,\
precision_score, recall_score
from sklearn import tree, svm, ensemble
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from IPython.display import Image
import pydotplus 


sns.set(style = 'ticks', font_scale = 1.8)
RANDSEED = 42

np.random.seed(RANDSEED)

  from pandas.core import datetools


In [3]:
# Import dataset
data = pd.read_csv('cancer_dataset.csv')
data.head()


Unnamed: 0,id,diagnosis,diagnosis_bin,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
print "Number of null data points in each column: \n"
print "There should be no null data entries in any column"
print data.isnull().sum()

Number of null data points in each column: 

There should be no null data entries in any column
id                         0
diagnosis                  0
diagnosis_bin              0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave_points_worst       0
symme

In [5]:
# Convert diagnosis (B/M) to dummies (0/1 in seperate columns)
# data_clean = pd.get_dummies(data)
# data_clean.head()

We now use this clean and ready dataset to look at some potential correlations between different variables.

In [6]:
glm1 = smf.glm(formula = 'diagnosis_bin~radius_mean', data = data, family = sm.families.Binomial()).fit()
print glm1.summary()

                 Generalized Linear Model Regression Results                  
Dep. Variable:          diagnosis_bin   No. Observations:                  569
Model:                            GLM   Df Residuals:                      567
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -165.01
Date:                Sat, 24 Jun 2017   Deviance:                       330.01
Time:                        14:43:59   Pearson chi2:                     489.
No. Iterations:                     7                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     -15.2459      1.325    -11.509      0.000     -17.842     -12.649
radius_mean     1.0336      0.093     11.100     

In [7]:
for name in data.columns[3:]:
    print name

radius_mean
texture_mean
perimeter_mean
area_mean
smoothness_mean
compactness_mean
concavity_mean
concave_points_mean
symmetry_mean
fractal_dimension_mean
radius_se
texture_se
perimeter_se
area_se
smoothness_se
compactness_se
concavity_se
concave_points_se
symmetry_se
fractal_dimension_se
radius_worst
texture_worst
perimeter_worst
area_worst
smoothness_worst
compactness_worst
concavity_worst
concave_points_worst
symmetry_worst
fractal_dimension_worst


In [8]:
def compute_devs(variables, base_str, level, depth, results):
    
    
    for idx in range(0,len(variables)):
        
        if level == 0:
            form_str = base_str + item
        else:
            form_str = base_str + '+' + item
        
        glm1 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()
        
        print [form_str,glm1.deviance]
        results.append([form_str,glm1.deviance])
    
        if level != depth:
            level += 1
            variables.remove(item)
            compute_devs(variables, form_str, level, depth, results)
        else:
            level -= 1
            pass # find min deviance

    return results

        
def compute_lowest_dev(col_heads):
    base_str = 'diagnosis_bin~'
    level = 0
    
    variables = []
    for name in col_heads:
        variables.append(name)
    
    results = []
    results = compute_devs(variables, base_str, level=0, depth=1, results=results)
    print len(results)
    print results
        
        
        
        
col_heads = data.columns[3:]
base_str = 'diagnosis_bin~'
variables = []
for name in col_heads:
    variables.append(name)
for item in variables:
    form_str = base_str + item
    glm1 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()
#     print "%-25s" "%s" % (form_str,glm1.deviance)

    variables.remove(item) # remove current var from list
    for item2 in variables:
        form_str = form_str + "+" + item2
        glm2 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()
        print "%-25s" "%s" % (form_str,glm2.deviance)
    
    
#compute_lowest_dev( col_heads = data.columns[3:] )

# for item in column
# run glm
# record deviance (low is better)
# for item in data
# run again for every other item added
# if deviance increases, remove it

diagnosis_bin~radius_mean+texture_mean291.123306378
diagnosis_bin~radius_mean+texture_mean+perimeter_mean218.897460989
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean211.366717363
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean169.223176874
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean+compactness_mean169.180291607
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean155.961442564
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean149.238708777
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean+symmetry_mean146.776943854
diagnosis_bin~radius_mean+texture_mean+perimeter_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean+symmetry_mean+fractal_dimension_mean146.

  t = np.exp(-z)


diagnosis_bin~perimeter_mean+texture_mean+area_mean252.43999521
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean181.206214134
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean173.959983255
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean156.480223935
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean149.479315553
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean+symmetry_mean147.104831456
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean+symmetry_mean+fractal_dimension_mean146.436780082
diagnosis_bin~perimeter_mean+texture_mean+area_mean+smoothness_mean+compactness_mean+concavity_mean+concave_points_mean+symmetry_mean+fractal_dimension_mean+radius_se146.051381202
diagnosis_bin~

In [10]:
# index creator
'''
add to list 0 to 30
add to list 
'''
nums = []
def foo():
    for i in range(0,30):
        nums.append(i)
print nums

[]


In [11]:
def compute_devs(variables, base_str, level, depth, results):
    
    if level != depth:
        for item in variables:

            if level == 0:
                form_str = base_str + item
            else:
                form_str = base_str + '+' + item

            glm1 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()

            print [form_str,glm1.deviance]
            results.append([form_str,glm1.deviance])
        
        level += 1
        variables.remove(item)
        compute_devs(variables, form_str, level, depth, results)
    else:
        pass
    
    return results

        
def compute_lowest_dev(col_heads):
    base_str = 'diagnosis_bin~'
    level = 0
    
    variables = []
    for name in col_heads:
        variables.append(name)
    
    results = []
    results = compute_devs(variables, base_str, level=0, depth=2, results=results)
    print len(results)
    print results
        
        
        
        
# col_heads = data.columns[3:]
# base_str = 'diagnosis_bin~'
# variables = []
# for name in col_heads:
#     variables.append(name)
# for item in variables:
#     form_str = base_str + item
#     glm1 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()
# #     print "%-25s" "%s" % (form_str,glm1.deviance)

#     variables.remove(item) # remove current var from list
#     for item2 in variables:
#         form_str = form_str + "+" + item2
#         glm2 = smf.glm(formula = form_str, data = data, family = sm.families.Binomial()).fit()
#         print "%-25s" "%s" % (form_str,glm2.deviance)
    
    
compute_lowest_dev( col_heads = data.columns[3:] )

# for item in column
# run glm
# record deviance (low is better)
# for item in data
# run again for every other item added
# if deviance increases, remove it

['diagnosis_bin~radius_mean', 330.01084398756484]
['diagnosis_bin~texture_mean', 646.51912715335254]
['diagnosis_bin~perimeter_mean', 304.48439349326907]
['diagnosis_bin~area_mean', 325.65651114683459]
['diagnosis_bin~smoothness_mean', 673.94845655578297]
['diagnosis_bin~compactness_mean', 508.79191904240184]
['diagnosis_bin~concavity_mean', 383.22721594489536]
['diagnosis_bin~concave_points_mean', 258.92340741677572]
['diagnosis_bin~symmetry_mean', 686.79616952634956]
['diagnosis_bin~fractal_dimension_mean', 751.3459405512051]
['diagnosis_bin~radius_se', 480.64673027271238]
['diagnosis_bin~texture_se', 751.4006775989908]
['diagnosis_bin~perimeter_se', 472.82984384776779]
['diagnosis_bin~area_se', 359.50259177692715]
['diagnosis_bin~smoothness_se', 748.78826585432967]
['diagnosis_bin~compactness_se', 701.77473418661975]
['diagnosis_bin~concavity_se', 707.08633429396821]
['diagnosis_bin~concave_points_se', 646.00703847003092]
['diagnosis_bin~symmetry_se', 751.41574120907785]
['diagnosis

In [9]:
variables = []
for name in col_heads:
    variables.append(name)

k=0
results = []
joiner = '+'
for i in range(k,len(variables)):
    elements = variables[i]
    results.append(elements)
    for j in range(k+i+1,len(variables)):
        elements = [variables[i],variables[j]]
        fstr = joiner.join(elements)
        results.append(fstr)

print len(results)
print results
res1 = results

465
['radius_mean', 'radius_mean+texture_mean', 'radius_mean+perimeter_mean', 'radius_mean+area_mean', 'radius_mean+smoothness_mean', 'radius_mean+compactness_mean', 'radius_mean+concavity_mean', 'radius_mean+concave_points_mean', 'radius_mean+symmetry_mean', 'radius_mean+fractal_dimension_mean', 'radius_mean+radius_se', 'radius_mean+texture_se', 'radius_mean+perimeter_se', 'radius_mean+area_se', 'radius_mean+smoothness_se', 'radius_mean+compactness_se', 'radius_mean+concavity_se', 'radius_mean+concave_points_se', 'radius_mean+symmetry_se', 'radius_mean+fractal_dimension_se', 'radius_mean+radius_worst', 'radius_mean+texture_worst', 'radius_mean+perimeter_worst', 'radius_mean+area_worst', 'radius_mean+smoothness_worst', 'radius_mean+compactness_worst', 'radius_mean+concavity_worst', 'radius_mean+concave_points_worst', 'radius_mean+symmetry_worst', 'radius_mean+fractal_dimension_worst', 'texture_mean', 'texture_mean+perimeter_mean', 'texture_mean+area_mean', 'texture_mean+smoothness_mean

In [10]:

def foo(i,variables,depth):
    global level
    if level != depth:
        elements = variables[i]
        for j in range(i+1,len(variables)):
            elements = [variables[i],variables[j]]
            fstr = joiner.join(elements)
            results.append(fstr)
            level += 1
            foo(j,variables,depth)
    else:
        level -= 1

    
variables = []
for name in col_heads:
    variables.append(name)
level=1
depth = 2
results = []
joiner = '+'
for i in range(0,len(variables)):
    elements = variables[i]
    results.append(elements)
    foo(i,variables,depth)
    
print len(results)
print results
res2 = results

465
['radius_mean', 'radius_mean+texture_mean', 'radius_mean+perimeter_mean', 'radius_mean+area_mean', 'radius_mean+smoothness_mean', 'radius_mean+compactness_mean', 'radius_mean+concavity_mean', 'radius_mean+concave_points_mean', 'radius_mean+symmetry_mean', 'radius_mean+fractal_dimension_mean', 'radius_mean+radius_se', 'radius_mean+texture_se', 'radius_mean+perimeter_se', 'radius_mean+area_se', 'radius_mean+smoothness_se', 'radius_mean+compactness_se', 'radius_mean+concavity_se', 'radius_mean+concave_points_se', 'radius_mean+symmetry_se', 'radius_mean+fractal_dimension_se', 'radius_mean+radius_worst', 'radius_mean+texture_worst', 'radius_mean+perimeter_worst', 'radius_mean+area_worst', 'radius_mean+smoothness_worst', 'radius_mean+compactness_worst', 'radius_mean+concavity_worst', 'radius_mean+concave_points_worst', 'radius_mean+symmetry_worst', 'radius_mean+fractal_dimension_worst', 'texture_mean', 'texture_mean+perimeter_mean', 'texture_mean+area_mean', 'texture_mean+smoothness_mean

In [11]:
if res1 == res2:
    print "SUCCESS"
else:
    print "there was a problem"

SUCCESS


In [None]:

def foo(i,variables,depth, elements):
    level = len(elements)
    
    if level != depth:
        for j in range(i+1,len(variables)):
            new_ele = elements
            new_ele.append(variables[j])
            fstr = joiner.join(elements)
            results.append(fstr)
            foo(j,variables,depth,new_ele)
            
    else:
        pass

    
variables = []
for name in col_heads:
    variables.append(name)
depth = 3
results = []
joiner = '+'
for i in range(0,len(variables)):
    elements = variables[i]
    results.append(elements)
    elements = [variables[i]]
    foo(i,variables,depth,elements)
    
print len(results)
print results
res3 = results



#### 

In [12]:
variables = []
for name in col_heads:
    variables.append(name)

k=0
results = []
joiner = '+'
for i in range(k,len(variables)):
    elements = variables[i]
    results.append(elements)
    for j in range(i+1,len(variables)):
        elements = [variables[i],variables[j]]
        fstr = joiner.join(elements)
        results.append(fstr)
        for k in range(j+1,len(variables)):
            elements = [variables[i],variables[j],variables[k]]
            fstr = joiner.join(elements)
            results.append(fstr)
            for l in range(k+1,len(variables)):
                elements = [variables[i],variables[j],variables[k],variables[l]]
                fstr = joiner.join(elements)
                results.append(fstr)


print len(results)
print results
res4 = results


31930
['radius_mean', 'radius_mean+texture_mean', 'radius_mean+texture_mean+perimeter_mean', 'radius_mean+texture_mean+perimeter_mean+area_mean', 'radius_mean+texture_mean+perimeter_mean+smoothness_mean', 'radius_mean+texture_mean+perimeter_mean+compactness_mean', 'radius_mean+texture_mean+perimeter_mean+concavity_mean', 'radius_mean+texture_mean+perimeter_mean+concave_points_mean', 'radius_mean+texture_mean+perimeter_mean+symmetry_mean', 'radius_mean+texture_mean+perimeter_mean+fractal_dimension_mean', 'radius_mean+texture_mean+perimeter_mean+radius_se', 'radius_mean+texture_mean+perimeter_mean+texture_se', 'radius_mean+texture_mean+perimeter_mean+perimeter_se', 'radius_mean+texture_mean+perimeter_mean+area_se', 'radius_mean+texture_mean+perimeter_mean+smoothness_se', 'radius_mean+texture_mean+perimeter_mean+compactness_se', 'radius_mean+texture_mean+perimeter_mean+concavity_se', 'radius_mean+texture_mean+perimeter_mean+concave_points_se', 'radius_mean+texture_mean+perimeter_mean+symm

In [13]:
if res3 == res4:
    print "HOORAH"

NameError: name 'res3' is not defined

In [14]:
variables = []
for name in col_heads:
    variables.append(name)

k=0
results = []
joiner = '+'
for i in range(k,len(variables)):
    elements = variables[i]
    results.append(elements)
    for j in range(i+1,len(variables)):
        elements = [variables[i],variables[j]]
        fstr = joiner.join(elements)
        results.append(fstr)
        for k in range(j+1,len(variables)):
            elements = [variables[i],variables[j],variables[k]]
            fstr = joiner.join(elements)
            results.append(fstr)
            for l in range(k+1,len(variables)):
                elements = [variables[i],variables[j],variables[k],variables[l]]
                fstr = joiner.join(elements)
                results.append(fstr)


print len(results)


base_str = 'diagnosis_bin~'
devs = []
for suffix in results:
    formula = base_str+suffix
    glm = smf.glm(formula = formula, data = data, family = sm.families.Binomial()).fit()
    devs.append(glm.deviance)
    
print "Complete"

31930
Complete


In [15]:
devs
results

devs_sorted, forms_sorted = (list(t) for t in zip(*sorted(zip(devs, results))))

print "Complete"


Complete


In [16]:
for i in range(0,100):
    print "%-25s" "%s" % (devs_sorted[i],forms_sorted[i])

82.2911863344            radius_se+texture_worst+area_worst+concave_points_worst
82.3422101794            perimeter_mean+concave_points_mean+texture_worst+area_worst
82.6044838016            texture_mean+perimeter_mean+concave_points_mean+area_worst
83.0138167487            area_se+texture_worst+area_worst+concave_points_worst
83.4111987595            radius_mean+concave_points_mean+texture_worst+area_worst
84.4050011072            area_se+radius_worst+texture_worst+concave_points_worst
84.5602438041            radius_se+radius_worst+texture_worst+concave_points_worst
84.6983161579            radius_mean+texture_mean+concave_points_mean+area_worst
85.6311764281            area_mean+concave_points_mean+texture_worst+area_worst
87.2049522528            radius_se+texture_worst+perimeter_worst+smoothness_worst
87.4371615691            area_se+texture_worst+perimeter_worst+smoothness_worst
88.1053804603            texture_mean+area_mean+concave_points_mean+area_worst
88.2968776461          

In [18]:
base_str = 'diagnosis_bin~'
devs2 = []
for suffix in variables:
    formula = base_str+suffix
    glm = smf.glm(formula = formula, data = data, family = sm.families.Binomial()).fit()
    devs2.append(glm.deviance)
    
print "Complete"

devs2_sorted, vars_sorted = (list(t) for t in zip(*sorted(zip(devs2, variables))))


for i in range(0,30):
    print "%-25s" "%s" % (devs2_sorted[i],vars_sorted[i])

Complete
209.47994078             perimeter_worst
229.108516988            radius_worst
230.639323335            area_worst
250.450768147            concave_points_worst
258.923407417            concave_points_mean
304.484393493            perimeter_mean
325.656511147            area_mean
330.010843988            radius_mean
359.502591777            area_se
383.227215945            concavity_mean
437.69762011             concavity_worst
472.829843848            perimeter_se
480.646730273            radius_se
505.552950418            compactness_worst
508.791919042            compactness_mean
622.068225887            texture_worst
641.41675566             symmetry_worst
641.424953525            smoothness_worst
646.00703847             concave_points_se
646.519127153            texture_mean
673.948456556            smoothness_mean
686.796169526            symmetry_mean
689.389445032            fractal_dimension_worst
701.774734187            compactness_se
707.086334294            conca

In [29]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

print "DATA SPLIT BETWEEN B / M:\n"
print pd.value_counts(data['diagnosis'].values, sort=False)
print len(data['diagnosis'])
print " "

print "NEW DATA"
print pd.value_counts(train['diagnosis'].values, sort=False)
print len(train['diagnosis'])

formula = 'diagnosis_bin~radius_se+texture_worst+area_worst+concave_points_worst'
glm_best = smf.glm(formula = formula, data = data, family = sm.families.Binomial()).fit()
print glm_best.summary()

DATA SPLIT BETWEEN B / M:

B    357
M    212
dtype: int64
569
 
NEW DATA
B    286
M    169
dtype: int64
455
                 Generalized Linear Model Regression Results                  
Dep. Variable:          diagnosis_bin   No. Observations:                  569
Model:                            GLM   Df Residuals:                      564
Model Family:                Binomial   Df Model:                            4
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                    nan
Date:                Sat, 24 Jun 2017   Deviance:                       82.291
Time:                        16:08:13   Pearson chi2:                     255.
No. Iterations:                    10                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Int

In [22]:

# Note that labels = [1,0] ensures that are 'positive' class corresponds to Survived_bin = 1.
conf_mat = confusion_matrix(data['diagnosis_bin'], glm_best.predict()>0.5, labels = [1, 0])
print conf_mat

TP = float(conf_mat[0][0])
FN = float(conf_mat[0][1])
FP = float(conf_mat[1][0])
TN = float(conf_mat[1][1])

print 'Accuracy: %.3f' % ((TP+TN)/(TP+TN+FN+FP))
print 'Precision: %.3f' % (TP/(TP+FP))
print 'Recall: %.3f' % (TP/(TP+FN))

[[202  10]
 [  9 348]]
Accuracy: 0.967
Precision: 0.957
Recall: 0.953
