In [96]:
# Required Python Packages
%matplotlib inline
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

In [97]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

In [98]:
# Dataset File Paths on local machine
OUTPUT_PATH = "./breast-cancer-wisconsin.csv"

#Headers
Headers = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "CancerType"]

tf.logging.set_verbosity(tf.logging.ERROR)

In [99]:
def handle_missings(dataset, missing_vals, drop_cols = []): 
    dataset.replace(missing_vals, np.NaN, inplace=True)
    dataset.dropna(inplace=True)
    #print (dataset['BareNuclei'].loc[25:])
    #dataset.BareNuclei.hist(bins = 10)
    #dataset.ClumpThickness.value_counts().plot(kind='barh')
    return dataset

In [100]:
def data_file_to_csv():
    dataframe = pd.read_csv(url, delimiter = ",", names = Headers)
    #print (dataframe.describe())
    dataframe.to_csv(OUTPUT_PATH, index=False)
    #print (dataframe[dataframe.BareNuclei == "?"])

In [101]:
def train_input_fn(df, labels):
    return tf.estimator.inputs.pandas_input_fn(x = df, queue_capacity = 1000,
                                               y = labels, batch_size = 16, shuffle = True, num_epochs = 20)

In [102]:
def eval_input_fn(df, labels):
    return tf.estimator.inputs.pandas_input_fn(x = df, queue_capacity = 1000,
                                               y = labels, batch_size = 16, shuffle = True, num_epochs = 20)

In [103]:
data_file_to_csv()
dataset = pd.read_csv(OUTPUT_PATH)
dataset = handle_missings(dataset, '?', Headers[6])
#print (dataset['BareNuclei'].loc[23:])#.isna())#.isna())
feature_headers = Headers[1:-1]
target_label = Headers[-1]
dataset['BareNuclei'] = dataset['BareNuclei'].astype(np.int64)#.size())#str.find("?"))
lab_enc = preprocessing.LabelEncoder()
dataset[target_label] = lab_enc.fit_transform(dataset[target_label]) #Encode labels with value between 0 and n_classes-1
#print (dataset[target_label].unique())

feat_columns = [tf.feature_column.numeric_column(k) for k in feature_headers]

X_train, x_test, y_train, y_test = train_test_split(dataset[feature_headers], dataset[target_label], train_size=0.85)
print ("Train set features {} & label {} shapes".format(X_train.shape, y_train.shape))
print ("Test set features {} & labels {} shapes".format(x_test.shape, y_test.shape))

Train set features (580, 9) & label (580,) shapes
Test set features (103, 9) & labels (103,) shapes




In [104]:
def linear_classifier():
    model = tf.estimator.DNNClassifier(feature_columns=feat_columns, hidden_units=[32, 24, 6], model_dir = None, n_classes=2)
    trained_model = model.train(input_fn=train_input_fn(X_train, y_train), steps=1000)
    metrics = model.evaluate(input_fn=eval_input_fn(x_test, y_test), steps = 5)
    return metrics

In [105]:
def trees_classifier():
    model = tf.estimator.BoostedTreesClassifier(feature_columns=feat_columns,
                                          n_batches_per_layer=5, n_trees = 300)
    trained_model = model.train(input_fn=train_input_fn(X_train, y_train), max_steps=100)
    metrics = model.evaluate(input_fn=eval_input_fn(x_test, y_test), steps=4)
    return metrics

In [106]:
if __name__ == "__main__":
    metrics_linear = linear_classifier()
    print ("Pre-made DNNClassifier estimator results")
    for key in sorted(metrics_linear):
        print ('%s : %s' % (key, metrics_linear[key]))
    metrics_boosted = trees_classifier()
    print ("-----------------------------------------")
    print ("Pre-made Boosted Tree Classifier results")    
    for bkey in sorted(metrics_boosted):
        print ('%s %s' % (bkey, metrics_boosted[bkey]))
        

Pre-made DNNClassifier estimator results
accuracy : 1.0
accuracy_baseline : 0.65
auc : 0.99999994
auc_precision_recall : 0.99999994
average_loss : 0.038199257
global_step : 725
label/mean : 0.35
loss : 0.6111881
precision : 1.0
prediction/mean : 0.35063267
recall : 1.0
-----------------------------------------
Pre-made Boosted Tree Classifier results
accuracy 0.96875
accuracy_baseline 0.703125
auc 0.98362577
auc_precision_recall 0.9551789
average_loss 0.17750785
global_step 100
label/mean 0.296875
loss 0.17750785
precision 0.9047619
prediction/mean 0.33555895
recall 1.0
