# ML Classification using Breast Cancer Dataset

# Reading a file from Google Drive into a Colab Notebook

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# Authenticate the Colab user, then create the PyDrive client using the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the cells below use to list and download Drive files.
drive = GoogleDrive(gauth)

# List the files in the Google Drive data folder (Folder ID taken from the shareable link)

In [2]:
# Replace the folder ID inside the query below with the ID of the Google Drive
# folder that holds the data files.
# When the ID is correct, this cell prints the title and file ID of every
# non-trashed file in that folder.

file_list = drive.ListFile({'q': "'1oTFfdUNDEAXnO0c6pBhMeLP7nEixVfil' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: ML- Classification using Breast Cancer Dataset.ipynb, id: 1ESyXMXLQ0BIgeBK3UtBSDyHclFSNtwzL
title: wcbreast_wdbc.csv, id: 0B3iuBB_4YUyJMEpySzJ2SktWMlFTcHhJemtqdHYwRnA5ZXpB


# Read Breast Cancer Dataset (wcbreast_wdbc.csv) from Google Drive into Pandas

In [0]:
# Set file_id to the Google Drive file ID of wcbreast_wdbc.csv
# (the ID is printed by the folder listing above).

file_id='0B3iuBB_4YUyJMEpySzJ2SktWMlFTcHhJemtqdHYwRnA5ZXpB'
# Local filename the Drive file is saved under; pandas reads it back below.
file_name='wcbreast_wdbc.csv'

# Look the file up by ID and save its content locally as file_name.
train_downloaded = drive.CreateFile({'id':file_id })
train_downloaded.GetContentFile(file_name) 

In [4]:
import pandas as pd
# Load the downloaded CSV; the strings 'NA' and '?' are parsed as missing values.
data = pd.read_csv(file_name,na_values=['NA','?'])
# Preview the first five rows.
data.head()

Unnamed: 0,id,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Import required Libraries

In [0]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

# Helper Functions

In [0]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df` in place.

    One indicator column called "<name>-<value>" is appended for every
    distinct value found in the column, and the original column is then
    removed.  Nothing is returned; `df` itself is modified.
    """
    indicator_frame = pd.get_dummies(df[name])
    for category in indicator_frame.columns:
        df["{}-{}".format(name, category)] = indicator_frame[category]
    # The source column has been fully replaced by its indicator columns.
    df.drop(columns=name, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Append one 0/1 indicator column per entry of `target_values`.

    For each target value a column "<name>-<value>" is added to `df`; it holds
    1 where the string form of the original column equals the string form of
    the target value and 0 elsewhere.  The original column is kept.
    """
    for wanted in target_values:
        wanted_text = str(wanted)
        as_text = df[name].astype(str)
        flags = [int(str(cell) == wanted_text) for cell in as_text]
        df["{}-{}".format(name, wanted)] = flags


# Encode text values to integer indexes (i.e. [0],[1],[2] for blue,green,red; LabelEncoder numbers the sorted classes from 0).
def encode_text_index(df, name):
    """Replace column `name` of `df` in place with integer class indexes.

    The column is passed through a freshly fitted scikit-learn LabelEncoder.
    Returns the encoder's `classes_` array, i.e. the original labels.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_column = encoder.fit_transform(df[name])
    df[name] = encoded_column
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column `name` of `df` in place: (value - mean) / sd.

    `mean` and `sd` default to the column's own mean and standard deviation
    (pandas `Series.std`).  Pass them explicitly to reuse statistics computed
    on another frame.  Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing values of column `name` in `df` with the column median."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing values of column `name` in `df` with `default_value`."""
    filled_column = df[name].fillna(default_value)
    df[name] = filled_column


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split a DataFrame into float32 feature and target arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.  Every column except `target` is used as a feature.
    target : str
        Name of the target column.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray, both float32
        `x` has one column per feature column.  When the target dtype is
        int64 or int32 the task is treated as classification and `y` is
        one-hot encoded (one column per distinct target value); otherwise it
        is treated as regression and `y` has shape (n_rows, 1).
    """
    feature_columns = [column for column in df.columns if column != target]

    # If the target name is duplicated, df[target] is a DataFrame and .dtypes
    # is a Series; use the first column's dtype in that case.
    target_type = df[target].dtypes
    if isinstance(target_type, pd.Series):
        target_type = target_type.iloc[0]

    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the
    # equivalent accessor and works on both old and new pandas versions.
    features = df[feature_columns].values.astype(np.float32)

    # Encode to int for classification, float otherwise.
    if target_type in (np.int64, np.int32):
        # Classification: one indicator column per class.
        dummies = pd.get_dummies(df[target])
        return features, dummies.values.astype(np.float32)

    # Regression: keep the target as a single-column 2-D array.
    return features, df[[target]].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss".

    Hours are not padded, minutes are zero-padded to two digits, and seconds
    are zero-padded to five characters with two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    within_hour = sec_elapsed % seconds_per_hour
    minutes = int(within_hour / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected versus predicted values as two line series.

    `pred` and the flattened `y` are paired row by row.  When `sort` is true
    the pairs are ordered by the expected value before plotting.  The figure
    is shown with `plt.show()`; nothing is returned.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    expected_values = frame['y'].tolist()
    predicted_values = frame['pred'].tolist()
    plt.plot(expected_values,label='expected')
    plt.plot(predicted_values,label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is at least sd standard deviations away from its mean
def remove_outliers(df, name, sd):
    """Drop, in place, rows whose `name` value is an outlier.

    A row is dropped when the absolute distance of its value from the column
    mean is greater than or equal to `sd` times the column's standard
    deviation.  Nothing is returned; `df` itself is modified.
    """
    column = df[name]
    distance_from_mean = np.abs(column - column.mean())
    cutoff = sd * column.std()
    outlier_labels = df.index[distance_from_mean >= cutoff]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` becomes `normalized_low` and
    `data_high` becomes `normalized_high`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    name : str
        Column to rescale.
    normalized_low, normalized_high : number
        Bounds of the output range (defaults -1 and 1).
    data_low, data_high : number, optional
        Bounds of the input range.  Each one defaults independently to the
        column's minimum / maximum, so either bound can be supplied alone.

    Notes
    -----
    If `data_high` equals `data_low` (e.g. a constant column) the division by
    zero produces NaN/inf values.
    """
    # Default each bound on its own: previously data_high was filled in only
    # when data_low was None, so passing data_low alone raised a TypeError and
    # passing data_high alone was silently overwritten.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        


In [7]:
data.head()

Unnamed: 0,id,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
##pd.isnull(data)
data.isnull().values.any()

False

In [9]:
data.isnull().sum()

id                         0
diagnosis                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
se_radius                  0
se_texture                 0
se_perimeter               0
se_area                    0
se_smoothness              0
se_compactness             0
se_concavity               0
se_concave_points          0
se_symmetry                0
se_fractal_dimension       0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
dtype: int64

In [0]:
# Drop the 'id' column (presumably a record identifier rather than a feature).
# Rebinding the result with errors='ignore' keeps this cell safe to re-run:
# the in-place drop raised KeyError on a second run because 'id' was already gone.
data = data.drop('id', axis=1, errors='ignore')

In [11]:
data.head()

Unnamed: 0,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
data.shape

(569, 31)

In [0]:
# Drop outliers 
#print("Length before outliers dropped: {}".format(len(data)))
#remove_outliers(data,'label',2)
#print("Length after  outliers dropped: {}".format(len(data)))


In [13]:
# Outlier removal above is commented out, so the shape should be unchanged
data.shape

(569, 31)

In [14]:
data.columns

Index(['diagnosis', 'mean_radius', 'mean_texture', 'mean_perimeter',
       'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity',
       'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
       'se_radius', 'se_texture', 'se_perimeter', 'se_area', 'se_smoothness',
       'se_compactness', 'se_concavity', 'se_concave_points', 'se_symmetry',
       'se_fractal_dimension', 'worst_radius', 'worst_texture',
       'worst_perimeter', 'worst_area', 'worst_smoothness',
       'worst_compactness', 'worst_concavity', 'worst_concave_points',
       'worst_symmetry', 'worst_fractal_dimension'],
      dtype='object')

In [15]:
preprocess = True

# Every numeric feature is named "<statistic>_<measurement>".  Generating the
# names yields the same thirty columns, in the same order, as writing one
# encode_numeric_zscore call per column.
STATISTICS = ('mean', 'se', 'worst')
MEASUREMENTS = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry',
    'fractal_dimension',
)

if preprocess:
  # Replace the text labels in 'diagnosis' with integer class indexes.
  encode_text_index(data,'diagnosis')

  # NOTE(review): the z-scores use the mean/std of the whole dataset, i.e.
  # they are computed before the train/test split further down -- confirm
  # that this information leak into the test set is acceptable.
  for statistic in STATISTICS:
    for measurement in MEASUREMENTS:
      encode_numeric_zscore(data, '{}_{}'.format(statistic, measurement))

data.head()



Unnamed: 0,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,1,1.0961,-2.071512,1.268817,0.98351,1.567087,3.280628,2.650542,2.530249,2.215566,...,1.885031,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312
1,1,1.828212,-0.353322,1.684473,1.90703,-0.826235,-0.486643,-0.023825,0.547662,0.001391,...,1.80434,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.14662,1.086129,-0.243675,0.280943
2,1,1.578499,0.455786,1.565126,1.557513,0.941382,1.052,1.36228,2.03544,0.938859,...,1.510541,-0.023953,1.346291,1.455004,0.526944,1.08198,0.854222,1.953282,1.151242,0.201214
3,1,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,...,-0.281217,0.133866,-0.24972,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672
4,1,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,...,1.297434,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.61264,0.728618,-0.86759,-0.396751


# Explore Dataset after Normalization

In [0]:
# conventional way to import seaborn
import seaborn as sns;sns.set()

# allow plots to appear within the notebook
%matplotlib inline

import matplotlib.pyplot as plt

In [0]:
# visualize the relationship between the features using scatterplots
# g = sns.PairGrid(data)
# g = g.map(plt.scatter)

# Separate Features and Target objects (X,y)

In [0]:
# Split the frame into a feature array X and a target array y,
# using 'diagnosis' as the target column.
X,y = to_xy(data,'diagnosis')

In [19]:
y[0]

array([0., 1.], dtype=float32)

In [20]:
y[1]

array([0., 1.], dtype=float32)

In [21]:
# Build the float32 feature matrix X and the one-hot encoded target matrix y.

X,y = to_xy(data,'diagnosis')

print("Observe shape of y before ...")

print(X.shape,y.shape)

# Collapse each one-hot row to the index of its largest entry, turning y into
# a 1-D vector of 0/1 class labels.
y = np.argmax(y,axis=1)  # one-hot rows -> class index per row

print("Observe shape of y now ...")

print(X.shape,y.shape)

Observe shape of y before ...
(569, 30) (569, 2)
Observe shape of y now ...
(569, 30) (569,)


In [22]:
y[1]

1

# Split Dataset into Training and Test Data

In [23]:


# Split into train/test: 20% of the rows are held out as the test set, and
# random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Display the four shapes to confirm the split sizes.
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((455, 30), (455,), (114, 30), (114,))

In [24]:
y_train[0:5]

array([0, 1, 0, 0, 0])

# Import Algorithm

In [0]:
# import model
from sklearn.linear_model import LogisticRegression

# Instantiate Algorithm Class

In [0]:
# Instantiate a logistic-regression classifier; no arguments are passed, so
# the library defaults are used.
model = LogisticRegression()

# Fit Model

In [27]:
# Fit the model to the training data (learn the coefficients).
# Only the training split is used here; the test split is kept for evaluation.

model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict 

In [0]:

# Predict a class label for every row of the held-out test set.
y_pred = model.predict(X_test)

In [29]:
y_pred[0:5]

array([0, 1, 1, 0, 0])

# Evaluate

In [30]:
# Compare the predicted labels with the true test labels and report the accuracy score.
score = metrics.accuracy_score(y_test, y_pred)
print("Accuracy score: {}".format(score))


Accuracy score: 0.9736842105263158
