In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import signal

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#for accuracy_score, classification_report and confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

# to make this notebook's output stable across runs
np.random.seed(42)

In [274]:
dataset = pd.read_csv("breast-cancer-wisconsin.csv")

In [275]:
print(dataset.describe())

       Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  \
count       699.000000               699.000000                699.000000   
mean          4.417740                 3.134478                  3.207439   
std           2.815741                 3.051459                  2.971913   
min           1.000000                 1.000000                  1.000000   
25%           2.000000                 1.000000                  1.000000   
50%           4.000000                 1.000000                  1.000000   
75%           6.000000                 5.000000                  5.000000   
max          10.000000                10.000000                 10.000000   

       Marginal Adhesion   Single Epithelial Cell Size  Bland Chromatin  \
count         699.000000                    699.000000       699.000000   
mean            2.806867                      3.216023         3.437768   
std             2.855379                      2.214300         2.438364   
min   

In [276]:
num_missing = (dataset[['Bare Nuclei']] == '?').sum()
print(num_missing)

Bare Nuclei    16
dtype: int64


In [277]:
num_missing = (dataset[['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', ' Single Epithelial Cell Size', 'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses']] == '?').sum()
print(num_missing)

Clump Thickness                  0
Uniformity of Cell Size          0
Uniformity of Cell Shape         0
Marginal Adhesion                0
 Single Epithelial Cell Size     0
Bare Nuclei                     16
Bland Chromatin                  0
Normal Nucleoli                  0
Mitoses                          0
dtype: int64


  res_values = method(rvalues)


In [278]:
# example of imputing missing values using scikit-learn
from numpy import nan
from numpy import isnan
from numpy import around

from sklearn.impute import SimpleImputer
# load the dataset

# mark zero values as missing or NaN
dataset = dataset.replace('?', nan)
dataset_attributes = dataset.iloc[:,0:9]
# dataset[['class']] = dataset[['class']].replace('class1', 0)
# dataset[['class']] = dataset[['class']].replace('class2', 1)
# retrieve the numpy array
# values = dataset.values
dataset_attributes.head(25)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1
5,8,10,10,8,7,10.0,9,7,1
6,1,1,1,1,2,10.0,3,1,1
7,2,1,2,1,2,1.0,3,1,1
8,2,1,1,1,2,1.0,1,1,5
9,4,2,1,1,2,1.0,2,1,1


In [279]:
num_missing = (dataset[['Bare Nuclei']] == '?').sum()
print(num_missing)

Bare Nuclei    0
dtype: int64


In [280]:
imputer = SimpleImputer(missing_values=nan, strategy='mean')
attributes = imputer.fit_transform(dataset_attributes)
print(attributes)

[[ 5.  1.  1. ...  3.  1.  1.]
 [ 5.  4.  4. ...  3.  2.  1.]
 [ 3.  1.  1. ...  3.  1.  1.]
 ...
 [ 5. 10. 10. ...  8. 10.  2.]
 [ 4.  8.  6. ... 10.  6.  1.]
 [ 4.  8.  8. ... 10.  4.  1.]]


In [281]:
from sklearn.preprocessing import MinMaxScaler

In [290]:
# define min max scaler
scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(attributes)
print(scaled)

[[0.44444444 0.         0.         ... 0.22222222 0.         0.        ]
 [0.44444444 0.33333333 0.33333333 ... 0.22222222 0.11111111 0.        ]
 [0.22222222 0.         0.         ... 0.22222222 0.         0.        ]
 ...
 [0.44444444 1.         1.         ... 0.77777778 1.         0.11111111]
 [0.33333333 0.77777778 0.55555556 ... 1.         0.55555556 0.        ]
 [0.33333333 0.77777778 0.77777778 ... 1.         0.33333333 0.        ]]


In [292]:
data_list = []
for i in scaled:
    print(i)
    temp = []
    for j in i:
        temp.append('%0.4f' % j)
    data_list.append(temp)
# print(data_list)
# print(scaled)

[0.44444444 0.         0.         0.         0.11111111 0.
 0.22222222 0.         0.        ]
[0.44444444 0.33333333 0.33333333 0.44444444 0.66666667 1.
 0.22222222 0.11111111 0.        ]
[0.22222222 0.         0.         0.         0.11111111 0.11111111
 0.22222222 0.         0.        ]
[0.55555556 0.77777778 0.77777778 0.         0.22222222 0.33333333
 0.22222222 0.66666667 0.        ]
[0.33333333 0.         0.         0.22222222 0.11111111 0.
 0.22222222 0.         0.        ]
[0.77777778 1.         1.         0.77777778 0.66666667 1.
 0.88888889 0.66666667 0.        ]
[0.         0.         0.         0.         0.11111111 1.
 0.22222222 0.         0.        ]
[0.11111111 0.         0.11111111 0.         0.11111111 0.
 0.22222222 0.         0.        ]
[0.11111111 0.         0.         0.         0.11111111 0.
 0.         0.         0.44444444]
[0.33333333 0.11111111 0.         0.         0.11111111 0.
 0.11111111 0.         0.        ]
[0.         0.         0.         0.        

In [284]:
# Get a list of all the classes
classes = dataset.iloc[:, -1].tolist()

In [285]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

labels = np.unique(classes)
print(labels)

lEnc = LabelEncoder()
lEnc.fit(labels)

label_encoder = lEnc.transform(classes)

numClass = len(labels)
label_encoder=label_encoder.astype(np.float64)

['class1' 'class2']


In [288]:
# dataset_normalised.to_csv("breast-cancer-wisconsin-normalised.csv")
# Print the preprocessed data
    
def P():
    print(len(label_encoder))
    for i in range(len(data_list)):
        for j in data_list[i]:
            print(j, end =',')
        if i < len(data_list) - 1:
            print(int(label_encoder[i]))
        else:
            print(int(label_encoder[i]), end='')
    

In [289]:
P()

699
0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0
0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0
0.4444,0.2222,0.2222,0.2222,0.1111,0.2222,0.3333,0.3333,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,0.2222,0.2222,0.0000,0.0000,0
0.7778,0.6667,0.4444,1.0000,0.6667,0.8889,0.4444,0.4444,0.3333,1
0.6667,0.3333,0.5556,

1.0000,1.0000,1.0000,0.0000,0.5556,0.0000,0.1111,0.7778,0.0000,1
0.3333,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.0000,0
0.3333,0.0000,0.2222,0.2222,0.1111,0.0000,0.0000,0.0000,0.0000,0
0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.0000,0
1.0000,0.3333,0.2222,1.0000,0.3333,1.0000,1.0000,0.0000,0.0000,1
0.4444,0.1111,0.1111,0.3333,0.1111,0.3333,0.0000,0.0000,0.0000,0
0.0000,0.0000,0.0000,0.2222,0.1111,0.2222,0.0000,0.0000,0.0000,0
0.0000,0.0000,0.0000,0.0000,0.1111,0.1111,0.0000,0.0000,0.0000,0
0.4444,0.0000,0.0000,0.5556,0.2222,0.0000,0.1111,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.0000,0
0.0000,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.0000,0
0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.0000,0
0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0
0.4444,0.6667,0.8889,0.7778,0.5556,1.0000,0.7778,1.0000,0.0000,1
0.3333,0.0000,0.0000,0.2222,0.0000,0.0000,0.1111,0.0000,0.0000,0
0.4444,0.0000,0.0000,0.00

In [187]:
# Read the config file
def parameter_file(filename):
    parameter_file = pd.read_csv(filename)
 
    # Convert parameters to a list:
    parameters = parameter_file.iloc[0].tolist()
    return parameters

In [188]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
# 10-fold cross-validation
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [192]:
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsClassifier


def kNNClassifier(X,y,K):
    
    neigh = KNeighborsClassifier(n_neighbors=K)
    scores = cross_val_score(neigh, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

K = parameter_file('paramNN.csv')
print(K)
scores, scores_mean = kNNClassifier(attributes, label_encoder, int(K[0]))

[3]
0.9642

In [193]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

def logregClassifier(X, y):
    
    lr = LogisticRegression(random_state=0)
    scores = cross_val_score(lr, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

scores, scores_mean = logregClassifier(attributes, label_encoder)    

0.9657

In [194]:
# Naïve Bayes
from sklearn.naive_bayes import GaussianNB

def nbClassifier(X, y):
    
    nb = GaussianNB()
    scores = cross_val_score(nb, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

scores, scores_mean = nbClassifier(attributes, label_encoder) 


0.9585

In [195]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

def dtClassifier(X, y):
    
    dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
    scores = cross_val_score(dt, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

scores, scores_mean = dtClassifier(attributes, label_encoder) 

0.9342

In [203]:
# Bagging
from sklearn.ensemble import BaggingClassifier

def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    
    bag = BaggingClassifier(DecisionTreeClassifier(max_depth=max_depth, criterion='entropy', random_state=0), 
                            n_estimators=n_estimators, max_samples=max_samples, random_state=0)
    scores = cross_val_score(bag, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

K = parameter_file('paramBAG.csv')
print(K)
scores, scores_mean = bagDTClassifier(attributes, label_encoder, K[0], K[1], K[2]) 


[100, 100, 2]
0.9599

In [205]:
# Ada Boost
from sklearn.ensemble import AdaBoostClassifier

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth,criterion='entropy', random_state=0), 
                                 n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(ada, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

K = parameter_file('paramADA.csv')
print(K)
scores, scores_mean = adaDTClassifier(attributes, label_encoder, int(K[0]), K[1], K[2]) 



[100.0, 0.2, 3.0]
0.9557

In [207]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

def gbClassifier(X, y, n_estimators, learning_rate):
    
    gb = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(gb, np.asarray(X, dtype='float64'), y, cv=cvKFold)
    
    print("{:.4f}".format(scores.mean()), end='')
    return scores, scores.mean()

K = parameter_file('paramGB.csv')
print(K)
scores, scores_mean = gbClassifier(attributes, label_encoder, int(K[0]), K[1]) 

[100.0, 0.2]
0.9614

In [208]:
# Linear SVM
from sklearn.svm import SVC

def bestLinClassifier(X,y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1, 10, 100]
    }
    
    grid_search = GridSearchCV(SVC(kernel="linear", random_state=0), 
                               param_grid, cv=cvKFold, return_train_score=True)
    grid_search.fit(X_train, y_train)
    
    print(grid_search.best_params_['C'])
    print(grid_search.best_params_['gamma'])
    
    print("{:.4f}".format(grid_search.best_score_))
    print("{:.4f}".format(grid_search.score(X_test, y_test)), end='')
    
bestLinClassifier(attributes, label_encoder)


0.01
0.001
0.9657
0.9714

In [210]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

def bestRFClassifier(X,y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    
    param_grid = {
        'n_estimators': [10, 30],
        'max_features': ['sqrt'],
        'max_leaf_nodes': [4, 16]
    }
    
    grid_search = GridSearchCV(RandomForestClassifier(random_state=0, criterion='entropy'), 
                               param_grid, cv=cvKFold, return_train_score=True)
    grid_search.fit(X_train, y_train)
    
    print(grid_search.best_params_['n_estimators'])
    print(grid_search.best_params_['max_features'])
    print(grid_search.best_params_['max_leaf_nodes'])
    
    print("{:.4f}".format(grid_search.best_score_))
    print("{:.4f}".format(grid_search.score(X_test, y_test)), end='')

bestRFClassifier(attributes, label_encoder)

30
sqrt
4
0.9675
0.9600

In [213]:
df = pd.DataFrame([attributes,label_encoder])
print(df)

                                                   0
0  [[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0]...
1  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...


  values = np.array([convert(v) for v in values])


5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,0
3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,0
6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,0
4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,0
8.0,10.0,10.0,8.0,7.0,10.0,9.0,7.0,1.0,1
1.0,1.0,1.0,1.0,2.0,10.0,3.0,1.0,1.0,0
2.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0,0
2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,5.0,0
4.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,0
2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
5.0,3.0,3.0,3.0,2.0,3.0,4.0,4.0,1.0,1
1.0,1.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,0
8.0,7.0,5.0,10.0,7.0,9.0,5.0,5.0,4.0,1
7.0,4.0,6.0,4.0,6.0,1.0,4.0,3.0,1.0,1
4.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
10.0,7.0,7.0,6.0,4.0,10.0,4.0,1.0,2.0,1
6.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
7.0,3.0,2.0,10.0,5.0,10.0,5.0,4.0,4.0,1
10.0,5.0,5.0,3.0,6.0,7.0,7.0,10.0,1.0,1
3.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
8.0,4.0,5.0,1.0,2.0,3.5446559297218156,7.0,3.0,1.0,1
1.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
5.0,2.0,3.0,4.0,2.0,7.0

2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0
2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0
3.0,3.0,2.0,2.0,3.0,1.0,1.0,2.0,3.0,0
7.0,6.0,6.0,3.0,2.0,10.0,7.0,1.0,1.0,1
5.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,1.0,0
2.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0
5.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,0
1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,0
10.0,8.0,7.0,4.0,3.0,10.0,7.0,9.0,1.0,1
3.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1.0,2.0,3.0,1.0,2.0,1.0,2.0,1.0,1.0,0
3.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
3.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
4.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0
3.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,0
1.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0,0
3.0,10.0,8.0,7.0,6.0,9.0,9.0,3.0,8.0,1
3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0
5.0,3.0,3.0,1.0,2.0,1.0,2.0,1.0,1.0,0
3.0,1.0,1.0,1.0,2.0,4.0,1.0,1.0,1.0,0
1.0,2.0,1.0,3.0,2.0,1.0,1.0,2.0,1.0,0
1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
4.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0
1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0
2.0,3.0,2.0,2.0,2.0,2.0,3.0,1.0,1.0,0
3.0,1.0,