In [64]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [13]:
wdbc_names = ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal']

In [14]:
wdbc_columns = ['id', 'diagnosis']

for name in wdbc_names:
    wdbc_columns.append(name + '_mean')
    wdbc_columns.append(name + '_se')
    wdbc_columns.append(name + '_largest')

wdbc_columns

['id',
 'diagnosis',
 'radius_mean',
 'radius_se',
 'radius_largest',
 'texture_mean',
 'texture_se',
 'texture_largest',
 'perimeter_mean',
 'perimeter_se',
 'perimeter_largest',
 'area_mean',
 'area_se',
 'area_largest',
 'smoothness_mean',
 'smoothness_se',
 'smoothness_largest',
 'compactness_mean',
 'compactness_se',
 'compactness_largest',
 'concavity_mean',
 'concavity_se',
 'concavity_largest',
 'concave_points_mean',
 'concave_points_se',
 'concave_points_largest',
 'symmetry_mean',
 'symmetry_se',
 'symmetry_largest',
 'fractal_mean',
 'fractal_se',
 'fractal_largest']

In [16]:
wdbc = pd.read_csv('wdbc-data.csv', header=None)
wdbc.columns = wdbc_columns
wdbc.head()

Unnamed: 0,id,diagnosis,radius_mean,radius_se,radius_largest,texture_mean,texture_se,texture_largest,perimeter_mean,perimeter_se,...,concavity_largest,concave_points_mean,concave_points_se,concave_points_largest,symmetry_mean,symmetry_se,symmetry_largest,fractal_mean,fractal_se,fractal_largest
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [22]:
wdbc_features = wdbc.iloc[:, 2:]
wdbc_features.head()

Unnamed: 0,radius_mean,radius_se,radius_largest,texture_mean,texture_se,texture_largest,perimeter_mean,perimeter_se,perimeter_largest,area_mean,...,concavity_largest,concave_points_mean,concave_points_se,concave_points_largest,symmetry_mean,symmetry_se,symmetry_largest,fractal_mean,fractal_se,fractal_largest
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [77]:
wdbc_labels = wdbc['diagnosis']
wdbc_labels = pd.Series([1 if i == 'M' else 0 for i in wdbc_labels])
wdbc_labels.head()

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [78]:
knn = KNeighborsClassifier(n_neighbors=2)
dtree = DecisionTreeClassifier()
lsvm = LinearSVC()
nnet = MLPClassifier()
ada = AdaBoostClassifier()
rf = RandomForestClassifier()

In [94]:
cv = StratifiedKFold(n_splits=10, shuffle=True)
outcomes = {knn:{'accuracy':[], 'f1_score':[]}, 
            dtree:{'accuracy':[], 'f1_score':[]}, 
            lsvm:{'accuracy':[], 'f1_score':[]}, 
            nnet:{'accuracy':[], 'f1_score':[]}, 
            ada:{'accuracy':[], 'f1_score':[]}, 
            rf:{'accuracy':[], 'f1_score':[]}}

for clf in outcomes:
    print('Classifier:', clf)
    for train_index, test_index in cv.split(wdbc_features, wdbc_labels):
    #     print('Train:', train_index, 'Test:', test_index)

        X_train, X_test = wdbc_features.values[train_index], wdbc_features.values[test_index]
        y_train, y_test = wdbc_labels.values[train_index], wdbc_labels.values[test_index]

        clf.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        outcomes[clf]['accuracy'].append(accuracy_score(y_test, y_pred))
        outcomes[clf]['f1_score'].append(f1_score(y_test, y_pred))


Classifier: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')
Classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Classifier: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0

In [95]:
for clf in outcomes:
    print('Accuracy:', outcomes[clf]['accuracy'])
    print('F1_Score:', outcomes[clf]['f1_score'])

Accuracy: [0.9137931034482759, 0.9482758620689655, 0.8947368421052632, 0.9298245614035088, 0.8421052631578947, 0.9298245614035088, 0.9649122807017544, 0.9642857142857143, 0.9107142857142857, 0.8214285714285714]
F1_Score: [0.8780487804878049, 0.9268292682926829, 0.8500000000000001, 0.9, 0.742857142857143, 0.8947368421052632, 0.9500000000000001, 0.9500000000000001, 0.8717948717948718, 0.7058823529411765]
Accuracy: [0.896551724137931, 0.9137931034482759, 0.9473684210526315, 0.9122807017543859, 0.9298245614035088, 0.9473684210526315, 0.9122807017543859, 1.0, 0.9821428571428571, 0.9107142857142857]
F1_Score: [0.85, 0.8717948717948718, 0.923076923076923, 0.8648648648648648, 0.8947368421052632, 0.923076923076923, 0.8648648648648648, 1.0, 0.975609756097561, 0.8648648648648648]
Accuracy: [0.9310344827586207, 0.9310344827586207, 0.9298245614035088, 0.9298245614035088, 0.9298245614035088, 0.9122807017543859, 0.9473684210526315, 0.9285714285714286, 0.9821428571428571, 0.9285714285714286]
F1_Score:

In [96]:
classifiers = ['knn', 'dtree', 'lsvm', 'nnet', 'ada', 'rf']
avg_accuracy = {'knn':[], 'dtree':[], 'lsvm':[], 'nnet':[], 'ada':[], 'rf':[]}
avg_f1_score = {'knn':[], 'dtree':[], 'lsvm':[], 'nnet':[], 'ada':[], 'rf':[]}

for idx, clf in enumerate(outcomes):
    avg_accuracy[classifiers[idx]] = np.mean(outcomes[clf]['accuracy'])
    avg_f1_score[classifiers[idx]] = np.mean(outcomes[clf]['f1_score'])
    
print('Mean Accuracy:', avg_accuracy)
print('Mean F1_Score:', avg_f1_score)

Mean Accuracy: {'knn': 0.9119901045717743, 'dtree': 0.9352324777460895, 'lsvm': 0.93504774868205, 'nnet': 0.9349883329012185, 'ada': 0.9349537637196439, 'rf': 0.9349829314665976}
Mean F1_Score: {'knn': 0.8670149258478942, 'dtree': 0.9032889910746137, 'lsvm': 0.9041997659327571, 'nnet': 0.9033760050448112, 'ada': 0.9034325412181637, 'rf': 0.9038309104997166}
