In [76]:
import pandas as pd
import numpy as np

feature_names =  ['Bi-Rads', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

tumor_data = pd.read_csv('MLCourse/mammographic_masses.data.txt', na_values=['?'], 
                          names = feature_names)

7. Attribute Information:
   1. BI-RADS assessment: 1 to 5 (ordinal)  
   2. Age: patient's age in years (integer)
   3. Shape: mass shape: round=1 oval=2 lobular=3 irregular=4 (nominal)
   4. Margin: mass margin: circumscribed=1 microlobulated=2 obscured=3 ill-defined=4 spiculated=5 (nominal)
   5. Density: mass density high=1 iso=2 low=3 fat-containing=4 (ordinal)
   6. Severity: benign=0 or malignant=1 (binominal)

In [77]:
tumor_data.head()

Unnamed: 0,Bi-Rads,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [78]:
tumor_data.describe()

Unnamed: 0,Bi-Rads,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [79]:
dropped_rows = tumor_data.dropna()

In [80]:
dropped_rows.describe()

Unnamed: 0,Bi-Rads,Age,Shape,Margin,Density,Severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



features = dropped_rows[feature_names].drop(['Severity', 'Bi-Rads'], axis=1).values

scaler = StandardScaler()
features = scaler.fit_transform(features)

labels = dropped_rows['Severity']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=420)


In [82]:
X_train

array([[-0.46252165,  0.98104077,  0.75770912,  0.24046607],
       [-1.07631393, -1.43517241,  0.75770912,  0.24046607],
       [ 0.56046548,  0.98104077,  0.75770912, -2.6107745 ],
       ...,
       [-0.66711907,  0.98104077,  0.75770912, -2.6107745 ],
       [-0.59891993,  0.98104077,  1.39618483,  0.24046607],
       [-1.55370792,  0.98104077,  0.75770912,  0.24046607]])

In [83]:
dropped_rows.corr()

Unnamed: 0,Bi-Rads,Age,Shape,Margin,Density,Severity
Bi-Rads,1.0,0.094623,0.18012,0.157771,0.028356,0.223826
Age,0.094623,1.0,0.380096,0.420913,0.052417,0.455216
Shape,0.18012,0.380096,1.0,0.738014,0.073969,0.564763
Margin,0.157771,0.420913,0.738014,1.0,0.12486,0.574498
Density,0.028356,0.052417,0.073969,0.12486,1.0,0.068651
Severity,0.223826,0.455216,0.564763,0.574498,0.068651,1.0


# Cross-Val setup

In [111]:
from sklearn.model_selection import cross_val_score

scores = {}

def cross_eval(clf, model_name):
    scores.update({model_name : cross_val_score(clf, X_train, y_train, cv=10)})
    return (scores[model_name].mean())

# Decision Trees

In [112]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [113]:
cross_eval(dt_clf, 'decision_tree')

0.7393260967887834

# Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
cross_eval(rf_clf, 'random_forest')

0.7693351424694709

# SVM

In [115]:
from sklearn.svm import SVC

svc_clf = SVC()
svc_clf.fit(X_train, y_train)
cross_eval(svc_clf, 'SVC')

0.7965174129353233

# KNN

In [117]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
cross_eval(knn_clf, 'KNN')

0.8010628674807778

In [125]:
from sklearn.model_selection import GridSearchCV

param = {'n_neighbors':[i for i in range(1, 101)]}

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param,
    cv=5,
    scoring='accuracy',
)

grid_result = grid_search.fit(X_train, y_train)
best_params = grid_result.best_params_

best_params

{'n_neighbors': 50}

In [126]:
knn_clf_50 = KNeighborsClassifier(n_neighbors=50)
knn_clf_50.fit(X_train, y_train)
cross_eval(knn_clf_50, 'KNN_50')

0.7950474898236093

# Naive Bayes

In [133]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

unscaled_X = scaler.inverse_transform(X_train)

mm_scaler = MinMaxScaler()
mm_scaled_X = mm_scaler.fit_transform(unscaled_X)

naive_clf = MultinomialNB()
naive_clf.fit(mm_scaled_X, y_train)

scores.update({'naive_bayes' : cross_val_score(naive_clf, mm_scaled_X, y_train, cv=10)})
scores['naive_bayes'].mean()

0.7800316598824062

# More SVM

In [156]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param = {'kernel':['linear','poly','rbf','sigmoid']}

grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param,
    cv=10,
)

grid_result = grid_search.fit(X_train, y_train)
best_params = grid_result.best_params_

In [157]:
best_params

{'kernel': 'linear'}

In [158]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

0.799502487562189 {'kernel': 'linear'}
0.7875621890547263 {'kernel': 'poly'}
0.7965174129353233 {'kernel': 'rbf'}
0.7319086386250565 {'kernel': 'sigmoid'}


# Logistic Regression

In [189]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)

cross_eval(log_clf, 'logistic')

0.7935549525101764

# Neural Networks

In [184]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop


def create_model():
    model = Sequential()
    model.add(Dense(64, activation='relu', kernel_initializer='normal', input_dim=4))
    model.add(Dropout(0.70))
    model.add(Dense(32, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.50))
    model.add(Dense(16, activation='relu', kernel_initializer='normal'))
    model.add(Dense(8, activation='relu', kernel_initializer='normal'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])
    
    return model

In [185]:
test_model = create_model()

history = test_model.fit(X_train, y_train,
                    batch_size=30,
                    epochs=50,
                    verbose=2,
                    validation_data=(X_test, y_test)
                   )

Epoch 1/50
23/23 - 0s - loss: 0.6881 - accuracy: 0.6265 - val_loss: 0.6746 - val_accuracy: 0.7530
Epoch 2/50
23/23 - 0s - loss: 0.6528 - accuracy: 0.6928 - val_loss: 0.6235 - val_accuracy: 0.7530
Epoch 3/50
23/23 - 0s - loss: 0.6173 - accuracy: 0.7470 - val_loss: 0.5987 - val_accuracy: 0.7651
Epoch 4/50
23/23 - 0s - loss: 0.5886 - accuracy: 0.7786 - val_loss: 0.5867 - val_accuracy: 0.7711
Epoch 5/50
23/23 - 0s - loss: 0.5896 - accuracy: 0.7937 - val_loss: 0.5735 - val_accuracy: 0.7771
Epoch 6/50
23/23 - 0s - loss: 0.5696 - accuracy: 0.8117 - val_loss: 0.5577 - val_accuracy: 0.7651
Epoch 7/50
23/23 - 0s - loss: 0.5521 - accuracy: 0.8087 - val_loss: 0.5377 - val_accuracy: 0.7831
Epoch 8/50
23/23 - 0s - loss: 0.5329 - accuracy: 0.8012 - val_loss: 0.5130 - val_accuracy: 0.7952
Epoch 9/50
23/23 - 0s - loss: 0.4958 - accuracy: 0.8087 - val_loss: 0.4865 - val_accuracy: 0.8012
Epoch 10/50
23/23 - 0s - loss: 0.4836 - accuracy: 0.8102 - val_loss: 0.4660 - val_accuracy: 0.8072
Epoch 11/50
23/23 -

In [186]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

estimator = KerasClassifier(build_fn=create_model, epochs=100, verbose=0)

cross_eval(estimator, 'neural_network')

0.8070782423019409

In [188]:
scores

{'decision_tree': array([0.68656716, 0.80597015, 0.7761194 , 0.7761194 , 0.74242424,
        0.65151515, 0.6969697 , 0.78787879, 0.71212121, 0.75757576]),
 'random_forest': array([0.73134328, 0.8358209 , 0.86567164, 0.80597015, 0.74242424,
        0.68181818, 0.71212121, 0.78787879, 0.77272727, 0.75757576]),
 'SVC': array([0.79104478, 0.8358209 , 0.82089552, 0.85074627, 0.6969697 ,
        0.81818182, 0.78787879, 0.74242424, 0.83333333, 0.78787879]),
 'KNN': array([0.76119403, 0.85074627, 0.79104478, 0.89552239, 0.74242424,
        0.8030303 , 0.75757576, 0.8030303 , 0.81818182, 0.78787879]),
 'KNN_50': array([0.80597015, 0.8358209 , 0.79104478, 0.8358209 , 0.72727273,
        0.8030303 , 0.74242424, 0.75757576, 0.83333333, 0.81818182]),
 'naive_bayes': array([0.7761194 , 0.7761194 , 0.80597015, 0.82089552, 0.71212121,
        0.8030303 , 0.74242424, 0.77272727, 0.78787879, 0.8030303 ]),
 'neural_network': array([0.76119405, 0.82089549, 0.86567163, 0.880597  , 0.71212119,
        0.818