# Mammographic Mass Model performance comparison

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
data = pd.read_csv("mass.txt")

In [6]:
data.head()

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


In [7]:
data = pd.read_csv("mass.txt" , na_values=["?"] , names = [ "BI-RADS assessment","Age","Shape","Margin","Density","Severity"])
data.head()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [8]:
data.describe()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [9]:
data.dropna(inplace=True)
data.describe()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [10]:
X = data[[ 'Age' , 'Shape', 'Margin','Density' ]].values
y = data['Severity'].values
features = [ 'Age ' , ' Shape', 'Margin','Density' ]
X

array([[67.,  3.,  5.,  3.],
       [58.,  4.,  5.,  3.],
       [28.,  1.,  1.,  3.],
       ...,
       [64.,  4.,  5.,  3.],
       [66.,  4.,  5.,  3.],
       [62.,  3.,  3.,  3.]])

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

splitting

In [12]:

from sklearn.model_selection import train_test_split
np.random.seed(1234)
X_train , X_test ,Y_train , Y_test = train_test_split(X,y,test_size = 0.25, random_state = 1)

Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier
dc = DecisionTreeClassifier(random_state=1)
dc.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [15]:
dc.score(X_test,Y_test)

0.7355769230769231

In [18]:
from sklearn.cross_validation import cross_val_score
cv = cross_val_score(dc, X,y,cv=10)
cv.mean()

0.7373556945552244

RandomForest classifer

In [20]:
from sklearn.ensemble import RandomForestClassifier
rc = RandomForestClassifier(n_estimators = 10 , random_state = 1)
RF = cross_val_score(rc, X,y,cv=10)
RF.mean()

0.7540496480696304

# SVM

In [26]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf',random_state=1, C=1.0)
SV = cross_val_score(svc, X,y,cv=10)
SV.mean()

0.8012023704574396

# KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
KN = cross_val_score(knn, X,y,cv=10)
KN.mean()

0.7854795488574507

# Neural Networks

In [27]:
import keras
from keras.models import Sequential
from keras.layers import Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [39]:
def create():
    model = Sequential()
    model.add(Dense( 6 , kernel_initializer='normal', activation ='relu', input_dim = 4))
    model.add(Dense( 1 , kernel_initializer='normal', activation ='sigmoid'))
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy' , metrics = ['accuracy'] )
    return model

In [40]:
from keras.wrappers.scikit_learn import KerasClassifier
estimate = KerasClassifier(build_fn = create , nb_epoch = 100 , verbose = 0)
score = cross_val_score(estimate, X,y,cv=10)
score.mean()

0.7710843362003924

# SVM would be model for selection based on accuracy of 80%