In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

data = pd.read_csv('mammographic_masses.data.txt', na_values=['?'], names=['BI_RADS', 'age', 'shape', 'margin', 'density', 'severity'])
data.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [71]:
print(data.describe())
data.dropna(inplace=True)
print(data.describe())


          BI_RADS         age       shape      margin     density    severity
count  959.000000  956.000000  930.000000  913.000000  885.000000  961.000000
mean     4.348279   55.487448    2.721505    2.796276    2.910734    0.463059
std      1.783031   14.480131    1.242792    1.566546    0.380444    0.498893
min      0.000000   18.000000    1.000000    1.000000    1.000000    0.000000
25%      4.000000   45.000000    2.000000    1.000000    3.000000    0.000000
50%      4.000000   57.000000    3.000000    3.000000    3.000000    0.000000
75%      5.000000   66.000000    4.000000    4.000000    3.000000    1.000000
max     55.000000   96.000000    4.000000    5.000000    4.000000    1.000000
          BI_RADS         age       shape      margin     density    severity
count  830.000000  830.000000  830.000000  830.000000  830.000000  830.000000
mean     4.393976   55.781928    2.781928    2.813253    2.915663    0.485542
std      1.888371   14.671782    1.242361    1.567175    0.35093

In [72]:
X = data[['age', 'shape', 'margin', 'density']].values
Y = data['severity'].values
paramNames = ['age', 'shape', 'margin', 'density']

In [73]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
XScaled = scaler.fit_transform(X)
XScaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]],
      shape=(830, 4))

In [74]:
from sklearn.model_selection import train_test_split
np.random.seed(1234)

trainX, testX, trainY, testY = train_test_split(XScaled, Y, train_size=0.75, random_state=1)

Logistic Regression

In [75]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
score = cross_val_score(clf, XScaled, Y, cv=10)
score.mean()

np.float64(0.8072289156626505)

Desicion Tree

In [76]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=1)
clf.fit(trainX, trainY)
clf.score(testX, testY)

0.7355769230769231

In [77]:

clf = DecisionTreeClassifier(random_state=1)
cv_score = cross_val_score(clf, XScaled, Y, cv=10)
cv_score.mean()

np.float64(0.7373493975903613)

RandomForest

In [78]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, random_state=1)
score = cross_val_score(clf, XScaled, Y, cv=10)
cv_score.mean()

np.float64(0.7373493975903613)

KNN

In [79]:
from sklearn import neighbors

for n in range(1,21):
    clf = neighbors.KNeighborsClassifier(n_neighbors=n)
    score = cross_val_score(clf,XScaled,Y,cv=10)
    print(n, score.mean())


1 0.7228915662650601
2 0.6855421686746987
3 0.7530120481927711
4 0.7385542168674699
5 0.7783132530120482
6 0.7650602409638554
7 0.7975903614457832
8 0.7819277108433734
9 0.7927710843373493
10 0.7927710843373494
11 0.7951807228915662
12 0.7843373493975905
13 0.7843373493975904
14 0.7855421686746988
15 0.7855421686746988
16 0.7831325301204819
17 0.7867469879518072
18 0.7783132530120482
19 0.7855421686746988
20 0.7843373493975904


SVM

In [80]:
from sklearn import svm
C = 1.0

svc = svm.SVC(kernel='linear', C=C)
score = cross_val_score(svc, XScaled, Y, cv=10)
score.mean()

np.float64(0.7975903614457832)

In [81]:
svc = svm.SVC(kernel='rbf', C=C)
score = cross_val_score(svc, XScaled, Y, cv=10)
score.mean()

np.float64(0.8012048192771084)

In [82]:
svc = svm.SVC(kernel='sigmoid', C=C)
score = cross_val_score(svc, XScaled, Y, cv=10)
score.mean()

np.float64(0.7457831325301204)

In [83]:
svc = svm.SVC(kernel='poly', C=C)
score = cross_val_score(svc, XScaled, Y, cv=10)
score.mean()

np.float64(0.7903614457831326)

Naive Bayes

In [84]:
from sklearn.naive_bayes import MultinomialNB
scaler2 = preprocessing.MinMaxScaler()
XMinMax = scaler2.fit_transform(X)

clf = MultinomialNB()
score = cross_val_score(clf, XMinMax, Y, cv=10)

score.mean()

np.float64(0.7855421686746988)