# DSP Project Version 3 with three classes (0, 2, 4), applying an ensemble of Decision Tree, Gaussian Naive Bayes and Support Vector Machines

In [2]:
import keras
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [3]:
# Read train.csv (used below for its 'id_code' and 'diagnosis' columns) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Restrict the data to the three target classes (diagnosis 0, 2 and 4).
data = data.loc[data['diagnosis'].isin([0, 2, 4])]

In [5]:
# Summary statistics of the numeric columns after filtering to the three classes.
data.describe()

Unnamed: 0,diagnosis
count,3099.0
mean,1.025492
std,1.327198
min,0.0
25%,0.0
50%,0.0
75%,2.0
max,4.0


In [6]:
# Show how many samples each remaining class has.
print(data.loc[:, 'diagnosis'].value_counts())

0    1805
2     999
4     295
Name: diagnosis, dtype: int64


In [7]:
# Renumber the rows 0..n-1; the previous index is kept as a new 'index' column.
data = data.reset_index()

In [8]:
# Discard the 'index' column, which holds the row labels from before the reset.
data = data.drop(labels='index', axis='columns')

In [9]:
# Load every training image as a 64x64 RGB array with values scaled to [0, 1].
train_image = []
# Iterate over the id values themselves instead of looking them up by integer
# label, so the loop does not depend on `data` having a 0..n-1 index.
for id_code in tqdm(data['id_code']):
    # target_size is documented as (height, width); colour loading is the
    # default, so the deprecated `grayscale` flag is not passed.
    img = image.load_img('train_images/' + id_code + '.png', target_size=(64, 64))
    img = image.img_to_array(img)
    img = img/255  # scale pixel values from 0..255 down to 0..1
    train_image.append(img)
X = np.array(train_image)  # stacked to shape (n_images, 64, 64, 3)

100%|██████████| 3099/3099 [03:46<00:00, 13.47it/s]


In [10]:
# Sanity check on the stacked image array: (n_images, height, width, channels).
X.shape

(3099, 64, 64, 3)

In [11]:
# Target labels: select the column by name so the result does not depend on
# 'diagnosis' happening to be the last column of the frame.
y = data['diagnosis'].values

In [12]:
# Flatten each image into a single feature vector, giving a 2-D
# (samples, features) array.
nsamples, nx, ny, nz = X.shape
X = X.reshape(nsamples, -1)

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the samples as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree can differ from run to run even on identical data.
dtree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_DT = dtree_model.predict(X_test)

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training split, predict the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_NB = gnb.predict(X_test)

In [15]:
from sklearn.svm import SVC

# Linear-kernel SVM with C = 1: fit on the training split, predict the held-out split.
svm_model_linear = SVC(C=1, kernel='linear')
svm_model_linear.fit(X_train, y_train)
y_pred_svm = svm_model_linear.predict(X_test)

In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

# Hard-voting (majority vote) ensemble over the three classifiers.
eclf = VotingClassifier(
    estimators=[('svc', svm_model_linear), ('dt', dtree_model), ('nb', gnb)],
    voting='hard',
    flatten_transform=True,
)

# Report 5-fold cross-validated accuracy (mean and standard deviation) on the
# full X, y for each individual model and for the ensemble.
labels = ['SVC', 'Decision tree', 'Gaussian NB', 'Ensemble']
for label, clf in zip(labels, (svm_model_linear, dtree_model, gnb, eclf)):
    scores = model_selection.cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)

Accuracy: 0.83 (+/- 0.01) [SVC]
Accuracy: 0.78 (+/- 0.01) [Decision tree]
Accuracy: 0.65 (+/- 0.03) [Gaussian NB]
Accuracy: 0.82 (+/- 0.01) [Ensemble]


In [19]:
# Fit the ensemble on the training split, then report its accuracy on the
# held-out split as a percentage.
eclf.fit(X_train, y_train)
test_accuracy = eclf.score(X_test, y_test)
print("Ensemble Accuracy on Test Set: %.2f%%" % (test_accuracy * 100.0))

Ensemble Accuracy on Test Set: 83.06%
