# DSP Project Version 1 with two classes (0,4) applying Decision Tree, Gaussian Naive Bayes and Support Vector Classifier to the data.

In [1]:
import keras
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
data = pd.read_csv('train.csv')

In [3]:
data = data[data['diagnosis'].isin([0,4])]

In [5]:
data.reset_index(inplace = True)

In [6]:
data.head()

Unnamed: 0,index,id_code,diagnosis
0,1,001639a390f0,4
1,3,002c21358ce6,0
2,4,005b95c28852,0
3,5,0083ee8054ee,4
4,6,0097f532ac9f,0


In [7]:
data = data.drop(columns = 'index')

In [8]:
data['id_code'][0]

'001639a390f0'

In [9]:
data.shape

(2100, 2)

In [10]:
data.head()

Unnamed: 0,id_code,diagnosis
0,001639a390f0,4
1,002c21358ce6,0
2,005b95c28852,0
3,0083ee8054ee,4
4,0097f532ac9f,0


In [11]:
print(data['diagnosis'].value_counts())

0    1805
4     295
Name: diagnosis, dtype: int64


In [13]:
train_image = []
for i in tqdm(range(data.shape[0])):
    img = image.load_img('train_images/' + data['id_code'][i] + '.png', target_size=(64,64,3), grayscale=False)
    img = image.img_to_array(img)
    img = img/255
    train_image.append(img)
X = np.array(train_image)

100%|██████████| 2100/2100 [02:00<00:00, 17.36it/s]


In [14]:
X.shape

(2100, 64, 64, 3)

In [15]:
nsamples, nx, ny, nz = X.shape
X = X.reshape((nsamples,nx*ny*nz))

In [16]:
X.shape

(2100, 12288)

In [17]:
y = data['diagnosis'].values

In [18]:
y.shape

(2100,)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [20]:
X_train.shape

(1680, 12288)

In [21]:
from sklearn.metrics import confusion_matrix 

In [22]:
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier().fit(X_train, y_train) 
y_pred_DT = dtree_model.predict(X_test) 

In [23]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, accuracy_score, f1_score

print('Accuracy:', accuracy_score(y_test, y_pred_DT))
print('F1 score:', f1_score(y_test, y_pred_DT, average = 'macro'))
print('Recall:', recall_score(y_test, y_pred_DT, average = 'macro'))
print('Precision:', precision_score(y_test, y_pred_DT, average = 'macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_DT))
print('\n Confusion matrix:\n',confusion_matrix(y_test, y_pred_DT))

Accuracy: 0.9523809523809523
F1 score: 0.9089924160346695
Recall: 0.9197012138188609
Precision: 0.899116316434823

 Classification report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       357
           4       0.82      0.87      0.85        63

    accuracy                           0.95       420
   macro avg       0.90      0.92      0.91       420
weighted avg       0.95      0.95      0.95       420


 Confusion matrix:
 [[345  12]
 [  8  55]]


In [24]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB().fit(X_train, y_train) 
y_pred_NB = gnb.predict(X_test) 

In [25]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, accuracy_score, f1_score

#confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
print('Accuracy:', accuracy_score(y_test, y_pred_NB))
print('F1 score:', f1_score(y_test, y_pred_NB, average = 'macro'))
print('Recall:', recall_score(y_test, y_pred_NB, average = 'macro'))
print('Precision:', precision_score(y_test, y_pred_NB, average = 'macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_NB))
print('\n Confusion matrix:\n',confusion_matrix(y_test, y_pred_NB))

Accuracy: 0.861904761904762
F1 score: 0.7761029411764706
Recall: 0.8403361344537815
Precision: 0.7443107465449554

 Classification report:
               precision    recall  f1-score   support

           0       0.96      0.87      0.91       357
           4       0.53      0.81      0.64        63

    accuracy                           0.86       420
   macro avg       0.74      0.84      0.78       420
weighted avg       0.90      0.86      0.87       420


 Confusion matrix:
 [[311  46]
 [ 12  51]]


In [26]:
from sklearn.svm import SVC 
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train) 
y_pred_svm = svm_model_linear.predict(X_test) 

In [27]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, accuracy_score, f1_score

#confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
print('Accuracy:', accuracy_score(y_test, y_pred_svm))
print('F1 score:', f1_score(y_test, y_pred_svm, average = 'macro'))
print('Recall:', recall_score(y_test, y_pred_svm, average = 'macro'))
print('Precision:', precision_score(y_test, y_pred_svm, average = 'macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_svm))
print('\n Confusion matrix:\n',confusion_matrix(y_test, y_pred_svm))

Accuracy: 0.9523809523809523
F1 score: 0.90538835826275
Recall: 0.9000933706816059
Precision: 0.9109091739348829

 Classification report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       357
           4       0.85      0.83      0.84        63

    accuracy                           0.95       420
   macro avg       0.91      0.90      0.91       420
weighted avg       0.95      0.95      0.95       420


 Confusion matrix:
 [[348   9]
 [ 11  52]]
