# DSP Project Version 2 with three classes (0, 2, 4), applying Decision Tree, Gaussian Naive Bayes and Support Vector Machine classifiers.

In [1]:
import keras
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# Read the label table; each row pairs an image id_code with its diagnosis grade.
data = pd.read_csv("train.csv")

In [3]:
# Restrict the study to diagnosis grades 0, 2 and 4; rows with any other grade are removed.
kept_grades = [0, 2, 4]
is_kept = data['diagnosis'].isin(kept_grades)
data = data.loc[is_kept]

In [4]:
# Summary statistics for the numeric columns of the filtered label table.
data.describe()

Unnamed: 0,diagnosis
count,3099.0
mean,1.025492
std,1.327198
min,0.0
25%,0.0
50%,0.0
75%,2.0
max,4.0


In [5]:
# Number of images per diagnosis grade (makes the class imbalance visible).
grade_counts = data['diagnosis'].value_counts()
print(grade_counts)

0    1805
2     999
4     295
Name: diagnosis, dtype: int64


In [6]:
# Renumber the rows 0..n-1 after filtering; the previous row labels are kept
# as a new 'index' column.
data = data.reset_index()

In [7]:
# Preview the first rows; the old row labels now appear as an 'index' column.
data.head()

Unnamed: 0,index,id_code,diagnosis
0,0,000c1434d8d7,2
1,1,001639a390f0,4
2,3,002c21358ce6,0
3,4,005b95c28852,0
4,5,0083ee8054ee,4


In [8]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
data = data.drop(columns='index', errors='ignore')

In [9]:
# Load every image listed in the label table, resize it to 64x64 RGB and
# divide the pixel values by 255 (presumably 8-bit images, giving a 0-1 range).
# The id_code values are iterated directly instead of being looked up by
# integer label, so the cell no longer depends on the DataFrame index having
# been reset to 0..n-1 beforehand.
train_image = []
for id_code in tqdm(data['id_code']):
    # target_size is (height, width); the channel count is selected by
    # color_mode, which replaces the deprecated `grayscale` flag.
    img = image.load_img('train_images/' + id_code + '.png', target_size=(64, 64), color_mode='rgb')
    img = image.img_to_array(img)
    img = img/255
    train_image.append(img)
# Stack the per-image arrays into one array of shape (n_images, 64, 64, 3).
X = np.array(train_image)

100%|██████████| 3099/3099 [03:48<00:00, 13.45it/s]


In [10]:
# Flatten each image into a single feature vector:
# (samples, height, width, channels) -> (samples, height * width * channels).
nsamples, nx, ny, nz = X.shape
X = X.reshape(nsamples, -1)

In [11]:
# Sanity check: expect (number of images, 64 * 64 * 3 = 12288 features).
X.shape

(3099, 12288)

In [12]:
# Target labels: select the diagnosis column by name so the result does not
# depend on it happening to be the last column of the table.
y = data['diagnosis'].values

In [13]:
from sklearn.metrics import confusion_matrix 

In [14]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the flattened pixel features.
# random_state is fixed because tree construction is randomised (features are
# permuted at each split), so an unseeded model gives different predictions
# and metrics on every run.
dtree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_DT = dtree_model.predict(X_test)

In [16]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Decision-tree results on the held-out split. F1, recall and precision are
# macro-averaged, i.e. each class contributes equally regardless of its size.
print('Accuracy:', accuracy_score(y_test, y_pred_DT))
for caption, scorer in (('F1 score:', f1_score), ('Recall:', recall_score), ('Precision:', precision_score)):
    print(caption, scorer(y_test, y_pred_DT, average = 'macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_DT))
print('\n Confusion matrix:\n',confusion_matrix(y_test, y_pred_DT))

Accuracy: 0.7983870967741935
F1 score: 0.6608191930778322
Recall: 0.6620335103339056
Precision: 0.6605533590346804

 Classification report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91       363
           2       0.73      0.69      0.71       207
           4       0.35      0.38      0.37        50

    accuracy                           0.80       620
   macro avg       0.66      0.66      0.66       620
weighted avg       0.80      0.80      0.80       620


 Confusion matrix:
 [[334  25   4]
 [ 34 142  31]
 [  4  27  19]]


In [17]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the flattened pixel features.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_NB = gnb.predict(X_test)

In [18]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Naive Bayes results on the held-out split; class-wise scores are macro-averaged.
macro = {'average': 'macro'}
print('Accuracy:', accuracy_score(y_test, y_pred_NB))
print('F1 score:', f1_score(y_test, y_pred_NB, **macro))
print('Recall:', recall_score(y_test, y_pred_NB, **macro))
print('Precision:', precision_score(y_test, y_pred_NB, **macro))
print('\n Classification report:\n', classification_report(y_test, y_pred_NB))
print('\n Confusion matrix:\n',confusion_matrix(y_test, y_pred_NB))

Accuracy: 0.6387096774193548
F1 score: 0.5141536309377758
Recall: 0.594163239775888
Precision: 0.6187604200192798

 Classification report:
               precision    recall  f1-score   support

           0       0.89      0.84      0.86       363
           2       0.81      0.29      0.42       207
           4       0.16      0.66      0.26        50

    accuracy                           0.64       620
   macro avg       0.62      0.59      0.51       620
weighted avg       0.80      0.64      0.67       620


 Confusion matrix:
 [[304   2  57]
 [ 34  59 114]
 [  5  12  33]]


In [19]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C = 1.
svm_model_linear = SVC(kernel='linear', C=1)
svm_model_linear.fit(X_train, y_train)
y_pred_svm = svm_model_linear.predict(X_test)

In [20]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Linear-SVM results on the held-out split: compute every metric first, then print.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm, average = 'macro')
svm_recall = recall_score(y_test, y_pred_svm, average = 'macro')
svm_precision = precision_score(y_test, y_pred_svm, average = 'macro')
svm_report = classification_report(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print('Accuracy:', svm_accuracy)
print('F1 score:', svm_f1)
print('Recall:', svm_recall)
print('Precision:', svm_precision)
print('\n Classification report:\n', svm_report)
print('\n Confusion matrix:\n',svm_confusion)

Accuracy: 0.832258064516129
F1 score: 0.6912106887438733
Recall: 0.6866413808706299
Precision: 0.6965502087453307

 Classification report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       363
           2       0.77      0.76      0.77       207
           4       0.40      0.36      0.38        50

    accuracy                           0.83       620
   macro avg       0.70      0.69      0.69       620
weighted avg       0.83      0.83      0.83       620


 Confusion matrix:
 [[340  20   3]
 [ 25 158  24]
 [  5  27  18]]
