# DSP Project Version 4 with all classes, applying Decision Tree, Gaussian Naive Bayes and Support Vector Machines

In [1]:
import keras
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# Read train.csv into a DataFrame; later cells use its `id_code` and
# `diagnosis` columns.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Display summary statistics (count, mean, std, quartiles) for the table.
data.describe()

Unnamed: 0,diagnosis
count,3662.0
mean,1.12698
std,1.298409
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,4.0


In [4]:
# Count how many rows carry each distinct `diagnosis` value.
class_counts = data['diagnosis'].value_counts()
print(class_counts)

0    1805
2     999
1     370
4     295
3     193
Name: diagnosis, dtype: int64


In [5]:
# Renumber the rows from 0 and move the previous index into a regular column.
data = data.reset_index()

In [6]:
# Discard the helper 'index' column. `errors='ignore'` keeps this cell safe to
# re-run: without it a second execution raises KeyError because the column is
# already gone.
data = data.drop(columns='index', errors='ignore')

In [7]:
# Load each image named in data['id_code'], resize it to 64x64, convert it to
# an array and divide the pixel values by 255.
IMAGE_SIZE = (64, 64)  # load_img's target_size is (height, width)

train_image = []
# Iterate over the id codes directly instead of looking them up by label
# (`data['id_code'][i]`), which only works when the index is exactly 0..n-1.
for id_code in tqdm(data['id_code']):
    # RGB is load_img's default colour mode, so the deprecated
    # `grayscale=False` flag is omitted.
    img = image.load_img('train_images/' + id_code + '.png', target_size=IMAGE_SIZE)
    img = image.img_to_array(img)
    img = img / 255
    train_image.append(img)
# Stack the per-image arrays into one array with the image index first.
X = np.array(train_image)

100%|██████████| 3662/3662 [04:46<00:00, 12.94it/s]


In [8]:
# Display the dimensions of the stacked image array.
X.shape

(3662, 64, 64, 3)

In [9]:
# Target labels as a NumPy array. The column is selected by name rather than
# by position so the right column is used even if the column order changes.
y = data['diagnosis'].values

In [10]:
# Flatten every sample into a single feature vector (one row per image), as
# expected by the scikit-learn estimators used below.
# The feature count is derived from all trailing axes, so re-running the cell
# on the already-flattened array is a no-op instead of failing on a 4-way
# unpack of a 2-D shape.
nsamples = X.shape[0]
n_features = int(np.prod(X.shape[1:]))
X = X.reshape((nsamples, n_features))

In [11]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the diagnosis classes are imbalanced, so `stratify=y` would
# keep the class proportions equal in both partitions. It is not applied here
# because it would change the split that the recorded results are based on.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Seed the tree as well: scikit-learn permutes features randomly at each split,
# so an unseeded tree can give different scores on every run.
dtree_model = DecisionTreeClassifier(random_state=42)
dtree_model.fit(X_train, y_train)
y_pred_DT = dtree_model.predict(X_test)

In [12]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Evaluate the decision-tree predictions on the held-out test set.
print('Accuracy:', accuracy_score(y_test, y_pred_DT))
# F1, recall and precision are macro-averaged: every class counts equally.
for caption, metric in (('F1 score:', f1_score), ('Recall:', recall_score), ('Precision:', precision_score)):
    print(caption, metric(y_test, y_pred_DT, average='macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_DT))
print('\n Confusion matrix:\n', confusion_matrix(y_test, y_pred_DT))

Accuracy: 0.6753069577080492
F1 score: 0.46572453986965073
Recall: 0.46423604472486
Precision: 0.46820644129303346

 Classification report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.91       351
           1       0.35      0.37      0.36        68
           2       0.59      0.58      0.58       213
           3       0.27      0.22      0.24        36
           4       0.23      0.23      0.23        65

    accuracy                           0.68       733
   macro avg       0.47      0.46      0.47       733
weighted avg       0.67      0.68      0.67       733


 Confusion matrix:
 [[324   9  13   2   3]
 [  7  25  23   6   7]
 [ 15  30 123  11  34]
 [  3   2  17   8   6]
 [  9   6  32   3  15]]


In [13]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training partition and predict
# labels for the held-out samples.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_NB = gnb.predict(X_test)

In [14]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Evaluate the Gaussian Naive Bayes predictions on the held-out test set.
print('Accuracy:', accuracy_score(y_test, y_pred_NB))
# F1, recall and precision are macro-averaged: every class counts equally.
for caption, metric in (('F1 score:', f1_score), ('Recall:', recall_score), ('Precision:', precision_score)):
    print(caption, metric(y_test, y_pred_NB, average='macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_NB))
print('\n Confusion matrix:\n', confusion_matrix(y_test, y_pred_NB))

Accuracy: 0.4870395634379263
F1 score: 0.31645888614309664
Recall: 0.38287888551351684
Precision: 0.39349308286848583

 Classification report:
               precision    recall  f1-score   support

           0       0.92      0.73      0.82       351
           1       0.17      0.68      0.27        68
           2       0.58      0.20      0.29       213
           3       0.10      0.28      0.15        36
           4       0.20      0.03      0.05        65

    accuracy                           0.49       733
   macro avg       0.39      0.38      0.32       733
weighted avg       0.65      0.49      0.51       733


 Confusion matrix:
 [[257  49   7  36   2]
 [  6  46   8   6   2]
 [ 10 129  42  28   4]
 [  2  17   7  10   0]
 [  4  33   9  17   2]]


In [15]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier (C = 1) on the training
# partition and predict labels for the held-out samples.
svm_model_linear = SVC(C=1, kernel='linear')
svm_model_linear.fit(X_train, y_train)
y_pred_svm = svm_model_linear.predict(X_test)

In [16]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Evaluate the linear SVM predictions on the held-out test set.
print('Accuracy:', accuracy_score(y_test, y_pred_svm))
# F1, recall and precision are macro-averaged: every class counts equally.
for caption, metric in (('F1 score:', f1_score), ('Recall:', recall_score), ('Precision:', precision_score)):
    print(caption, metric(y_test, y_pred_svm, average='macro'))
print('\n Classification report:\n', classification_report(y_test, y_pred_svm))
print('\n Confusion matrix:\n', confusion_matrix(y_test, y_pred_svm))

Accuracy: 0.7148703956343793
F1 score: 0.4668087830058242
Recall: 0.46341974757881965
Precision: 0.4878931661143929

 Classification report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.94       351
           1       0.36      0.38      0.37        68
           2       0.61      0.68      0.64       213
           3       0.24      0.11      0.15        36
           4       0.32      0.18      0.24        65

    accuracy                           0.71       733
   macro avg       0.49      0.46      0.47       733
weighted avg       0.69      0.71      0.70       733


 Confusion matrix:
 [[338   4   8   0   1]
 [  8  26  29   2   3]
 [ 15  30 144   7  17]
 [  2   5  21   4   4]
 [  7   8  34   4  12]]
