In [None]:
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
# Notebook-wide plotting defaults: 10x7 figure size and the "darkgrid" style.
sns.set(rc={'figure.figsize':(10, 7)}, style='darkgrid')
# Let matplotlib's single-letter color shorthands resolve to seaborn's palette.
sns.set_color_codes()

from scipy.stats import norm

import cv2

import os

import glob
import datetime
import itertools
import random

from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. from scikit-learn
# metrics) that may be worth seeing — consider narrowing the filter.
warnings.filterwarnings('ignore')

## 10k

In [None]:
# Read the 10k training manifest and give it an explicit 0..n-1 integer index.
train_data = pd.read_csv("../input/nih-chest-x-ray-models/10k_train.csv")
train_data.index = list(range(len(train_data)))
train_data.head(n=5)

In [None]:
# Read the 10k test manifest and give it an explicit 0..n-1 integer index.
test_data = pd.read_csv("../input/nih-chest-x-ray-models/10k_test.csv")
test_data.index = list(range(len(test_data)))
test_data.head(n=5)

In [None]:
# Remove the first "Cardiomegaly" row, then the first "Consolidation" row,
# renumbering the index to 0..n-1 after each removal.
for finding in ("Cardiomegaly", "Consolidation"):
    hits = train_data.index[train_data["Finding Labels"] == finding]
    if len(hits) > 0:
        train_data.drop(hits[0], inplace=True)
    train_data.index = list(range(len(train_data)))

In [None]:
# Class distribution of the training split.
train_data['Finding Labels'].value_counts()

In [None]:
# Class distribution of the test split.
test_data['Finding Labels'].value_counts()

In [None]:
# Shuffle every row of both splits. The seed is fixed so that the row order
# (and therefore every downstream model score) is identical after a
# Restart & Run All instead of changing on each execution.
SEED = 42
train_data = train_data.sample(frac=1, random_state=SEED)
test_data = test_data.sample(frac=1, random_state=SEED)

In [None]:
# Renumber both frames 0..n-1 so that label i is also position i after shuffling.
train_data.index = list(range(len(train_data)))
test_data.index = list(range(len(test_data)))

In [None]:
# Number of rows (across train + test stacked together) that exactly repeat an earlier row.
pd.concat([train_data, test_data], axis=0).duplicated(keep="first").sum()

In [None]:
# Peek at the shuffled training manifest.
train_data.head(n=5)

In [None]:
# Peek at the shuffled test manifest.
test_data.head(n=5)

In [None]:
def sharpening(img):
    """Return `img` convolved with a 3x3 sharpening kernel.

    The kernel weights the centre pixel by 5 and its four direct neighbours
    by -1; `ddepth=-1` asks OpenCV to keep the output depth equal to the input.
    """
    return cv2.filter2D(
        src=img,
        ddepth=-1,
        kernel=np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]),
    )

In [None]:
# Build the training feature matrix (one flattened 224x224 image per row)
# and the matching label vector.
data = []
labels = []

# Walk paths and labels together instead of doing chained `frame[col][i]`
# lookups on every iteration.
for i, (imagePath, label) in enumerate(
    zip(train_data["path_gambar"], train_data["Finding Labels"])
):
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread does not raise on a missing/unreadable file; it returns
        # None, which would otherwise surface as an opaque error in cvtColor.
        raise FileNotFoundError(f"Could not read image: {imagePath}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = sharpening(image)
    image = np.uint16(image)
    image = cv2.resize(image, (224, 224)).flatten()
    image = image / 255.  # scale pixel values by 1/255
    data.append(image)
    labels.append(label)
    if i % 100 == 0:
        print(i)  # coarse progress indicator

data = np.array(data)
labels = np.array(labels).flatten()

In [None]:
# First training feature vector and the overall matrix shape.
(data[0], data.shape)

In [None]:
# Build the test feature matrix (one flattened 224x224 image per row)
# and the matching label vector, using the same preprocessing as training.
test_thorax = []
test_labels = []

# Walk paths and labels together instead of doing chained `frame[col][i]`
# lookups on every iteration.
for i, (imagePath, label) in enumerate(
    zip(test_data["path_gambar"], test_data["Finding Labels"])
):
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread does not raise on a missing/unreadable file; it returns
        # None, which would otherwise surface as an opaque error in cvtColor.
        raise FileNotFoundError(f"Could not read image: {imagePath}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = sharpening(image)
    image = np.uint16(image)
    image = cv2.resize(image, (224, 224)).flatten()
    image = image / 255.  # scale pixel values by 1/255
    test_thorax.append(image)
    test_labels.append(label)
    if i % 100 == 0:
        print(i)  # coarse progress indicator

X_test = np.array(test_thorax)
y_test = np.array(test_labels).flatten()

In [None]:
# First test feature vector and the shape of the test label vector.
(X_test[0], y_test.shape)

In [None]:
def save_to_npy(filename, arr):
    """Write `arr` to `filename` in NumPy's binary .npy format.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten, and the handle is always closed afterwards.
    """
    handle = open(filename, 'wb')
    try:
        np.save(handle, arr)
    finally:
        handle.close()

# Persist the preprocessed splits so later runs can skip image loading.
save_to_npy("10k_imgs_train_sh.npy", data)
save_to_npy("10k_labels_train_sh.npy", labels)

# Save the already-built arrays rather than the raw Python lists: the file
# contents are the same, but np.save no longer has to convert the whole
# list of image vectors into a second full-size array first.
save_to_npy("10k_imgs_test_sh.npy", X_test)
save_to_npy("10k_labels_test_sh.npy", y_test)

In [None]:
# def load_saved_preds(filename):
#     with open(filename, 'rb') as f:
#         a = np.load(f)
#     return a

# data = load_saved_preds("../input/nih-chest-x-ray-models/10k_imgs_train.npy")
# labels = load_saved_preds("../input/nih-chest-x-ray-models/10k_labels_train.npy")

# X_test = load_saved_preds("../input/nih-chest-x-ray-models/10k_imgs_test.npy")
# y_test = load_saved_preds("../input/nih-chest-x-ray-models/10k_labels_test.npy")

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours baseline with scikit-learn's default settings.
knn = KNeighborsClassifier()
knn.fit(X=data, y=labels)

In [None]:
# Accuracy of the KNN model on the test split.
knn.score(X=X_test, y=y_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest baseline. The forest is randomized, so the seed is pinned
# to make the fitted model (and its reported scores) reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(data, labels)

In [None]:
# Accuracy of the random forest on the test split.
rf.score(X=X_test, y=y_test)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree baseline. Tree construction involves randomness,
# so the seed is pinned to make the fitted model reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(data, labels)

In [None]:
# Accuracy of the decision tree on the test split.
dt.score(X=X_test, y=y_test)

### SVM

Note: scikit-learn's `SVC` does not support multilabel classification, so each `Finding Labels` string is treated as a single class here.

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svc = SVC()
svc.fit(X=data, y=labels)

In [None]:
# Accuracy of the SVC on the test split.
svc.score(X=X_test, y=y_test)

## Evaluation

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
def get_metrics(y_true, y_preds):
    """Print and return accuracy plus macro-averaged recall, precision and F1.

    Args:
        y_true: ground-truth labels.
        y_preds: predicted labels, aligned with `y_true`.

    Returns:
        Tuple `(accuracy, recall, precision, f1)` of unrounded scores; the
        printed summary shows each one with two decimals.
    """
    macro = {"average": "macro"}
    acc = accuracy_score(y_true, y_preds)
    rec = recall_score(y_true, y_preds, **macro)
    prec = precision_score(y_true, y_preds, **macro)
    f1 = f1_score(y_true, y_preds, **macro)
    print(
        f"accuracy: {acc:.2f}",
        f"recall: {rec:.2f}",
        f"precision: {prec:.2f}",
        f"f1: {f1:.2f}",
        sep="\n",
    )
    return acc, rec, prec, f1

In [None]:
# One independent accumulator per metric; each evaluated model appends one value.
accs, recs, precs, f1s = [], [], [], []

In [None]:
# Fitted estimators and their display names, in the order they are evaluated below.
names = [
    "svc",
    "random_forest",
    "knn",
    "decision_tree",
]
models = [
    svc,
    rf,
    knn,
    dt,
]

### Predictions on SVC

In [None]:
# Score the SVC on the test split and record each metric as a percentage
# (rounded to two decimals before scaling by 100).
y_pred = svc.predict(X_test)
acc, rec, pre, f1 = get_metrics(y_test, y_pred)

for bucket, score in zip((accs, recs, precs, f1s), (acc, rec, pre, f1)):
    bucket.append(round(score, 2)*100)

In [None]:
# Per-class report for the SVC as a DataFrame (one column per class/aggregate).
svc_cr = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
)
svc_cr

In [None]:
# Keep only the per-class columns; discard the aggregate ones.
svc_cr = svc_cr.drop(columns=["accuracy", "macro avg", "weighted avg"])

In [None]:
# Flip to one row per class, one column per metric.
svc_cr = svc_cr.transpose()

In [None]:
# Per-class precision of the SVC, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
precision_pct = round(svc_cr['precision'] * 100)
sns.barplot(x=svc_cr.index, y=precision_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("SVM - Precision Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class recall of the SVC, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
recall_pct = round(svc_cr['recall'] * 100)
sns.barplot(x=svc_cr.index, y=recall_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

# Title suffix aligned with every other per-model plot ("- 10k images").
ax.set_title("SVM - Recall Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class F1 of the SVC, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
f1_pct = round(svc_cr['f1-score'] * 100)
sns.barplot(x=svc_cr.index, y=f1_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("SVM - F1 Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

### Predictions on Random Forest

In [None]:
# Score the random forest on the test split and record each metric as a
# percentage (rounded to two decimals before scaling by 100).
y_pred = rf.predict(X_test)
acc, rec, pre, f1 = get_metrics(y_test, y_pred)

for bucket, score in zip((accs, recs, precs, f1s), (acc, rec, pre, f1)):
    bucket.append(round(score, 2)*100)

In [None]:
# Per-class report for the random forest as a DataFrame.
rf_cr = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
)
rf_cr

In [None]:
# Keep only the per-class columns; discard the aggregate ones.
rf_cr = rf_cr.drop(columns=["accuracy", "macro avg", "weighted avg"])

In [None]:
# Flip to one row per class, one column per metric.
rf_cr = rf_cr.transpose()

In [None]:
# Per-class precision of the random forest, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
precision_pct = round(rf_cr['precision'] * 100)
sns.barplot(x=rf_cr.index, y=precision_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Random Forest - Precision Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class recall of the random forest, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
recall_pct = round(rf_cr['recall'] * 100)
sns.barplot(x=rf_cr.index, y=recall_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Random Forest - Recall Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class F1 of the random forest, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
f1_pct = round(rf_cr['f1-score'] * 100)
sns.barplot(x=rf_cr.index, y=f1_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Random Forest - F1 Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

### Predictions on KNN

In [None]:
# Score the KNN model on the test split and record each metric as a
# percentage (rounded to two decimals before scaling by 100).
y_pred = knn.predict(X_test)
acc, rec, pre, f1 = get_metrics(y_test, y_pred)

for bucket, score in zip((accs, recs, precs, f1s), (acc, rec, pre, f1)):
    bucket.append(round(score, 2)*100)

In [None]:
# Per-class report for the KNN model as a DataFrame.
knn_cr = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
)
knn_cr

In [None]:
# Keep only the per-class columns; discard the aggregate ones.
knn_cr = knn_cr.drop(columns=["accuracy", "macro avg", "weighted avg"])

In [None]:
# Flip to one row per class, one column per metric.
knn_cr = knn_cr.transpose()

In [None]:
# Per-class precision of the KNN model, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
precision_pct = round(knn_cr['precision'] * 100)
sns.barplot(x=knn_cr.index, y=precision_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("KNN - Precision Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class recall of the KNN model, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
recall_pct = round(knn_cr['recall'] * 100)
sns.barplot(x=knn_cr.index, y=recall_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("KNN - Recall Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class F1 of the KNN model, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
f1_pct = round(knn_cr['f1-score'] * 100)
sns.barplot(x=knn_cr.index, y=f1_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("KNN - F1 Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

### Predictions on Decision Tree

In [None]:
# Score the decision tree on the test split and record each metric as a
# percentage (rounded to two decimals before scaling by 100).
y_pred = dt.predict(X_test)
acc, rec, pre, f1 = get_metrics(y_test, y_pred)

for bucket, score in zip((accs, recs, precs, f1s), (acc, rec, pre, f1)):
    bucket.append(round(score, 2)*100)

In [None]:
# Per-class report for the decision tree as a DataFrame.
dt_cr = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
)
dt_cr

In [None]:
# Keep only the per-class columns; discard the aggregate ones.
dt_cr = dt_cr.drop(columns=["accuracy", "macro avg", "weighted avg"])

In [None]:
# Flip to one row per class, one column per metric.
dt_cr = dt_cr.transpose()

In [None]:
# Per-class precision of the decision tree, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
precision_pct = round(dt_cr['precision'] * 100)
sns.barplot(x=dt_cr.index, y=precision_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Decision Tree - Precision Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class recall of the decision tree, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
recall_pct = round(dt_cr['recall'] * 100)
sns.barplot(x=dt_cr.index, y=recall_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Decision Tree - Recall Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Per-class F1 of the decision tree, in whole percent.
fig, ax = plt.subplots(figsize=(15, 10))
f1_pct = round(dt_cr['f1-score'] * 100)
sns.barplot(x=dt_cr.index, y=f1_pct, color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Decision Tree - F1 Score (sharpening) - 10k images", fontsize=16, fontweight="bold");

### Visualize

In [None]:
# One single-column frame per metric, indexed by model name and ordered
# from best to worst score.
acc_df = pd.DataFrame({"acc_score": accs}, index=names).sort_values(
    by="acc_score", ascending=False
)
rec_df = pd.DataFrame({"recall_score": recs}, index=names).sort_values(
    by="recall_score", ascending=False
)
prec_df = pd.DataFrame({"precision_score": precs}, index=names).sort_values(
    by="precision_score", ascending=False
)
f1_df = pd.DataFrame({"f1_score": f1s}, index=names).sort_values(
    by="f1_score", ascending=False
)

In [None]:
# Accuracy of every model side by side, best first.
fig, ax = plt.subplots(figsize=(7, 10))
sns.barplot(x=acc_df.index, y=acc_df['acc_score'], color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Accuracy Score (sharpening) (%) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Macro recall of every model side by side, best first.
fig, ax = plt.subplots(figsize=(7, 10))
sns.barplot(x=rec_df.index, y=rec_df['recall_score'], color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Recall Score (sharpening) (%) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Macro precision of every model side by side, best first.
fig, ax = plt.subplots(figsize=(7, 10))
sns.barplot(x=prec_df.index, y=prec_df['precision_score'], color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("Precision Score (sharpening) (%) - 10k images", fontsize=16, fontweight="bold");

In [None]:
# Macro F1 of every model side by side, best first.
fig, ax = plt.subplots(figsize=(7, 10))
sns.barplot(x=f1_df.index, y=f1_df['f1_score'], color="brown", ax=ax)
for bars in ax.containers:
    ax.bar_label(bars)  # print the value on top of each bar
plt.xticks(rotation=45)

ax.set_title("F1 Score (sharpening) (%) - 10k images", fontsize=16, fontweight="bold");