In [1]:
import random
import numpy as np
import pandas as pd
import os

import cv2
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, make_scorer

import warnings
warnings.filterwarnings('ignore')

### Train a Baseline Logistic Regression Classifier

In [2]:
# Fixed settings; img_size is the default target size of image_to_feature_vector.
img_size = (224, 224)
batch_size = 64

# Directories holding the preprocessed PNG images. The trailing slash matters:
# image paths are built by plain string concatenation.
train_dir, test_dir = (
    '../input/siic-isic-224x224-images/train/',
    '../input/siic-isic-224x224-images/test/',
)

# CSV tables with the image names and metadata for each split.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/siim-isic-melanoma-classification/train.csv',
        '../input/siim-isic-melanoma-classification/test.csv',
    )
)

def image_to_feature_vector(image, size=img_size):
    """Resize ``image`` and return its raw pixel values as a 1-D vector.

    Args:
        image: Image array accepted by ``cv2.resize``.
        size: Target size handed unchanged to ``cv2.resize``; defaults to the
            module-level ``img_size``.

    Returns:
        The resized image flattened into a single 1-D array.
    """
    resized = cv2.resize(image, size)
    flat_pixels = resized.flatten()
    return flat_pixels

def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a normalised, flattened 3-D HSV colour histogram of ``image``.

    Args:
        image: 3-channel image in BGR channel order (the order produced by
            ``cv2.imread``).
        bins: Number of histogram bins for the hue, saturation and value
            channels, in that order.

    Returns:
        1-D array with ``bins[0] * bins[1] * bins[2]`` entries.
    """
    # Convert straight from BGR to HSV. Swapping to RGB first and then applying
    # COLOR_BGR2HSV would read the red channel as blue and corrupt the hues.
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Ranges: hue in [0, 180), saturation and value in [0, 256).
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])

    # Normalise in place using cv2.normalize's default settings.
    cv2.normalize(hist, hist)
    return hist.flatten()

In [3]:
%%time

features = []
histograms = []
labels = []

for img_name, label in zip(train_df['image_name'], train_df['target']):
    
    img_path = train_dir + img_name + '.png'
    image = cv2.imread(img_path)
    image = image_to_feature_vector(image, size=(32, 32))  # reduce image from (224, 224) to (32, 32)
    hist = extract_color_histogram(image)
    features.append(image)
    histograms.append(hist)
    labels.append(label)

CPU times: user 36.4 s, sys: 6.4 s, total: 42.8 s
Wall time: 59.4 s


In [4]:
# Turn the per-image Python lists into NumPy arrays.
features, histograms, labels = (
    np.array(column) for column in (features, histograms, labels)
)

In [5]:
features.shape  # (n_images, 3072): 3072 = 32 * 32 * 3, since images are resized to (32, 32)

(33126, 3072)

In [6]:
# Report the in-memory size of both feature matrices.
# NOTE(review): the divisor 1024 * 1000 is neither a decimal MB nor a MiB; kept as-is.
bytes_per_unit = 1024 * 1000.0
pixels_size = features.nbytes / bytes_per_unit
histograms_size = histograms.nbytes / bytes_per_unit

print("[INFO] pixels matrix: {:.2f}MB".format(pixels_size))
print("[INFO] features matrix: {:.2f}MB".format(histograms_size))

[INFO] pixels matrix: 99.38MB
[INFO] features matrix: 66.25MB


In [7]:
%%time

X_train, X_val, y_train, y_val = train_test_split(features, labels, 
                                                  test_size=0.25, random_state=0)

# model = KNeighborsClassifier(n_jobs=-1) # use all cores
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
print("Accuracy: ", acc)

roc = roc_auc_score(y_val, y_prob[:, 1])
print("ROC Score: ", roc)

Accuracy:  0.9607582709490461
ROC Score:  0.7321704766566675
CPU times: user 2min 36s, sys: 2.12 s, total: 2min 38s
Wall time: 26.9 s


### Performance of a Random Model

In [8]:
# Reference point: a "model" that assigns every validation sample a random score.
rng = random.Random(0)  # local, seeded generator so the baseline is reproducible
y_random_prob = [rng.random() for _ in range(len(X_val))]

# Hard labels from a 0.5 threshold; used for accuracy only.
y_random = [1 if p >= 0.5 else 0 for p in y_random_prob]

acc = accuracy_score(y_val, y_random)
print("Accuracy: ", acc)

# ROC AUC is a ranking metric, so score the continuous values rather than the
# thresholded labels (the trained model is evaluated on probabilities too).
roc = roc_auc_score(y_val, y_random_prob)
print("ROC Score: ", roc)

Accuracy:  0.496015455204057
ROC Score:  0.5125946728284605


Hosmer & Lemeshow (2013). Applied logistic regression. p.177:
"So, what area under the ROC curve describes good discrimination? Unfortunately there is no "magic" number, only general guidelines. In general, we use the following rule of thumb:

- 0.5 = This suggests no discrimination, so we might as well flip a coin.
- 0.5-0.7 = We consider this poor discrimination, not much better than a coin toss.
- 0.7-0.8 = Acceptable discrimination
- 0.8-0.9 = Excellent discrimination
- 0.9 or higher = Outstanding discrimination"

In medical diagnosis, very high AUCs (0.95 or higher) are sought.

### Try Hyperparameter Tuning

In [9]:
%%time

X_train, X_val, y_train, y_val = train_test_split(features, labels, 
                                                  test_size=0.25, random_state=0)
model = LogisticRegression(max_iter=500)

scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)

params = {'penalty':['l1', 'l2'] , 'C': np.logspace(-3,3,7)}
grid = GridSearchCV(model, params, scoring=scorer)
grid.fit(X_train, y_train)

print("Best hyperparameters: ", grid.best_params_)
model = grid.best_estimator_

y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
print("Accuracy: ", acc)

roc = roc_auc_score(y_val, y_prob[:, 1])
print("ROC Score: ", roc)

Best hyperparameters:  {'C': 0.001, 'penalty': 'l2'}
Accuracy:  0.9645013281815986
ROC Score:  0.7465899863249228
CPU times: user 1h 17min 34s, sys: 59.4 s, total: 1h 18min 34s
Wall time: 13min 16s


### Generate a submission file - csv file with 'image_name' and 'target'

In [16]:
%%time

test_features = []

for img_name in test_df['image_name']:
    img_path = test_dir + img_name + '.png'
    image = cv2.imread(img_path)
    image = image_to_feature_vector(image, size=(32, 32))  # reduce image from (224, 224) to (32, 32)
    test_features.append(image)
    
test_features = np.array(test_features)
y_test = model.predict_proba(test_features)[:, 1]

CPU times: user 11.3 s, sys: 1.99 s, total: 13.3 s
Wall time: 17.7 s


In [22]:
# Submission table: one row per test image with its predicted probability of class 1.
test = pd.DataFrame({'image_name': test_df['image_name'], 'target': y_test})

submission_path = '../submissions/baseline.csv'
# DataFrame.to_csv does not create missing directories, so create the folder first.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
test.to_csv(submission_path, index=False)  # 0.69 (presumably the score this submission received -- confirm)