### Train a KNN Model

In [7]:
import random
import numpy as np
import pandas as pd
import os

import cv2
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [8]:
# tunable constants used by the cells below
img_size = (224, 224)
batch_size = 64

# directories holding the preprocessed png images
test_dir = '../input/siic-isic-224x224-images/test/'
train_dir = '../input/siic-isic-224x224-images/train/'

# csv files with the image names and metadata for each split
test_df = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
train_df = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')

def image_to_feature_vector(image, size=img_size):
    """Resize `image` to `size` and return its pixel values as a 1-D array."""
    resized = cv2.resize(image, size)
    return resized.flatten()

def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened 3-D HSV colour histogram of a BGR image.

    `bins` gives the number of bins for the hue, saturation and value
    channels, in that order (8 each by default). The histogram is rescaled
    with cv2.normalize's default settings before being flattened.
    """
    channels = [0, 1, 2]
    # value range per channel: hue is binned over [0, 180),
    # saturation and value over [0, 256)
    ranges = [0, 180, 0, 256, 0, 256]

    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    histogram = cv2.calcHist([hsv_image], channels, None, bins, ranges)

    # the histogram is passed as both source and destination
    cv2.normalize(histogram, histogram)
    return histogram.flatten()

In [9]:
# Build the raw-pixel feature matrix (one flattened image per row) and the
# matching label vector for the training set.
n_images = len(train_df)
# cv2.imread with its default flag returns a 3-channel 8-bit BGR image, so each
# flattened image has width * height * 3 values.
n_features = img_size[0] * img_size[1] * 3

# Preallocate the matrix instead of growing a Python list and converting it
# afterwards; that conversion would briefly hold two copies of all pixel data.
features = np.empty((n_images, n_features), dtype=np.uint8)
labels = np.array(train_df['target'])

for row, img_name in enumerate(train_df['image_name']):
    img_path = train_dir + img_name + '.png'
    image = cv2.imread(img_path)
    # cv2.imread signals a missing or unreadable file by returning None rather
    # than raising; fail here with the offending path instead of letting
    # cv2.resize raise an opaque error later.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {img_path}')
    features[row] = image_to_feature_vector(image)

In [10]:
features.shape  # (number of images, 224 * 224 * 3 = 150528 flattened pixel values per image)

(33126, 150528)

In [None]:
# Hold out 25% of the images for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(features, labels,
                                                  test_size=0.25, random_state=0)
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Run the neighbour search over the validation set once and derive both the
# hard predictions and the ranking scores from the same class probabilities.
val_proba = model.predict_proba(X_val)

# With the default uniform weights, the majority vote among the neighbours is
# the class with the highest predicted probability.
y_pred = model.classes_[val_proba.argmax(axis=1)]
acc = (y_pred == y_val).mean()
print(acc)

# ROC AUC measures how well the scores rank positives above negatives, so it
# needs a continuous score. Feeding it hard 0/1 predictions collapses the curve
# to a single operating point. Column 1 is the probability of classes_[1].
# NOTE(review): assumes `target` is binary with the positive class as the
# greater label - confirm against train.csv.
y_score = val_proba[:, 1]
roc = roc_auc_score(y_val, y_score)
print(roc)

### Performance of a Random Model

In [None]:
# y_train is a NumPy array (train_test_split was given NumPy inputs), which has
# no .value_counts(); wrap it in a Series to tabulate the class frequencies.
pd.Series(y_train).value_counts()

