In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import cv2

In [None]:
# Delete every directory named `.ipynb_checkpoints` that `find` reports.
# NOTE(review): the backtick substitution is word-split by the shell, so a path containing spaces would be mishandled — confirm none exist.
!rm -rf `find -type d -name .ipynb_checkpoints`

In [None]:
# Root directory of the dataset, given relative to the notebook's working directory.
PATH = "../data/"
# Directory holding the image files (note the trailing slash).
IMAGE_PATH = os.path.join(PATH,"images/")

In [None]:
# Read the train and test tables (in that order) from the CSV files under PATH.
train_data, test_data = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

## Data Statistics

In [None]:
# Bar chart comparing the number of rows in the train and test tables.
heading = ['No. of training images', 'No. of test images']
vals = [len(train_data), len(test_data)]

fig, ax = plt.subplots()
ax.bar(heading, vals, color=['red', 'blue'])
for idx, val in enumerate(vals):
    # Centre each count over its bar and place it just above the bar top
    # (the default alignment starts the text at the bar's centre line).
    ax.text(idx, val, str(val), ha='center', va='bottom')
ax.set_ylabel("Count")
ax.set_title("No. of Images")
plt.show()

In [None]:
# Label columns are every column after the first one.
heading = train_data.columns[1:]
# Vectorised count of rows with a positive value in each label column
# (result order follows the column order of `heading`).
vals = (train_data[heading] > 0).sum().tolist()

fig, ax = plt.subplots()
ax.bar(heading, vals, color=['red', 'blue', 'green', 'orange'])
for idx, val in enumerate(vals):
    # Centre each count over its bar and place it just above the bar top.
    ax.text(idx, val, str(val), ha='center', va='bottom')
ax.set_ylabel("Count")
ax.set_title("No. of Images in each Label")
plt.show()

All classes except `multiple_diseases` are fairly evenly distributed, each accounting for roughly 30% of the training set. `multiple_diseases` is the under-represented exception, which indicates a class-imbalance problem.

## Example Images in each class

In [None]:
def random_image(data, label):
    """Return a randomly chosen image of the given class as an RGB array.

    Args:
        data: DataFrame with an ``image_id`` column and a column named ``label``.
        label: Name of the label column; candidates are the rows where it is > 0.

    Returns:
        The decoded image as a NumPy array in RGB channel order, suitable for
        ``matplotlib``'s ``imshow``.

    Raises:
        FileNotFoundError: If the image file is missing or cannot be decoded.
    """
    image_id = np.random.choice(data.loc[data[label] > 0, 'image_id'])
    path = os.path.join(IMAGE_PATH, image_id + '.jpg')
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {path}")
    # OpenCV decodes colour images as BGR, whereas matplotlib expects RGB;
    # without this conversion the red and blue channels are displayed swapped.
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
# 2x2 grid showing one randomly drawn training image for each of the first
# four labels in `heading`, filled row by row.
fig, axarr = plt.subplots(2, 2, figsize=(10, 8))
for pos, ax in enumerate(axarr.flat):
    label = heading[pos]
    ax.imshow(random_image(train_data, label))
    ax.set_title(label)
fig.suptitle("Example Images", fontsize=16)
plt.show()

## Channel Distribution

In [None]:
def plot_colour_hist(df, title):
    """Plot histograms of the per-image mean intensity of each colour channel.

    For every row of ``df`` the file ``<IMAGE_PATH><image_id>.jpg`` is opened
    and the mean of its red, green and blue channels, plus the mean over all
    channels, is recorded. The four distributions are drawn side by side with
    a shared y-axis.

    Args:
        df: DataFrame with an ``image_id`` column.
        title: Figure-level title.
    """
    red_values = []
    green_values = []
    blue_values = []
    all_channels = []
    for image_id in df['image_id']:
        # The context manager closes the file handle promptly; converting to
        # RGB guarantees a 3-channel array whatever the stored image mode is.
        with Image.open(os.path.join(IMAGE_PATH, image_id + '.jpg')) as im:
            img = np.asarray(im.convert('RGB'))
        # PIL delivers channels in RGB order: index 0 = red, 1 = green, 2 = blue.
        red_mean, green_mean, blue_mean = img.mean(axis=(0, 1))
        red_values.append(red_mean)
        green_values.append(green_mean)
        blue_values.append(blue_mean)
        all_channels.append(img.mean())

    fig, axes = plt.subplots(ncols=4, nrows=1, constrained_layout=True, figsize=(16, 3), sharey=True)
    for ax, column, vals, c in zip(
        axes,
        ['red', 'green', 'blue', 'all colours'],
        [red_values, green_values, blue_values, all_channels],
        'rgbk'
    ):
        ax.hist(vals, bins=100, color=c)
        ax.set_title(f'{column} hist')

    fig.suptitle(title)
    plt.show()

In [None]:
# Opens every image listed in train_data, so this cell can take a while on a large training set.
plot_colour_hist(train_data, title='Train colour dist')

All the distributions appear to be approximately normal, with a slight positive or negative skew.

## t-SNE Plot

In [None]:
from sklearn.manifold import TSNE

# Number of leading training rows to embed.
num_examples = 200
# t-SNE is stochastic: fix the seed so the embedding (and the plot built from
# it) is reproducible across Restart & Run All.
tsne = TSNE(n_components=2, learning_rate=100, random_state=42)

In [None]:
def _load_flat_pixels(image_id):
    """Read one image with OpenCV and return its pixels as a 1-D array."""
    path = IMAGE_PATH + image_id + '.jpg'
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None (no exception) when the file is missing or unreadable.
        raise FileNotFoundError(f"Could not read image: {path}")
    return img.flatten()


tsne_subset = train_data.iloc[:num_examples]
# One flattened pixel vector per image.
# NOTE(review): at full resolution these vectors are very large; consider
# downsampling before t-SNE if memory or runtime becomes a problem.
X = [_load_flat_pixels(image_id) for image_id in tsne_subset['image_id']]
# Position (within `heading`) of each row's label. argmax always yields exactly
# one id per row, so target_ids stays aligned with X even if a row has no or
# several positive labels; for one-hot rows it equals np.where(...)[1].
target_ids = tsne_subset[heading].to_numpy().argmax(axis=1)

In [None]:
# Fit t-SNE on the flattened images and project them to 2-D (n_components=2 above).
X_tsne = tsne.fit_transform(X)

In [None]:
# Scatter plot of the 2-D t-SNE embedding, one colour per label.
fig, ax = plt.subplots(figsize=(6, 5))
colors = ['r', 'g', 'b', 'c']
# target_ids holds one class index per *sample*, so loop over the class
# indices themselves (positions in `heading`). Zipping target_ids with the
# colours would use the first few samples' ids as if they were class indices.
for class_idx, (c, label) in enumerate(zip(colors, heading)):
    mask = target_ids == class_idx
    ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1], c=c, label=label)
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
ax.set_title("t-SNE embedding of raw pixels")
ax.legend()
plt.show()

No appreciable information can be inferred from the t-SNE plot. It may need further tuning of the t-SNE parameters, or dimensionality reduction with PCA beforehand to remove noise.