In [None]:
import pandas as pd
import os
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
# The data paths used below ('Dataset/...') are resolved from the parent directory.
# Only move up when 'Dataset' is not already visible from the current directory,
# so re-running this cell does not keep climbing the directory tree.
if not os.path.isdir('Dataset'):
  os.chdir('..')

# Dataset info

In [None]:
# Location of the dataset metadata, relative to the working directory.
file_path = 'Dataset/public/data.json'

# Load the metadata records into a DataFrame.
df = pd.read_json(file_path)

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# Preview the first five rows (head() defaults to n=5).
print(df.head())

Check null value

In [None]:
# Count missing values per column (isna() is the method isnull() aliases).
print(df.isna().sum())

Check for duplicates in the dataset

In [None]:
# Flag rows that repeat an earlier row, then print how many were flagged.
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())

In [None]:
# Descriptive statistics from DataFrame.describe(), shown as the cell output.
df.describe()

In [None]:
# Print the column labels as a plain Python list.
column_names = df.columns.tolist()
print(column_names)

In [None]:
# Collect the distinct class labels once, then print them and their count.
classification_values = df['Classification'].unique()
print(classification_values)
print(len(classification_values))

There are 7 types in the classification column.

In [None]:
# Tally rows per class once, print the table, then draw it as a bar chart.
class_counts = df['Classification'].value_counts()
print(class_counts)
class_counts.plot.bar()

In [None]:
# Collect the distinct values of the Type column once, then print them and their count.
type_values = df['Type'].unique()
print(type_values)
print(len(type_values))

Based on the dataset usage guideline, there are 2 type conditions, normal and abnormal, but the dataset only has 566 entries of the normal type and 3 entries with an empty type (?).

Based on the Description, I guess they are of the abnormal type. So all of the data is of the abnormal type, with no normal type (?)

In [None]:
# Rows whose Type is the empty string are relabelled as 'abnormal'.
has_empty_type = df['Type'] == ''
df.loc[has_empty_type, 'Type'] = 'abnormal'

# Re-check the distinct Type values after the fill.
type_values_after_fill = df['Type'].unique()
print(type_values_after_fill)
print(len(type_values_after_fill))

# Visualization

Show 9 images randomly

In [None]:
# Draw 9 random rows and show them in a 3x3 grid, each titled with its class and
# captioned with its type and both descriptions.
# NOTE(review): images are opened with the paths exactly as stored in df['Path'];
# the `_image` -> `_Image` suffix correction happens in a later cell, so on a
# case-sensitive filesystem this cell may need that correction to run first.
sample_df = df.sample(9)
plt.figure(figsize=(12, 12))
for idx, (_, row) in enumerate(sample_df.iterrows()):
  img_path = os.path.join('Dataset/public/images', row['Path'])
  plt.subplot(3, 3, idx + 1)
  # Open inside a context manager so the file handle is released once the
  # pixels have been handed to matplotlib (same pattern as the size-count cell).
  with Image.open(img_path) as img:
    plt.imshow(img)
  plt.title(row['Classification'])
  # Descriptions may contain Windows line endings; normalise them for display.
  desc = f"Type: {row['Type']}\n" + str(row['Description']).replace('\r\n', '\n')
  desc_en = str(row['DescriptionEN']).replace('\r\n', '\n')
  # Caption goes just below the image, in axes coordinates.
  plt.text(0.5, -0.1, f"{desc}\n{desc_en}",
      fontsize=9, color='black', ha='center', va='top', transform=plt.gca().transAxes, wrap=True)
  plt.axis('off')
plt.tight_layout()
plt.show()

Ear left and ear right

In [None]:
def compare_2_type_of_img(df: pd.DataFrame, type_1:str, type_2:str, num_of_samples: int = 4,
                          image_dir: str = 'Dataset/public/images'):
  """Show sample images of two classes in a two-row grid for visual comparison.

  The top row shows images whose 'Classification' equals `type_1`, the bottom
  row those equal to `type_2`. For each class the first `num_of_samples` rows
  of `df` are used.

  Args:
    df: Frame with at least the columns 'Classification' and 'Path'.
    type_1: Class label shown in the top row.
    type_2: Class label shown in the bottom row.
    num_of_samples: Number of images (grid columns) per class.
    image_dir: Directory that the values in 'Path' are joined onto.

  Raises:
    ValueError: If `num_of_samples` is smaller than 1.
  """
  if num_of_samples < 1:
    raise ValueError("num_of_samples must be at least 1")

  # One figure row per class: (label, first `num_of_samples` image paths).
  rows = [
    (label, df[df['Classification'] == label]['Path'].head(num_of_samples).tolist())
    for label in (type_1, type_2)
  ]

  # Size the grid from num_of_samples instead of a fixed 2x4 so any sample count
  # works; squeeze=False keeps `axes` two-dimensional even with a single column.
  # Four inches per column reproduces the 16x8 figure for the default of 4.
  fig, axes = plt.subplots(2, num_of_samples, figsize=(4 * num_of_samples, 8), squeeze=False)

  for row_axes, (label, img_names) in zip(axes, rows):
    # Hide every axis first so cells without an image (a class with fewer than
    # num_of_samples rows) stay blank instead of showing empty ticks.
    for ax in row_axes:
      ax.axis('off')
    for ax, img_name in zip(row_axes, img_names):
      # Context manager releases the file handle once the image is drawn.
      with Image.open(os.path.join(image_dir, img_name)) as img:
        ax.imshow(img)
      ax.set_title(label)

  plt.tight_layout()
  plt.show()

# Top row: 'ear-left' samples; bottom row: 'ear-right' samples.
compare_2_type_of_img(df, type_1='ear-left', type_2='ear-right')

Nose left and nose right

In [None]:
# Top row: 'nose-left' samples; bottom row: 'nose-right' samples.
compare_2_type_of_img(df, type_1='nose-left', type_2='nose-right')

vc-closed and vc-open

In [None]:
# Top row: 'vc-open' samples; bottom row: 'vc-closed' samples.
compare_2_type_of_img(df, type_1='vc-open', type_2='vc-closed')

# Important: There is a mismatch between the actual path and the path in `data.json`

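The actual file names have the suffix `_ImageXX.png`, but the paths in **data.json** have the suffix `_imageXX.png`. We need to convert them before training. Note that the image-loading cells above use the paths as they are before this conversion, so on a case-sensitive filesystem this step has to run first.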

In [None]:
# Rewrite the "_image" spelling used in data.json to "_Image", as a literal
# (non-regex) substring replacement, and store the result back in the Path column.
corrected_paths = df["Path"].str.replace("_image", "_Image", regex=False)
df["Path"] = corrected_paths

# Save cleaned dataset

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv('Dataset/cleaned_data.csv', index=False)

Validate the output CSV

In [None]:
# Read the saved CSV back, rebinding `df` to the re-read frame, and show it as
# the cell output to check the file round-trips.
df = pd.read_csv('Dataset/cleaned_data.csv')
df

In [None]:
def _image_size(img_name):
  """Return the `size` attribute of the image stored under the given relative path."""
  with Image.open(os.path.join('Dataset/public/images', img_name)) as opened:
    return opened.size

# Gather the size of every image listed in the Path column, then tally how
# often each distinct size occurs.
sizes = [_image_size(name) for name in df['Path']]

size_counts = Counter(sizes)
print(size_counts)

There are 566 images with a size of 640 $\times$ 480