#### loading libraries

In [107]:
import pandas as pd
import numpy as np
import os
import os
import shutil
import scipy
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from PIL import Image


#### loading data

In [108]:
# Root folder of the image dataset; every split lives in its own sub-folder.
DATA_ROOT = 'D:/semestr_10/master_thesis/medical_chatbot/data/images'
train_path, test_path, valid_path = (
    f'{DATA_ROOT}/{split}' for split in ('train', 'test', 'val')
)

# Class names, indexed by the integer name of each class sub-folder.
list_of_classes = ['Healthy', 'Doubtful', 'Minimal', 'Moderate', 'Severe']

In [109]:
def build_split_df(split_dir, class_names):
    """Index one dataset split as a DataFrame of file paths and labels.

    Args:
        split_dir: Folder containing one sub-folder per class, each named
            with the integer index of the class in ``class_names``.
        class_names: Sequence of class names, indexed by sub-folder name.

    Returns:
        DataFrame with the columns 'filepaths' and 'labels', one row per
        file found inside the class sub-folders.

    Raises:
        ValueError: If a sub-folder name is not an integer.
        IndexError: If a sub-folder index has no entry in ``class_names``.
    """
    records = []
    # Sorted listings make the row order (and therefore any seeded sampling
    # done later) independent of the file system's enumeration order.
    for klass in sorted(os.listdir(split_dir)):
        classpath = os.path.join(split_dir, klass)
        # Stray files next to the class folders (e.g. .DS_Store) are not classes.
        if not os.path.isdir(classpath):
            continue
        label = class_names[int(klass)]
        for f in sorted(os.listdir(classpath)):
            records.append((os.path.join(classpath, f), label))
    return pd.DataFrame(records, columns=['filepaths', 'labels'])


train_df = build_split_df(train_path, list_of_classes)
test_df = build_split_df(test_path, list_of_classes)
valid_df = build_split_df(valid_path, list_of_classes)

In [110]:
# Distinct class names present in the training split, in sorted order.
classes = sorted(train_df['labels'].unique())
class_count = len(classes)
print('The number of classes in the dataset is: ', class_count, "\n")

# Images per class, most frequent first.
print(train_df['labels'].value_counts())

The number of classes in the dataset is:  5 

labels
Healthy     2286
Minimal     1516
Doubtful    1046
Moderate     757
Severe       173
Name: count, dtype: int64


Because the classes are heavily imbalanced, the data is trimmed in the next step.

In [111]:
def trim_data(df, max_samples, min_samples, column_name, label_samples = 250):
    """Down-sample a labelled DataFrame to reduce class imbalance.

    The labels 'Healthy', 'Minimal' and 'Doubtful' are randomly reduced to at
    most ``label_samples`` rows each. Every other label is randomly reduced to
    ``max_samples`` rows when it has more than that, kept whole when it has
    between ``min_samples`` and ``max_samples`` rows, and dropped when it has
    fewer than ``min_samples`` rows.

    Args:
        df: Input DataFrame; it is not modified.
        max_samples: Maximum number of rows kept for the uncapped labels.
        min_samples: Minimum number of rows an uncapped label needs to be kept.
        column_name: Name of the column holding the labels.
        label_samples: Maximum number of rows kept for each of the labels
            'Healthy', 'Minimal' and 'Doubtful'.

    Returns:
        A new DataFrame with the same columns and the original index values,
        with rows grouped by label in order of first appearance.
    """
    capped_labels = ('Healthy', 'Minimal', 'Doubtful')
    kept = []
    # sort=False keeps the labels in order of first appearance.
    for label, group in df.groupby(column_name, sort=False):
        count = len(group)
        if label in capped_labels:
            # Never ask for more rows than the group holds; sample() would raise.
            kept.append(group.sample(n=min(label_samples, count), random_state=123, axis=0))
        elif count > max_samples:
            kept.append(group.sample(n=max_samples, random_state=123, axis=0))
        elif count >= min_samples:
            kept.append(group)
    if not kept:
        return pd.DataFrame(columns=df.columns)
    # A single concat avoids repeatedly copying the growing result.
    return pd.concat(kept, axis=0)

A short helper function that prints how many samples each class has in a dataset.

In [112]:
def explore_amount(df):
    """Print a centred two-column table of image counts per class.

    Args:
        df: DataFrame with a 'labels' column.

    The classes are listed in sorted order and the table is followed by
    blank lines. Nothing is returned.
    """
    row_template = '{0:^30s} {1:^13s}'
    per_class = df.groupby('labels')
    print(row_template.format('CLASS', 'IMAGE COUNT'))
    for name in sorted(df['labels'].unique()):
        n_images = len(per_class.get_group(name))
        print(row_template.format(name, str(n_images)))
    print('\n')

### TRAIN

Trimming data.

In [113]:
# Per-class image counts of the untrimmed training split.
explore_amount(train_df)

            CLASS               IMAGE COUNT 
           Doubtful                1046     
           Healthy                 2286     
           Minimal                 1516     
           Moderate                 757     
            Severe                  173     




In [114]:
# Healthy/Minimal/Doubtful are cut to the default 250 images each; the other
# classes are capped at 500. min_samples=173 equals the Severe count printed
# above, so Severe is kept in full.
trimmed_train_df = trim_data(train_df, 500, 173, 'labels')

In [115]:
# Check the per-class counts after trimming, ordered by class name.
print(trimmed_train_df['labels'].value_counts().sort_index())


labels
Doubtful    250
Healthy     250
Minimal     250
Moderate    500
Severe      173
Name: count, dtype: int64


In [116]:
# Same counts as a formatted table, for comparison with the untrimmed split.
explore_amount(trimmed_train_df)

            CLASS               IMAGE COUNT 
           Doubtful                 250     
           Healthy                  250     
           Minimal                  250     
           Moderate                 500     
            Severe                  173     




Next we remap the classes using the rule {Healthy: Healthy, Doubtful: Healthy, Minimal: Healthy, Moderate: Moderate, Severe: Severe}. This way only the two most advanced severities are classified separately, while the milder grades are treated as healthy.

In [117]:
def relabel(df):
    """Merge the five severity grades into three classes.

    'Doubtful' and 'Minimal' are mapped to 'Healthy'; 'Healthy', 'Moderate'
    and 'Severe' keep their names.

    Args:
        df: DataFrame with a 'labels' column; it is not modified.

    Returns:
        A copy of ``df`` whose 'labels' column holds the merged class names.

    Raises:
        ValueError: If 'labels' contains a non-null value outside the mapping
            (mapping it would silently turn the label into NaN).
    """
    mapping = {
        'Doubtful': 'Healthy',
        'Minimal': 'Healthy',
        'Healthy': 'Healthy',
        'Moderate': 'Moderate',
        'Severe': 'Severe'
    }
    unknown = set(df['labels'].dropna().unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"Unexpected labels that cannot be remapped: {sorted(map(str, unknown))}"
        )
    # Work on a copy so the caller's frame keeps its original labels.
    relabeled = df.copy()
    relabeled['labels'] = relabeled['labels'].map(mapping)
    return relabeled

In [118]:
# Collapse the five grades of the trimmed training split into three classes.
trimmed_train_df = relabel(trimmed_train_df)

In [119]:
# Per-class counts after merging Doubtful and Minimal into Healthy.
explore_amount(trimmed_train_df)

            CLASS               IMAGE COUNT 
           Healthy                  750     
           Moderate                 500     
            Severe                  173     




In [120]:
# Recount the distinct classes now that the labels have been merged.
classes = sorted(trimmed_train_df['labels'].unique())
class_count = len(classes)
print('The number of classes in the dataset is: ', class_count)

The number of classes in the dataset is:  3


In [121]:
# Peek at the first rows; the index still carries the row labels of train_df.
trimmed_train_df.head()

Unnamed: 0,filepaths,labels
749,D:/semestr_10/master_thesis/medical_chatbot/da...,Healthy
691,D:/semestr_10/master_thesis/medical_chatbot/da...,Healthy
289,D:/semestr_10/master_thesis/medical_chatbot/da...,Healthy
1831,D:/semestr_10/master_thesis/medical_chatbot/da...,Healthy
224,D:/semestr_10/master_thesis/medical_chatbot/da...,Healthy


The function below copies the selected images into per-class folders and then augments the "Severe" class, since it has the lowest number of samples. The augmented images are saved to the same output folder for further use.

In [122]:
def prepare_training_folder(df, image_col='filepaths', label_col='labels', 
                            output_dir='training', target_aug_class='Severe', target_count=500):
    """Copy the images listed in ``df`` into per-class folders and augment one class.

    Every image is copied to ``output_dir/<label>/<basename>``. Afterwards the
    folder of ``target_aug_class`` is filled with randomly augmented 224x224
    JPEG images until it holds ``target_count`` image files.

    Args:
        df: DataFrame with one row per image.
        image_col: Column holding the source image paths.
        label_col: Column holding the class label of each image.
        output_dir: Destination root folder; created when missing.
        target_aug_class: Label of the class to augment.
        target_count: Number of image files the augmented class should hold.

    Returns:
        None.

    Raises:
        ValueError: If augmentation is needed but ``df`` has no rows labelled
            ``target_aug_class`` to augment from.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Copying original images to training folder...")
    # Destination -> source for this call, used to tell a genuine name clash
    # (two different sources, same basename) apart from a harmless re-run.
    copied_from = {}
    for _, row in tqdm(df.iterrows(), total=len(df)):
        label = row[label_col]
        src_path = row[image_col]
        dest_dir = os.path.join(output_dir, label)
        os.makedirs(dest_dir, exist_ok=True)
        filename = os.path.basename(src_path)
        dest_path = os.path.join(dest_dir, filename)

        if copied_from.setdefault(dest_path, src_path) != src_path:
            # Another image already claimed this name; prefix the source's
            # parent folder so the image is kept instead of silently skipped.
            parent = os.path.basename(os.path.dirname(src_path))
            dest_path = os.path.join(dest_dir, f"{parent}_{filename}")
            copied_from.setdefault(dest_path, src_path)

        # Files left by an earlier run are not copied again.
        if not os.path.exists(dest_path):
            shutil.copy2(src_path, dest_path)

    class_dir = os.path.join(output_dir, target_aug_class)
    if os.path.isdir(class_dir):
        current_count = len([f for f in os.listdir(class_dir) if f.lower().endswith(('.jpg', '.png', '.jpeg'))])
    else:
        # The target class never occurred in df, so nothing was copied for it.
        current_count = 0
    needed = target_count - current_count

    if needed <= 0:
        print(f"No augmentation needed. '{target_aug_class}' already has {current_count} images.")
        return

    target_df = df[df[label_col] == target_aug_class]
    if target_df.empty:
        raise ValueError(
            f"Cannot augment class '{target_aug_class}': it has no images in the given DataFrame."
        )
    os.makedirs(class_dir, exist_ok=True)

    print(f"\nAugmenting class '{target_aug_class}' with {needed} more images...")

    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.15,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    aug_index = 0
    # The context manager closes the progress bar even if an image fails to load.
    with tqdm(total=needed) as pbar:
        for _ in range(needed):
            aug_path = os.path.join(class_dir, f"{target_aug_class}_aug_{aug_index}.jpg")
            # Skip names left behind by an earlier (possibly interrupted) run;
            # waiting for a taken name to become free would never terminate.
            while os.path.exists(aug_path):
                aug_index += 1
                aug_path = os.path.join(class_dir, f"{target_aug_class}_aug_{aug_index}.jpg")

            sample = target_df.sample(1).iloc[0]
            img = load_img(sample[image_col], target_size=(224, 224))
            x = img_to_array(img)
            x = np.expand_dims(x, axis=0)  # the generator expects a batch axis

            aug_iter = datagen.flow(x, batch_size=1)
            aug_img = next(aug_iter)[0].astype(np.uint8)

            Image.fromarray(aug_img).save(aug_path)
            aug_index += 1
            pbar.update(1)

    print(f"✅ Training folder prepared at '{output_dir}'")

In [123]:
# Copy the trimmed training images into the default 'training' folder and top
# the Severe class up to the default of 500 images.
prepare_training_folder(trimmed_train_df)

Copying original images to training folder...


100%|██████████| 1423/1423 [00:02<00:00, 517.71it/s]



Augmenting class 'Severe' with 327 more images...


100%|██████████| 327/327 [00:08<00:00, 38.38it/s]

✅ Training folder prepared at 'training'





### VALIDATION

In [124]:
# Per-class image counts of the untrimmed validation split.
explore_amount(valid_df)

            CLASS               IMAGE COUNT 
           Doubtful                 153     
           Healthy                  328     
           Minimal                  212     
           Moderate                 106     
            Severe                  27      




In [126]:
# Healthy/Minimal/Doubtful are cut to 35 images each; the other classes are
# capped at 100. min_samples=27 equals the Severe count printed above, so
# Severe is kept in full.
trimmed_valid_df = trim_data(valid_df, 100, 27, 'labels', 35)

In [127]:
# Per-class counts of the validation split after trimming.
explore_amount(trimmed_valid_df)

            CLASS               IMAGE COUNT 
           Doubtful                 35      
           Healthy                  35      
           Minimal                  35      
           Moderate                 100     
            Severe                  27      




In [128]:
# Collapse the five grades of the trimmed validation split into three classes.
trimmed_valid_df = relabel(trimmed_valid_df)

In [129]:
# Per-class counts of the validation split after merging the labels.
explore_amount(trimmed_valid_df)

            CLASS               IMAGE COUNT 
           Healthy                  105     
           Moderate                 100     
            Severe                  27      




In [130]:
# Copy the trimmed validation images into 'valid' and top Severe up to 100 images.
# NOTE(review): this adds synthetic (augmented) Severe images to the validation
# set - confirm that this is intended, since evaluation data is usually left
# unaugmented.
prepare_training_folder(trimmed_valid_df, output_dir='valid', target_count=100)

Copying original images to training folder...


100%|██████████| 232/232 [00:02<00:00, 97.84it/s]



Augmenting class 'Severe' with 73 more images...


100%|██████████| 73/73 [00:00<00:00, 80.81it/s]

✅ Training folder prepared at 'valid'





### TEST

In [125]:
# Per-class image counts of the untrimmed test split.
explore_amount(test_df)

            CLASS               IMAGE COUNT 
           Doubtful                 296     
           Healthy                  639     
           Minimal                  447     
           Moderate                 223     
            Severe                  51      




In [133]:
# Healthy/Minimal/Doubtful are cut to 35 images each; the other classes are
# capped at 100. min_samples=51 equals the Severe count printed above, so
# Severe is kept in full.
trimmed_test_df = trim_data(test_df, 100, 51, column_name='labels', label_samples=35)

In [134]:
# Per-class counts of the test split after trimming.
explore_amount(trimmed_test_df)

            CLASS               IMAGE COUNT 
           Doubtful                 35      
           Healthy                  35      
           Minimal                  35      
           Moderate                 100     
            Severe                  51      




In [135]:
# Collapse the five grades of the trimmed test split into three classes.
trimmed_test_df = relabel(trimmed_test_df)

In [136]:
# Per-class counts of the test split after merging the labels.
explore_amount(trimmed_test_df)

            CLASS               IMAGE COUNT 
           Healthy                  105     
           Moderate                 100     
            Severe                  51      




In [138]:
# Copy the trimmed test images into 'test' and top Severe up to 100 images.
# NOTE(review): this adds synthetic (augmented) Severe images to the test set -
# confirm that this is intended, since reported test metrics would then be
# computed partly on augmented data.
prepare_training_folder(trimmed_test_df, output_dir='test', target_count=100)

Copying original images to training folder...


100%|██████████| 256/256 [00:00<00:00, 704.46it/s]



Augmenting class 'Severe' with 49 more images...


100%|██████████| 49/49 [00:00<00:00, 56.27it/s]

✅ Training folder prepared at 'test'



