In [1]:
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split

In [2]:
# Input: the original training split. The undersampling cell treats each
# entry of this folder as one class.
# NOTE(review): both paths are relative to the notebook's working directory.
original_train_dir = "multi_class/train"

# Output: where the undersampled copy of the training split is written.
undersampled_train_dir = "multi_class_undersampling/train"

In [3]:
# Create the output root (and any missing parents); a no-op if it already exists.
os.makedirs(name=undersampled_train_dir, exist_ok=True)

In [4]:
# Class to undersample, and the number of its images to keep.
target_class = "Ductal Carcinoma"
# 710 matches the size of the second-largest class.
target_count = 710

In [5]:
# Build the undersampled training split: copy every class folder from
# `original_train_dir` into `undersampled_train_dir`, keeping only
# `target_count` randomly chosen images of `target_class` and all images of
# every other class.
#
# The selection is reproducible: a dedicated, seeded RNG is used and directory
# listings are sorted first (os.listdir returns entries in arbitrary order).
# Re-running the cell on an unchanged source therefore selects the same files
# instead of piling a second random subset on top of the first.
UNDERSAMPLE_SEED = 42
undersample_rng = random.Random(UNDERSAMPLE_SEED)

for cls in sorted(os.listdir(original_train_dir)):
    src_folder = os.path.join(original_train_dir, cls)
    if not os.path.isdir(src_folder):
        # Stray files in the train root are not class folders.
        continue

    dst_folder = os.path.join(undersampled_train_dir, cls)
    os.makedirs(dst_folder, exist_ok=True)

    # Only regular files can be copied with shutil.copy; sort for determinism.
    images = sorted(
        name for name in os.listdir(src_folder)
        if os.path.isfile(os.path.join(src_folder, name))
    )

    if cls == target_class:
        # Keep a random subset; min() guards against a class that already
        # has fewer than `target_count` images.
        selected = undersample_rng.sample(images, min(target_count, len(images)))
    else:
        # Copy all images for other classes
        selected = images

    for img in selected:
        shutil.copy(os.path.join(src_folder, img),
                    os.path.join(dst_folder, img))

    print(f"{cls}: kept {len(selected)} images")

    # Entries left over from an earlier run (e.g. a different random subset)
    # would silently inflate the class, so surface them.
    extra = len(os.listdir(dst_folder)) - len(selected)
    if extra > 0:
        print(f"⚠️ {dst_folder} contains {extra} extra entries from a previous run")

print("✅ Undersampled training set created at:", undersampled_train_dir)

Adenosis: kept 311 images
Ductal Carcinoma: kept 710 images
Fibroadenoma: kept 710 images
Lobular Carcinoma: kept 438 images
Mucinous Carcinoma: kept 554 images
Papillary Carcinoma: kept 392 images
Phyllodes Tumor: kept 317 images
Tubular Adenoma: kept 398 images
✅ Undersampled training set created at: multi_class_undersampling/train


In [15]:
# Copy the validation and test splits unchanged from the original dataset
# into the undersampled dataset root (only the training split is resampled).
#
# Paths are built with os.path.join so they resolve on any OS; on Windows the
# resulting strings are identical to the previous backslash literals.
# NOTE(review): these roots are relative to the parent directory, while the
# training paths above are relative to the working directory - confirm both
# point at the same dataset folder.
original_root = os.path.join("..", "Multi_Class_Classification", "multi_class")
new_root = os.path.join("..", "Multi_Class_Classification", "multi_class_undersampling")

for subset in ["validation", "test"]:
    src = os.path.join(original_root, subset)
    dst = os.path.join(new_root, subset)

    if not os.path.exists(src):
        print(f"❌ Source folder not found: {src}")
        continue

    # copytree refuses to write into an existing directory, so an existing
    # destination is reported and left untouched rather than merged.
    if os.path.exists(dst):
        print(f"⚠️ Destination folder already exists, skipping: {dst}")
    else:
        shutil.copytree(src, dst)
        print(f"✅ Copied {subset} folder to {dst}")

✅ Copied validation folder to ..\Multi_Class_Classification\multi_class_undersampling\validation
✅ Copied test folder to ..\Multi_Class_Classification\multi_class_undersampling\test


In [16]:
# Report how many entries each class folder holds in every split of the
# undersampled dataset, as a sanity check on the copy steps.
#
# The root is built with os.path.join so it resolves on any OS; on Windows the
# resulting string is identical to the previous backslash literal.
root = os.path.join("..", "Multi_Class_Classification", "multi_class_undersampling")

for subset in ["train", "validation", "test"]:
    subset_path = os.path.join(root, subset)
    print(f"\n{subset.upper()}:")

    if not os.path.isdir(subset_path):
        # Report a missing split instead of aborting the whole summary.
        print(f"  ❌ Folder not found: {subset_path}")
        continue

    # Sorted so the report order does not depend on the filesystem.
    for cls in sorted(os.listdir(subset_path)):
        cls_path = os.path.join(subset_path, cls)
        if os.path.isdir(cls_path):
            num_images = len(os.listdir(cls_path))
            print(f"  {cls}: {num_images} images")


TRAIN:
  Adenosis: 311 images
  Ductal Carcinoma: 710 images
  Fibroadenoma: 710 images
  Lobular Carcinoma: 438 images
  Mucinous Carcinoma: 554 images
  Papillary Carcinoma: 392 images
  Phyllodes Tumor: 317 images
  Tubular Adenoma: 398 images

VALIDATION:
  Adenosis: 67 images
  Ductal Carcinoma: 517 images
  Fibroadenoma: 152 images
  Lobular Carcinoma: 94 images
  Mucinous Carcinoma: 119 images
  Papillary Carcinoma: 84 images
  Phyllodes Tumor: 68 images
  Tubular Adenoma: 85 images

TEST:
  Adenosis: 66 images
  Ductal Carcinoma: 518 images
  Fibroadenoma: 152 images
  Lobular Carcinoma: 94 images
  Mucinous Carcinoma: 119 images
  Papillary Carcinoma: 84 images
  Phyllodes Tumor: 68 images
  Tubular Adenoma: 86 images
