<a href="https://colab.research.google.com/github/meenub255/NXP_SEM_DEFECT/blob/main/Synthethic_Generation_for_Dataset_of_SEM_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
import random
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch
import torchvision.transforms as T
import matplotlib.pyplot as plt
# ==========================================
# 1. Configuration
# ==========================================
# Input folder containing class subfolders (e.g., /content/dataset/opens, /content/dataset/bridges)
INPUT_DATASET_PATH = "/content/drive/MyDrive/sem"
# Output folder where augmented images will be saved
OUTPUT_DATASET_PATH = "/content/drive/MyDrive/Hackathon_Images/Augmented_Dataset_10x"
# How many augmented versions to create per original image
# 50 images * 10 = 500 images (Meets Hackathon Req)
AUGMENTATION_FACTOR = 10
# ==========================================
# 2. Augmentation Strategy
# ==========================================
def get_augmentation_pipeline(img_size=224):
    """
    Defines the random transformations to apply.
    """
    return T.Compose([
        T.Resize((img_size + 32, img_size + 32)),  # Resize slightly larger
        T.RandomCrop(img_size),                     # Random crop
        T.RandomHorizontalFlip(p=0.5),              # 50% chance to flip LR
        T.RandomVerticalFlip(p=0.5),                # 50% chance to flip UD (Valid for wafer dies)
        T.RandomRotation(degrees=45),               # Rotate
        T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2), # Lighting noise
        T.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)), # Zoom/Shift
        # Note: We do NOT convert to Tensor here because we want to save as Image
    ])
# ==========================================
# 3. Execution Engine
# ==========================================
def run_augmentation():
    # 0. Setup Paths
    input_path = Path(INPUT_DATASET_PATH)
    output_path = Path(OUTPUT_DATASET_PATH)

    if not input_path.exists():
        print(f"Error: Input path '{input_path}' does not exist.")
        print("Please mount Google Drive and check the path.")
        return
    # Create output directory
    if output_path.exists():
        user_input = input(f"Output path '{output_path}' exists. Overwrite? (y/n): ")
        if user_input.lower() == 'y':
            shutil.rmtree(output_path)
        else:
            print("Aborted.")
            return
    output_path.mkdir(parents=True, exist_ok=True)
    # 1. Pipeline
    augmenter = get_augmentation_pipeline(img_size=224)

    # 2. Iterate Classes
    classes = [d for d in input_path.iterdir() if d.is_dir()]
    print(f"Found {len(classes)} classes: {[c.name for c in classes]}")
    for class_dir in tqdm(classes, desc="Processing Classes"):
        class_name = class_dir.name

        # Create corresponding output class folder
        target_dir = output_path / class_name
        target_dir.mkdir(exist_ok=True)

        images = list(class_dir.glob("*.[jJpP][pPnN][gG]")) # Match jpg, png, jpeg
        print(f"  Class '{class_name}': Found {len(images)} images.")

        for img_file in images:
            try:
                # Open Image
                with Image.open(img_file).convert("RGB") as img:

                    # Save Original (Clean copy)
                    save_name = f"{img_file.stem}_orig.jpg"
                    img.resize((224, 224)).save(target_dir / save_name, quality=95)

                    # Generate Augmented Versions
                    for i in range(AUGMENTATION_FACTOR):
                        aug_img = augmenter(img)

                        # Save Augmented
                        save_name = f"{img_file.stem}_aug_{i}.jpg"
                        aug_img.save(target_dir / save_name, quality=90)

            except Exception as e:
                print(f"    Error processing {img_file.name}: {e}")
    print(f"\nSuccess! Augmented dataset saved to: {output_path}")
    print(f"Start Count: ~{sum([len(list(c.glob('*'))) for c in classes])}")
    print(f"Final Count: ~{sum([len(list(c.glob('*'))) for c in output_path.iterdir()])}")
# ==========================================
# 4. Run
# ==========================================
if __name__ == "__main__":
    # Ensure Torchvision is installed
    try:
        import torchvision
        run_augmentation()
    except ImportError:
        print("Installing dependencies...")
        os.system("pip install torchvision tqdm")
        run_augmentation()

Output path '/content/drive/MyDrive/Hackathon_Images/Augmented_Dataset_10x' exists. Overwrite? (y/n): y
Found 5 classes: ['opens', 'cracks', 'bridge', 'vias', 'CMP']


Processing Classes:   0%|          | 0/5 [00:00<?, ?it/s]

  Class 'opens': Found 49 images.


Processing Classes:  20%|██        | 1/5 [00:42<02:48, 42.15s/it]

  Class 'cracks': Found 21 images.


Processing Classes:  40%|████      | 2/5 [01:03<01:30, 30.07s/it]

  Class 'bridge': Found 22 images.


Processing Classes:  60%|██████    | 3/5 [01:24<00:51, 25.71s/it]

  Class 'vias': Found 2 images.


Processing Classes:  80%|████████  | 4/5 [01:25<00:16, 16.10s/it]

  Class 'CMP': Found 70 images.


Processing Classes: 100%|██████████| 5/5 [02:22<00:00, 28.42s/it]


Success! Augmented dataset saved to: /content/drive/MyDrive/Hackathon_Images/Augmented_Dataset_10x
Start Count: ~167
Final Count: ~1804





In [2]:
from pathlib import Path

def count_augmented_images_per_class(output_path_str):
    output_path = Path(output_path_str)
    if not output_path.exists():
        print(f"Error: Output path '{output_path}' does not exist.")
        return

    class_counts = {}
    classes = [d for d in output_path.iterdir() if d.is_dir()]
    print(f"Counting images in {len(classes)} classes:")

    for class_dir in classes:
        class_name = class_dir.name
        # Count all image files (jpg, png, jpeg) in the class directory
        images = list(class_dir.glob("*.[jJpP][pPnN][gG]"))
        class_counts[class_name] = len(images)

    print("\n--- Augmented Image Counts per Class ---")
    for class_name, count in class_counts.items():
        print(f"Class '{class_name}': {count} images")
    print("----------------------------------------")

# Assuming OUTPUT_DATASET_PATH is defined in the previous cells
if 'OUTPUT_DATASET_PATH' in globals():
    count_augmented_images_per_class(OUTPUT_DATASET_PATH)
else:
    print("OUTPUT_DATASET_PATH not defined. Please run the augmentation cell first.")

Counting images in 5 classes:

--- Augmented Image Counts per Class ---
Class 'opens': 539 images
Class 'cracks': 231 images
Class 'bridge': 242 images
Class 'vias': 22 images
Class 'CMP': 770 images
----------------------------------------


In [5]:
# @title Push Code to GitHub from Colab
# Run this cell to push your work to https://github.com/Shashwath-K/nxp_semi_project.git

import os
from getpass import getpass

# 1. Configuration
USER_NAME = "meenub255"
USER_EMAIL = "meenub255@gmail.com" # <--- REPLACE THIS
REPO_NAME = "NXP_SEM_DEFECT"
REPO_URL = f"https://github.com/{USER_NAME}/{REPO_NAME}.git"

# 2. Files to Push (List of files you want to save)
# Assuming you have saved the previous scripts as .py files in Colab
FILES_TO_PUSH = [
    "data_augmentation.py",        # From previous steps
    "colab_training_pipeline.py",  # From previous steps
    "best_student_model.pth"       # If you want to push the model (Warning: Large file!)
]

# 3. Authentication (Securely ask for Token)
print("Please enter your GitHub Personal Access Token (PAT):")
print("(Settings -> Developer settings -> Personal access tokens -> Tokens (classic))")
token = getpass()

# 4. Git Operations
def push_to_github():
    # Configure Git
    !git config --global user.email "{USER_EMAIL}"
    !git config --global user.name "{USER_NAME}"

    # Clone (into a temp folder to avoid messy paths)
    if os.path.exists(REPO_NAME):
        !rm -rf {REPO_NAME}

    print(f"Cloning {REPO_NAME}...")
    !git clone https://{token}@github.com/{USER_NAME}/{REPO_NAME}.git

    # Move files into Repo
    print("Moving files to repo...")
    import shutil
    for file in FILES_TO_PUSH:
        if os.path.exists(file):
            shutil.copy(file, f"{REPO_NAME}/{file}")
            print(f"  Added: {file}")
        else:
            print(f"  Warning: {file} not found in Colab root.")

    # Commit & Push
    os.chdir(REPO_NAME)
    !git add .
    !git commit -m "Add Hackathon training pipeline and data augmentation scripts"
    !git push origin main
    print("\nSuccess! Code pushed to GitHub.")
    os.chdir(".")

push_to_github()

Please enter your GitHub Personal Access Token (PAT):
(Settings -> Developer settings -> Personal access tokens -> Tokens (classic))
··········
Cloning NXP_SEM_DEFECT...
Cloning into 'NXP_SEM_DEFECT'...
Moving files to repo...
On branch main

Initial commit

nothing to commit (create/copy files and use "git add" to track)
error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/meenub255/NXP_SEM_DEFECT.git'
[m
Success! Code pushed to GitHub.
