In [1]:
!pip install -q torch torchvision tqdm scikit-learn pandas matplotlib



In [2]:
import os
import random
import re
import time
import json
from collections import Counter

import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from tqdm.auto import tqdm

In [11]:
# Dataset root. Set the OPENLOGO_DIR environment variable to point the
# notebook at another copy of the dataset; the fallback is the original path.
BASE_DIR = os.environ.get(
    "OPENLOGO_DIR",
    r"C:\Users\akash\Desktop\Email_Phishing_Detection\openlogo",
)

IMG_DIR = os.path.join(BASE_DIR, "JPEGImages")
CLASS_SEP_DIR = os.path.join(BASE_DIR, "ImageSets", "class_sep")
TRAIN_TXT = os.path.join(CLASS_SEP_DIR, "train.txt")
TEST_TXT  = os.path.join(CLASS_SEP_DIR, "test.txt")

# Fail fast with a readable message if the expected layout is not there.
assert os.path.isdir(BASE_DIR), f"BASE_DIR not found: {BASE_DIR}"
assert os.path.isdir(IMG_DIR), f"JPEGImages folder not found under {BASE_DIR}"
assert os.path.exists(TRAIN_TXT), f"train.txt not found: {TRAIN_TXT}"
assert os.path.exists(TEST_TXT), f"test.txt not found: {TEST_TXT}"


def read_image_ids(list_path):
    """Return the non-blank lines of ``list_path`` with surrounding whitespace removed.

    Entries are kept in file order and duplicates are preserved.
    """
    with open(list_path, 'r') as f:
        return [line.strip() for line in f if line.strip()]


# Read names (no extension in files)
train_names = read_image_ids(TRAIN_TXT)
test_names = read_image_ids(TEST_TXT)

print(f"Loaded {len(train_names)} train names and {len(test_names)} test names")
print("Example train entries:", train_names[:8])

Loaded 37710 train names and 16456 test names
Example train entries: ['3m10', '3m11', '3m12', '3m13', '3m14', '3m15', '3m16', '3m17']


In [15]:
from sklearn.model_selection import train_test_split

# Split parameters, named so they can be tuned in one place.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

# Split on unique image IDs (not on crops) so every crop of an image ends up
# on the same side of the train/val boundary. Sorting makes the input order,
# and therefore the seeded split, reproducible.
train_names_unique = sorted(set(train_names))  # dedupe image IDs
train_ids, val_ids = train_test_split(
    train_names_unique, test_size=VAL_FRACTION, random_state=SPLIT_SEED
)

# Sanity checks on the split itself.
assert not set(train_ids) & set(val_ids), "train/val splits overlap"
assert len(train_ids) + len(val_ids) == len(train_names_unique), "split lost or duplicated IDs"

# An image listed in both files would be cropped into train/val and into test,
# which contaminates the evaluation. Report it without aborting the notebook.
shared_with_test = set(train_names_unique).intersection(test_names)
if shared_with_test:
    print(
        f"WARNING: {len(shared_with_test)} image IDs are listed in both "
        "train.txt and test.txt; crops from those images will land in the "
        "test split as well as train/val"
    )

print(f"Train images: {len(train_ids)}, Val images: {len(val_ids)}, Test images: {len(set(test_names))}")

Train images: 18391, Val images: 4598, Test images: 12362


In [16]:
import os
from xml.etree import ElementTree as ET
from PIL import Image
from tqdm import tqdm

ANN_DIR = os.path.join(BASE_DIR, "Annotations")
OUT_DIR = os.path.join(BASE_DIR, "cropped_logos")

# crop_split skips every image whose XML file is missing, so a wrong ANN_DIR
# would otherwise produce an empty crop set without raising any error.
assert os.path.isdir(ANN_DIR), f"Annotations folder not found under {BASE_DIR}"
os.makedirs(OUT_DIR, exist_ok=True)

def parse_annotation(xml_file):
    """Read the labelled bounding boxes from one VOC-style annotation file.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A list with one dict per usable ``<object>`` element, in document
        order: ``{"label": str, "bbox": [xmin, ymin, xmax, ymax]}`` with
        integer coordinates. An object that has no name, no ``<bndbox>`` or a
        missing/non-numeric coordinate is skipped instead of aborting the
        whole file.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()
    objs = []
    for obj in root.findall("object"):
        # findtext returns None when the tag is absent; treat that like an empty name.
        label = (obj.findtext("name") or "").strip()
        bb = obj.find("bndbox")
        if not label or bb is None:
            continue
        try:
            # int(float(...)) accepts both "12" and "12.0"; fractions are truncated.
            bbox = [
                int(float(bb.findtext(tag)))
                for tag in ("xmin", "ymin", "xmax", "ymax")
            ]
        except (TypeError, ValueError, OverflowError):
            # Missing tag (None), non-numeric text, or nan/inf.
            continue
        objs.append({"label": label, "bbox": bbox})
    return objs

def crop_split(image_ids, split_name):
    """Crop every annotated box of the given images into per-class folders.

    For each image ID, ``<IMG_DIR>/<id>.jpg`` is opened and each box returned
    by ``parse_annotation(<ANN_DIR>/<id>.xml)`` is saved as
    ``<OUT_DIR>/<split_name>/<label>/<id>_<i>.jpg``, where ``i`` is the
    object's index in the annotation. Existing files of the same name are
    overwritten.

    Args:
        image_ids: Iterable of image IDs (file names without extension).
        split_name: Sub-folder of ``OUT_DIR`` to write into, e.g. ``"train"``.

    Images whose .jpg or .xml file is missing are skipped; the number skipped
    is printed once at the end so an incomplete dataset does not go unnoticed.
    """
    split_dir = os.path.join(OUT_DIR, split_name)
    os.makedirs(split_dir, exist_ok=True)
    missing = 0
    for img_id in tqdm(image_ids, desc=f"Cropping {split_name}"):
        img_path = os.path.join(IMG_DIR, img_id + ".jpg")
        ann_path = os.path.join(ANN_DIR, img_id + ".xml")
        if not (os.path.exists(img_path) and os.path.exists(ann_path)):
            missing += 1
            continue
        # Image.open is lazy; convert() reads the pixels into a new image, so
        # the file handle can be released as soon as the with-block exits.
        with Image.open(img_path) as src:
            img = src.convert("RGB")
        w, h = img.size  # same for every box of this image
        for i, obj in enumerate(parse_annotation(ann_path)):
            xmin, ymin, xmax, ymax = obj["bbox"]
            # Clamp the box to the image and drop it if nothing is left.
            xmin, ymin = max(0, xmin), max(0, ymin)
            xmax, ymax = min(w, xmax), min(h, ymax)
            if xmax <= xmin or ymax <= ymin:
                continue
            class_dir = os.path.join(split_dir, obj["label"])
            os.makedirs(class_dir, exist_ok=True)
            # The object index keeps several boxes of one image from colliding.
            img.crop((xmin, ymin, xmax, ymax)).save(
                os.path.join(class_dir, f"{img_id}_{i}.jpg")
            )
    if missing:
        print(f"[{split_name}] skipped {missing} image(s) with a missing .jpg or .xml file")

# One-off preprocessing step: write the crops for each split to disk.
for split_ids, split_label in (
    (train_ids, "train"),
    (val_ids, "val"),
    (set(test_names), "test"),
):
    crop_split(split_ids, split_label)

Cropping train: 100%|██████████| 18391/18391 [26:04<00:00, 11.76it/s]
Cropping val: 100%|██████████| 4598/4598 [07:20<00:00, 10.44it/s]
Cropping test: 100%|██████████| 12362/12362 [12:36<00:00, 16.34it/s]
