### Dependencies

In [68]:
# File managing
import os, zipfile
import shutil
from pathlib import Path
from collections import Counter

# Data wrangling and manipulation
import pandas as pd
import numpy as np
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# CNN
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Input, Conv2D, BatchNormalization,
                                     MaxPooling2D, Dropout,
                                     GlobalAveragePooling2D, Dense)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import mixed_precision
from tensorflow.keras import backend as K

# Evaluation
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix

# Single seed shared by every random number source used in this notebook.
SEED = 333

# Seed Python's RNG, NumPy's legacy global RNG and TensorFlow, in that order.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Turn on JIT compilation through the TensorFlow optimizer config.
tf.config.optimizer.set_jit(True)

In [48]:
# Show which GPU devices TensorFlow can see (an empty list means none).
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## 01 Data Extraction

### Clean Workspace

In [None]:
# Artifacts of previous runs; removing them makes the notebook start from scratch.
folders_to_clear = ['./data/simpsons', './data/simpsons_split', './models']

for stale_dir in folders_to_clear:
    # Nothing to remove: report it and move on to the next folder.
    if not os.path.exists(stale_dir):
        print(f"Does not exist: {stale_dir}")
        continue

    shutil.rmtree(stale_dir)
    print(f"Deleted: {stale_dir}")


Deleted: ./data/simpsons
Deleted: ./data/simpsons_split
Does not exist: ./data/simpsons_top_18
Deleted: ./models


### Extract Zip

In [50]:
ZIP_PATH  = Path("data/simpsons.zip")
ROOT_DATA_PATH = Path("data/")
DATA_PATH = Path("data/simpsons/")

# Fail early with a clear message instead of a bare zipfile error.
if not ZIP_PATH.is_file():
    raise FileNotFoundError(f"Dataset archive not found: {ZIP_PATH}")

# exist_ok=True already covers the "directory is there" case, so no
# separate exists() check is needed.
ROOT_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Unzip
with zipfile.ZipFile(ZIP_PATH, "r") as archive:
    archive.extractall(ROOT_DATA_PATH)

# Later cells list DATA_PATH directly, so verify that the archive really
# unpacked into that folder before going any further.
if not DATA_PATH.is_dir():
    raise FileNotFoundError(
        f"Expected {DATA_PATH} after extracting {ZIP_PATH}, but it was not created"
    )

### Class Distribution

In [51]:
# Every .jpg file sitting directly inside the extracted dataset folder.
filenames = [name for name in os.listdir(DATA_PATH) if name.endswith(".jpg")]

# The class label is whatever precedes the first "_pic" in the file name.
classes = [name.partition("_pic")[0] for name in filenames]

# Number of images per class, ranked from most to least frequent.
class_counts = Counter(classes)
ranked_counts = class_counts.most_common()

for cl, count in ranked_counts:
    print(f"{cl}: {count}")

# Keep only the 18 most frequent classes for the rest of the notebook.
top_classes = {cl for cl, _ in ranked_counts[:18]}
print("\nTop 18 classes:", top_classes)

homer_simpson: 2246
ned_flanders: 1454
moe_szyslak: 1452
lisa_simpson: 1354
bart_simpson: 1342
marge_simpson: 1291
krusty_the_clown: 1206
principal_skinner: 1194
charles_montgomery_burns: 1193
milhouse_van_houten: 1079
chief_wiggum: 986
abraham_grampa_simpson: 913
sideshow_bob: 877
apu_nahasapeemapetilon: 623
kent_brockman: 498
comic_book_guy: 469
edna_krabappel: 457
nelson_muntz: 358
lenny_leonard: 310
mayor_quimby: 246
waylon_smithers: 181
maggie_simpson: 128
groundskeeper_willie: 121
barney_gumble: 106
selma_bouvier: 103
carl_carlson: 98
ralph_wiggum: 89
patty_bouvier: 72
martin_prince: 71
professor_john_frink: 65
snake_jailbird: 55
cletus_spuckler: 47
rainier_wolfcastle: 45
agnes_skinner: 42
sideshow_mel: 40
otto_mann: 32
fat_tony: 27
gil: 27
miss_hoover: 17
disco_stu: 8
troy_mcclure: 8
lionel_hutz: 3

Top 18 classes: {'apu_nahasapeemapetilon', 'principal_skinner', 'sideshow_bob', 'marge_simpson', 'lisa_simpson', 'moe_szyslak', 'kent_brockman', 'homer_simpson', 'edna_krabappel', 'm

In [52]:
# Count the .jpg files in the dataset folder, ignoring the case of the extension.
image_count = sum(
    1 for entry in os.listdir(DATA_PATH)
    if entry.lower().endswith(".jpg")
)

print(f"Total of images in {DATA_PATH}: {image_count}")

Total of images in data/simpsons: 20933


## 02 Data Preparation

### Train/Val/Test Split

In [53]:
SPLIT_PATH = ROOT_DATA_PATH / "simpsons_split"
TRAIN_DIR  = SPLIT_PATH / "train"
VAL_DIR    = SPLIT_PATH / "val"
TEST_DIR   = SPLIT_PATH / "test"

# Share of each class that goes to train / val; the remainder goes to test.
TRAIN_FRACTION = 0.70
VAL_FRACTION   = 0.20

# `top_classes` is a set of strings, and its iteration order changes between
# interpreter sessions (string hashes are randomized per process). Each class
# consumes random numbers from the shared RNG, so iterating the raw set made
# the split differ after every kernel restart despite the fixed seed.
# A sorted list pins the order and makes the split reproducible.
split_classes = sorted(top_classes)

random.seed(SEED)

# One sub-folder per class inside each split directory.
for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    for cl in split_classes:
        (split_dir / cl).mkdir(parents=True, exist_ok=True)

# Copy files only when no training folder holds anything yet, so re-running
# the cell does not duplicate or reshuffle an existing split.
already_split = any(any((TRAIN_DIR / cl).iterdir()) for cl in split_classes)

if not already_split:
    for cl in split_classes:
        # Sort before shuffling so the result does not depend on listing order.
        cl_files = sorted(f for f in filenames if f.startswith(f"{cl}_pic"))
        random.shuffle(cl_files)

        n_total = len(cl_files)
        n_train = int(TRAIN_FRACTION * n_total)
        n_val   = int(VAL_FRACTION * n_total)

        for i, src_name in enumerate(cl_files):
            if i < n_train:
                target_dir = TRAIN_DIR
            elif i < n_train + n_val:
                target_dir = VAL_DIR
            else:
                target_dir = TEST_DIR
            shutil.copy2(DATA_PATH / src_name, target_dir / cl / src_name)

def count_per_split(split_dir):
    """Return {class name: number of entries} for each class folder in `split_dir`.

    Classes are taken from `top_classes` and visited in alphabetical order.
    """
    counts = {}
    for cls in sorted(top_classes):
        counts[cls] = sum(1 for _ in (split_dir / cls).iterdir())
    return counts


# Report the per-class totals of every split.
for label, directory in (
    ("Train counts:", TRAIN_DIR),
    ("Val counts:", VAL_DIR),
    ("Test counts:", TEST_DIR),
):
    print(label, count_per_split(directory))


Train counts: {'abraham_grampa_simpson': 639, 'apu_nahasapeemapetilon': 436, 'bart_simpson': 939, 'charles_montgomery_burns': 835, 'chief_wiggum': 690, 'comic_book_guy': 328, 'edna_krabappel': 319, 'homer_simpson': 1572, 'kent_brockman': 348, 'krusty_the_clown': 844, 'lisa_simpson': 947, 'marge_simpson': 903, 'milhouse_van_houten': 755, 'moe_szyslak': 1016, 'ned_flanders': 1017, 'nelson_muntz': 250, 'principal_skinner': 835, 'sideshow_bob': 613}
Val counts: {'abraham_grampa_simpson': 182, 'apu_nahasapeemapetilon': 124, 'bart_simpson': 268, 'charles_montgomery_burns': 238, 'chief_wiggum': 197, 'comic_book_guy': 93, 'edna_krabappel': 91, 'homer_simpson': 449, 'kent_brockman': 99, 'krusty_the_clown': 241, 'lisa_simpson': 270, 'marge_simpson': 258, 'milhouse_van_houten': 215, 'moe_szyslak': 290, 'ned_flanders': 290, 'nelson_muntz': 71, 'principal_skinner': 238, 'sideshow_bob': 175}
Test counts: {'abraham_grampa_simpson': 92, 'apu_nahasapeemapetilon': 63, 'bart_simpson': 135, 'charles_montg

### ImageDataGenerators

In [54]:
TARGET_SIZE = (128, 192)
BATCH_SIZE  = 96
mixed_precision.set_global_policy("mixed_float16")

# Keyword arguments common to the three directory iterators below.
flow_options = dict(
    target_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)

# Training images are rescaled and augmented (zoom + horizontal flip);
# validation and test images are only rescaled.
dgen_train = ImageDataGenerator(rescale=1./255, zoom_range=0.2, horizontal_flip=True)
dgen_val_test = ImageDataGenerator(rescale=1./255)

# Only the training iterator shuffles, seeded for repeatability.
train_generator = dgen_train.flow_from_directory(
    TRAIN_DIR, shuffle=True, seed=SEED, **flow_options
)

# Validation and test keep directory order (shuffle=False).
validation_generator = dgen_val_test.flow_from_directory(
    VAL_DIR, shuffle=False, **flow_options
)
test_generator = dgen_val_test.flow_from_directory(
    TEST_DIR, shuffle=False, **flow_options
)

Found 13286 images belonging to 18 classes.
Found 3789 images belonging to 18 classes.
Found 1917 images belonging to 18 classes.


### Calculate Class Weights

In [55]:
class_indices = train_generator.class_indices

# Integer label of every training image, taken from the generator itself.
# The previous version rebuilt this vector from directory counts and relied
# on two independent alphabetical orderings happening to agree.
train_labels = train_generator.classes

# Images per class, indexed by class id (kept for any later use).
train_counts = np.bincount(train_labels, minlength=len(class_indices))

# 'balanced' gives rarer classes proportionally larger weights.
weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(class_indices)),
    y=train_labels
)

# Plain Python ints/floats keep the mapping readable when printed.
class_weight = {int(idx): float(w) for idx, w in enumerate(weights)}
print(class_weight)

{0: np.float64(1.1551034602677794), 1: np.float64(1.6929153924566769), 2: np.float64(0.7860608212045912), 3: np.float64(0.8839654025282768), 4: np.float64(1.0697262479871175), 5: np.float64(2.250338753387534), 6: np.float64(2.31382793451759), 7: np.float64(0.4695363302233531), 8: np.float64(2.121008939974457), 9: np.float64(0.8745392311743022), 10: np.float64(0.7794203918807932), 11: np.float64(0.8173987941429802), 12: np.float64(0.9776306107431936), 13: np.float64(0.7264873140857393), 14: np.float64(0.7257729706107288), 15: np.float64(2.9524444444444446), 16: np.float64(0.8839654025282768), 17: np.float64(1.2040964292187784)}


## 03 Model Architecture

In [56]:
IMG_SIZE = TARGET_SIZE
INPUT_SHAPE  = (*IMG_SIZE, 3)
NUM_CLASSES  = train_generator.num_classes


def _conv_block(filters, dropout_rate):
    """Return the layers of one convolutional block.

    Layout: Conv2D -> BatchNormalization -> Conv2D -> MaxPooling2D -> Dropout,
    with both convolutions using `filters` 3x3 kernels, ReLU and 'same' padding.
    """
    return [
        Conv2D(filters, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        Conv2D(filters, (3, 3), activation='relu', padding='same'),
        MaxPooling2D(),
        Dropout(dropout_rate),
    ]


# define layers: three convolutional blocks followed by a dense classifier
model = Sequential([
    Input(shape=INPUT_SHAPE),

    *_conv_block(32, 0.25),    # first convolutional block
    *_conv_block(64, 0.25),    # second convolutional block
    *_conv_block(128, 0.32),   # third convolutional block

    # dense classifier head
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # float32 output avoids underflow when using mixed precision
    Dense(NUM_CLASSES, activation='softmax', dtype='float32'),
])

# optimizer and loss function setup
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# define training callbacks: both watch the validation loss
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1
)
checkpoint = ModelCheckpoint(
    filepath= 'models/best_model_{epoch:02d}-{val_accuracy:.3f}.keras',
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)
callbacks = [early_stopping, checkpoint]

model.summary()

## 04 CNN training

In [57]:

# Upper bound on epochs; the EarlyStopping callback may end training sooner.
MAX_EPOCHS = 100

# Train with class weighting and evaluate on the validation split each epoch.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
    class_weight=class_weight,
)


Your `PyDataset` class should call `super().__init__(**kwargs)` in its constructor. `**kwargs` can include `workers`, `use_multiprocessing`, `max_queue_size`. Do not pass these arguments to `fit()`, as they will be ignored.



Epoch 1/100
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - accuracy: 0.1358 - loss: 2.7740
Epoch 1: val_loss improved from inf to 3.03311, saving model to models/best_model_01-0.062.keras
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 219ms/step - accuracy: 0.1361 - loss: 2.7726 - val_accuracy: 0.0623 - val_loss: 3.0331
Epoch 2/100
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - accuracy: 0.3158 - loss: 2.1275
Epoch 2: val_loss did not improve from 3.03311
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 191ms/step - accuracy: 0.3161 - loss: 2.1264 - val_accuracy: 0.0631 - val_loss: 3.5123
Epoch 3/100
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - accuracy: 0.4718 - loss: 1.5992
Epoch 3: val_loss improved from 3.03311 to 2.39413, saving model to models/best_model_03-0.278.keras
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 191ms/step 

In [64]:
# Persist the trained model in the working directory.
FINAL_MODEL_FILE = "best_model.keras"
model.save(FINAL_MODEL_FILE)

## 05 Model Evaluation

In [69]:
# Class names in the order of the generator's class_indices mapping.
class_names = list(test_generator.class_indices)

# Ground-truth labels and predicted labels for the test split.
y_true      = test_generator.classes
y_pred_prob = model.predict(test_generator, verbose=0)
y_pred      = y_pred_prob.argmax(axis=1)

# Classification Report
report = classification_report(
    y_true,
    y_pred,
    target_names=class_names,
    digits=2,
)
print(report)


                          precision    recall  f1-score   support

  abraham_grampa_simpson       0.92      0.96      0.94        92
  apu_nahasapeemapetilon       0.98      0.97      0.98        63
            bart_simpson       0.96      0.92      0.94       135
charles_montgomery_burns       0.93      0.95      0.94       120
            chief_wiggum       0.94      0.98      0.96        99
          comic_book_guy       0.90      0.96      0.93        48
          edna_krabappel       0.98      0.96      0.97        47
           homer_simpson       0.95      0.94      0.95       225
           kent_brockman       0.94      1.00      0.97        51
        krusty_the_clown       0.95      0.95      0.95       121
            lisa_simpson       0.96      0.95      0.95       137
           marge_simpson       0.96      0.95      0.96       130
     milhouse_van_houten       1.00      0.96      0.98       109
             moe_szyslak       0.95      0.95      0.95       146
         