In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence
from PIL import Image
import os
import re

In [2]:
IMAGE_HEIGHT = 150
IMAGE_WIDTH = 100
CHANNELS = 3
BATCH_SIZE = 32
RANDOM_STATE = 42
EPOCHS = 15 # Can adjust this, EarlyStopping will handle the stop

In [3]:
from google.colab import drive
drive.mount('/content/drive')

POSTER_DIR = '/content/drive/MyDrive/movie_genre_project/data/posters/'
CSV_PATH = '/content/drive/MyDrive/movie_genre_project/data/imdb-movies-dataset.csv'


Mounted at /content/drive


In [14]:
# --- 1. Data Loading and Single-Label Integer Encoding ---
print("--- Starting Single-Label Data Preprocessing (Sparse) ---")
try:
    # Use the existing CSV path from your Colab setup
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError as e:
    print(f"Error: CSV file not found at {CSV_PATH}. Check your Google Drive mount.")
    raise e

df = df.dropna(subset=['Genre', 'Title']).reset_index(drop=True)

# 1.1 Extract ONLY the first genre
df['First_Genre'] = df['Genre'].apply(lambda x: x.split(',')[0].strip())

# FIX 1a: Filter out rare classes (< 2 samples) for stable stratified splitting
genre_counts = df['First_Genre'].value_counts()
rare_genres = genre_counts[genre_counts < 2].index
df_filtered = df[~df['First_Genre'].isin(rare_genres)].reset_index(drop=True)
df = df_filtered

# 1.2 Integer Encoding (Sparse Format)
unique_genres = sorted(df['First_Genre'].unique())
genre_to_index = {genre: i for i, genre in enumerate(unique_genres)}
num_genres = len(unique_genres)

# *** FIX FOR KEY ERROR ***
# The program failed because these two lines were never run, or were run out of order.
# This line creates the 'Encoded_Genre_Index' column.
df['Encoded_Genre_Index'] = df['First_Genre'].apply(lambda x: genre_to_index[x])
# *************************

print(f"Total unique *first* genres after filtering: {num_genres}")

# 1.3 Create Image File Paths
def get_file_name(title):
    cleaned_title = re.sub(r'[^\w\s-]', '', title).strip().replace(' ', '_')
    return f"{cleaned_title}.jpg"

df['File_Path'] = df['Title'].apply(lambda x: os.path.join(POSTER_DIR, get_file_name(x)))


--- Starting Single-Label Data Preprocessing (Sparse) ---
Total unique *first* genres after filtering: 20


In [15]:
# --- 2. Train-Validation-Test Split ---
X = df['File_Path'].values
# y is the new INTEGER INDEX vector, accessed directly from the DataFrame
y = df['Encoded_Genre_Index'].values

# Stratified split uses the integer index directly
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True, stratify=y)
# FIX: Removed 'stratify' from the second split to avoid the size-1 error
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Testing samples: {len(X_test)}")




Training samples: 7992
Validation samples: 999
Testing samples: 1000


In [24]:
# --- 3. Custom Keras Data Generator (FINAL FIX: RANK 1 LABELS) ---
class MoviePosterDataGenerator(Sequence):
    def __init__(self, x_set, y_set, batch_size, img_width, img_height, shuffle=True):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.img_width = img_width
        self.img_height = img_height
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.x) / self.batch_size))

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.x))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        list_X = [self.x[k] for k in indexes]
        list_y = [self.y[k] for k in indexes]
        X, y = self.__data_generation(list_X, list_y)
        return X, y

    def __data_generation(self, list_X, list_y):
        X = np.empty((self.batch_size, self.img_height, self.img_width, CHANNELS), dtype=np.float32)

        # FINAL FIX: Use Rank 1 array for sparse labels (shape: (batch_size,))
        y = np.empty((self.batch_size,), dtype=np.int32)

        for i, (path, label) in enumerate(zip(list_X, list_y)):
            try:
                img = Image.open(path).convert('RGB')
                img = img.resize((self.img_width, self.img_height))
                X[i,] = np.array(img) / 255.0  # Normalize

                # Assign the integer label directly to the 1D array index
                y[i] = label
            except Exception as e:
                # Handle missing/corrupt files
                X[i,] = np.zeros((self.img_height, self.img_width, CHANNELS), dtype=np.float32)
                y[i] = 0 # Assign a safe default index (0)

        return X, y

train_generator = MoviePosterDataGenerator(X_train, y_train, BATCH_SIZE, IMAGE_WIDTH, IMAGE_HEIGHT, shuffle=True)
validation_generator = MoviePosterDataGenerator(X_val, y_val, BATCH_SIZE, IMAGE_WIDTH, IMAGE_HEIGHT, shuffle=False)
test_generator = MoviePosterDataGenerator(X_test, y_test, BATCH_SIZE, IMAGE_WIDTH, IMAGE_HEIGHT, shuffle=False)



In [25]:
# --- 4. Custom CNN Model Definition (Softmax Output) ---
def build_custom_cnn_single_label(input_shape, num_classes):
    img_input = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), padding='same', activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)

    x = Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)

    x = Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)

    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)

    # Softmax output remains correct for single-label
    output = Dense(num_classes, activation='softmax')(x)
    return Model(img_input, output, name='custom_single_label_cnn')

model = build_custom_cnn_single_label((IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS), num_genres)


In [26]:
# --- 5. Compile the Model (Sparse Categorical Crossentropy) ---
model.compile(optimizer=Adam(learning_rate=0.001),
              # Use Sparse Categorical Crossentropy for integer labels
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

print("\nSingle-Label Custom CNN Model Summary:")
model.summary()




Single-Label Custom CNN Model Summary:


In [27]:
# --- 6. Train and Evaluate ---
print("\n--- Starting Model Training ---")
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('best_single_genre_model.h5', monitor='val_loss', save_best_only=True)
]

history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator,
    callbacks=callbacks,
    verbose=1
)

print("\nTraining complete. Best model saved as 'best_single_genre_model.h5'.")

print("\n--- Evaluating Model on Test Set ---")
loss, accuracy = model.evaluate(test_generator, verbose=0)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")



--- Starting Model Training ---
Epoch 1/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5144 - loss: 1.7640



[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m614s[0m 2s/step - accuracy: 0.5144 - loss: 1.7625 - val_accuracy: 0.5817 - val_loss: 1.2684
Epoch 2/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.5497 - loss: 1.2342



[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 116ms/step - accuracy: 0.5497 - loss: 1.2342 - val_accuracy: 0.5887 - val_loss: 1.2301
Epoch 3/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.5719 - loss: 1.2041



[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 114ms/step - accuracy: 0.5719 - loss: 1.2041 - val_accuracy: 0.5907 - val_loss: 1.2301
Epoch 4/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 107ms/step - accuracy: 0.5671 - loss: 1.2109 - val_accuracy: 0.5806 - val_loss: 1.2338
Epoch 5/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.5860 - loss: 1.1644



[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 110ms/step - accuracy: 0.5860 - loss: 1.1645 - val_accuracy: 0.5857 - val_loss: 1.2027
Epoch 6/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 109ms/step - accuracy: 0.5814 - loss: 1.1519 - val_accuracy: 0.5968 - val_loss: 1.2262
Epoch 7/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 106ms/step - accuracy: 0.6251 - loss: 1.0516 - val_accuracy: 0.5917 - val_loss: 1.2044
Epoch 8/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 107ms/step - accuracy: 0.6482 - loss: 0.9882 - val_accuracy: 0.5837 - val_loss: 1.2310
Epoch 9/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 108ms/step - accuracy: 0.6938 - loss: 0.8866 - val_accuracy: 0.5696 - val_loss: 1.2948
Epoch 10/15
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 107ms/step - accuracy: 0.7186 -