In [None]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import numpy as np


In [None]:
# Colab-only step: mount Google Drive so the project folder under MyDrive is reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# Project layout on the mounted Drive: <BASE_DIR>/data/posters
BASE_DIR = "/content/drive/MyDrive/movie_genre_project"
DATA_DIR = os.path.join(BASE_DIR, "data")
IMG_DIR = os.path.join(BASE_DIR, "data", "posters")

# Creates any missing directory along the path; a no-op when it already exists.
os.makedirs(IMG_DIR, exist_ok=True)


In [None]:
# CSV_PATH is the table read at the start of the notebook; cleaned_csv_path is
# where the processed table is written at the end. Both live under DATA_DIR.
CSV_PATH, cleaned_csv_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("imdb-movies-dataset.csv", "cleaned_dataset.csv")
)


In [None]:
# Load the source table and report (rows, columns) as a quick sanity check.
df = pd.read_csv(CSV_PATH)
n_rows, n_cols = df.shape
print("Original dataset shape:", (n_rows, n_cols))


Original dataset shape: (10000, 15)


In [None]:
# Keep only the three columns used by this notebook, then drop rows that lack a
# poster URL or a genre. Note that 'Title' is not part of the null check.
required_cols = ['Title', 'Genre', 'Poster']
df = df.loc[:, required_cols].dropna(subset=['Poster', 'Genre'])
print("After dropping null links/genres:", df.shape)


After dropping null links/genres: (9993, 3)


## Clean the Genre column (keep only the first listed genre)

In [None]:
# Genre values are delimiter-separated lists such as 'Comedy, Drama, Romance'.
# The data uses commas, so splitting on '|' alone left every value untouched.
# Map '|' to ',' first so either delimiter works, then keep the first entry.
df['Genre'] = (
    df['Genre']
    .str.replace('|', ',', regex=False)
    .str.split(',')
    .str[0]
    .str.strip()
)

# Normalize genre names to title case
df['Genre'] = df['Genre'].str.title()

print("Sample genres:", df['Genre'].unique()[:10])



Sample genres: ['Comedy, Drama, Romance' 'Action, Adventure, Sci-Fi'
 'Biography, Comedy, History' 'Action, Comedy, Drama'
 'Drama, Romance, Sport' 'Horror, Thriller' 'Action, Adventure, Thriller'
 'Comedy, Romance' 'Action, Drama, War' 'Action, Adventure, Drama']


## Validate poster links and download the posters

In [None]:
def is_valid_image(url):
    """Return True if ``url`` answers HTTP 200 with a body Pillow accepts as an image.

    Args:
        url: Address of the poster to probe.

    Returns:
        True when the request succeeds with status 200 and the payload passes
        Pillow's integrity check; False for any other status or any error
        (bad URL, timeout, connection failure, unreadable image data).
    """
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return False
        # Image.open is lazy and only identifies the file from its header;
        # verify() is what actually checks the data for breakage.
        with Image.open(BytesIO(response.content)) as img:
            img.verify()
        return True
    except Exception:
        # Deliberately broad: this is a best-effort probe, and a single bad
        # link must not abort a validation pass over thousands of rows.
        return False

# Smoke-test the validator on the first poster URL before the full pass.
first_url = df['Poster'].iloc[0]
print("Testing first poster link:", first_url)
print("Valid:", is_valid_image(first_url))

# One HTTP request per row: collect the verdict for every URL, in row order.
valid_links = [
    is_valid_image(url)
    for url in tqdm(df['Poster'], desc="Validating poster links")
]

# Keep only the rows whose poster passed; the boolean column doubles as the mask.
df['Valid_Link'] = valid_links
df = df[df['Valid_Link']]
print("After removing invalid links:", df.shape)


Testing first poster link: https://m.media-amazon.com/images/M/MV5BZGI4NTEwNTAtZDcwMi00MDkxLTg1OGYtNTZmMzE3ZDljNzVlXkEyXkFqcGdeQXVyMTEyMjM2NDc2._V1_UY209_CR0,0,140,209_AL_.jpg
Valid: True


Validating poster links: 100%|██████████| 7075/7075 [32:54<00:00,  3.58it/s]

After removing invalid links: (7071, 4)





In [None]:
def download_and_process_image(row):
    """Download one poster, convert it to a 224x224 RGB image and save it in IMG_DIR.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'Poster'
            (the image URL) and 'Title' (used to build the filename).

    Returns:
        The path of the saved .jpg file, or None when the download, decoding
        or saving fails.
    """
    url = row['Poster']
    # Title is never null-checked earlier in the notebook. Without str(), a
    # missing title (NaN float) raised AttributeError on .replace() outside the
    # try block and aborted the whole download loop.
    title = str(row['Title'])
    # NOTE(review): the filename depends on the title alone, so two rows with
    # the same title write to the same file and the later one overwrites the
    # earlier - confirm titles are unique or add a disambiguator.
    filename = f"{title.replace('/', '').replace(' ', '_')}.jpg"
    filepath = os.path.join(IMG_DIR, filename)

    try:
        response = requests.get(url, timeout=10)
        # Fail fast on HTTP errors instead of trying to decode an error page,
        # matching the status check done in is_valid_image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = img.resize((224, 224))  # standard size for ResNet
        img.save(filepath)
        return filepath
    except Exception:
        # Best-effort: a failed poster is reported as None so the caller can
        # drop the row rather than stop the run.
        return None

# Fetch every poster; a failed download yields None for that row.
paths = [
    download_and_process_image(row)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Downloading posters")
]

# Attach the saved paths and discard rows whose poster could not be stored.
df['poster_path'] = paths
df = df.dropna(subset=['poster_path'])
print("Images downloaded:", len(df))



Downloading posters: 100%|██████████| 7071/7071 [28:17<00:00,  4.16it/s]

Images downloaded: 7071





In [None]:
# Reduce each comma-separated genre string to its first entry before encoding,
# then normalise the casing.
df['Genre'] = df['Genre'].astype(str).str.split(',').str[0].str.strip()
df['Genre'] = df['Genre'].str.title()

# Verify cleaning
print("Unique genres after cleaning:", df['Genre'].unique()[:20])

# Encode genres: ids follow the alphabetical order of the distinct genre names.
genres = sorted(df['Genre'].unique())
genre_to_idx = dict(zip(genres, range(len(genres))))
idx_to_genre = dict(enumerate(genres))

df['genre_idx'] = df['Genre'].map(genre_to_idx)

print("Genre mapping:", genre_to_idx)

Unique genres after cleaning: ['Action' 'Comedy' 'Crime' 'Biography' 'Horror' 'Drama' 'Adventure'
 'Animation' 'Fantasy' 'Thriller' 'Mystery' 'Film-Noir' 'Romance'
 'Documentary' 'Western' 'Sci-Fi' 'War' 'Musical' 'History' 'Family']
Genre mapping: {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Biography': 3, 'Comedy': 4, 'Crime': 5, 'Documentary': 6, 'Drama': 7, 'Family': 8, 'Fantasy': 9, 'Film-Noir': 10, 'History': 11, 'Horror': 12, 'Musical': 13, 'Mystery': 14, 'Romance': 15, 'Sci-Fi': 16, 'Thriller': 17, 'War': 18, 'Western': 19}


In [None]:
# Write the selected columns (without the index) to the cleaned CSV.
# "Final shape" reports the full frame, including columns that are not exported.
export_cols = ['Title', 'Genre', 'genre_idx', 'poster_path']
df[export_cols].to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned dataset saved to:", cleaned_csv_path)
print("Final shape:", df.shape)

✅ Cleaned dataset saved to: /content/drive/MyDrive/movie_genre_project/data/cleaned_dataset.csv
Final shape: (7071, 6)
