In [1]:
import os
import pandas as pd
import requests
import gzip
import shutil
from datetime import datetime

In [2]:
# ========= CONFIG =========

# Remote location of the gzipped TSV dumps and the archives fetched from it.
BASE_URL = "https://datasets.imdbws.com/"
DATASET_FILES = [
    f"{dataset_stem}.tsv.gz"
    for dataset_stem in ("name.basics", "title.basics", "title.principals", "title.ratings")
]

# Prefix used for every file this notebook writes.
SOURCE_SYSTEM = 'imdb'

# Raw-data folder, two levels above the working directory; created on first run.
RAW_DATA_DIR = os.path.join(os.pardir, os.pardir, 'data', 'raw', 'imdb')
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Parsing options shared by all pd.read_csv calls (r'\N' is treated as missing).
TSV_SEPARATOR, TSV_DECIMAL_SIGN, TSV_NA_VALUES = '\t', '.', r'\N'

# Number of rows per chunk when reading large TSV files piecewise.
CHUNKSIZE = 100_000

In [3]:
# ========= HELPER FUNCTIONS =========

def download_file(url, destination_path, timeout=30):
    """Stream the resource at ``url`` into ``destination_path``.

    The body is first written to ``<destination_path>.part`` and only moved
    to its final name once the transfer completed, so an interrupted download
    never leaves a truncated file at ``destination_path``.

    Args:
        url: Address of the file to fetch.
        destination_path: Local path that receives the response body.
        timeout: Seconds handed to ``requests.get`` as timeout, so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        True when the complete body was stored at ``destination_path``;
        False when the request failed (the reason is printed).
    """
    partial_path = f"{destination_path}.part"
    try:
        # Using the response as a context manager releases the connection
        # even when the transfer is aborted half-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        os.replace(partial_path, destination_path)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download failed for {url}: {e}")
        return False
    finally:
        # After a successful os.replace the partial file no longer exists,
        # so this only cleans up after a failed transfer.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

def unzip_gz_file(gz_path, output_path):
    """Decompress the gzip archive ``gz_path`` into ``output_path``.

    Data is decompressed into ``<output_path>.part`` and moved into place only
    after the whole archive was read, so a corrupt or truncated archive never
    leaves a partial file at ``output_path``.

    Args:
        gz_path: Path of the ``.gz`` file to read.
        output_path: Path of the decompressed file to create.

    Returns:
        True on success, False when reading or writing failed (the reason is
        printed).
    """
    partial_path = f"{output_path}.part"
    try:
        with gzip.open(gz_path, 'rb') as f_in:
            with open(partial_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, output_path)
        return True
    except Exception as e:
        # Kept broad on purpose: callers rely on a boolean result for every
        # kind of decompression failure, but the cause is no longer hidden.
        print(f"Could not unzip {gz_path}: {e}")
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return False

def rename_imdb_tsv(file_path, source_system):
    """Rename ``<a>.<b>.tsv`` to ``<source_system>_<a>_<b>.tsv`` in the same folder.

    Args:
        file_path: Path of the TSV file to rename, e.g. ``.../title.basics.tsv``.
        source_system: Prefix placed in front of the new file name.

    Returns:
        The new path on success. None when the file is not a ``.tsv`` file,
        when its name does not have at least two dot-separated components in
        front of the extension, or when the rename failed (the reason is
        printed).
    """
    directory, file_name = os.path.split(file_path)
    if not file_name.endswith('.tsv'):
        return None  # Only process .tsv files

    parts = file_name.split('.')
    # Three parts are needed so that parts[1] is a name component and not the
    # 'tsv' extension itself (e.g. 'foo.tsv' must not become 'imdb_foo_tsv.tsv').
    if len(parts) < 3:
        return None

    new_path = os.path.join(directory, f"{source_system}_{parts[0]}_{parts[1]}.tsv")
    try:
        # os.replace overwrites an existing target on every platform, which
        # keeps a notebook re-run working where os.rename would fail on Windows.
        os.replace(file_path, new_path)
    except OSError as e:
        print(f"Could not rename {file_path} to {new_path}: {e}")
        return None
    return new_path

def read_filtered_tsv_in_chunks(filepath, filter_col, filter_values,
                                selected_cols=None, new_col_names=None):
    """Read a TSV file chunk by chunk, keeping only rows whose ``filter_col`` matches.

    Args:
        filepath: Path of the TSV file to read.
        filter_col: Column whose values are tested.
        filter_values: Collection of values to keep (used with ``Series.isin``).
        selected_cols: Optional list of columns to keep in the result.
        new_col_names: Optional list of new names, position-matched to
            ``selected_cols``. Selection and renaming are applied only when
            both lists are given.

    Returns:
        A DataFrame with the matching rows (all values read as strings) and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If ``selected_cols`` and ``new_col_names`` are both given
            but differ in length (``zip`` would otherwise silently leave some
            columns unrenamed).
    """
    if selected_cols and new_col_names and len(selected_cols) != len(new_col_names):
        raise ValueError(
            "selected_cols and new_col_names must have the same length "
            f"({len(selected_cols)} != {len(new_col_names)})"
        )

    reader = pd.read_csv(filepath, sep=TSV_SEPARATOR, decimal=TSV_DECIMAL_SIGN,
                         dtype=str, na_values=TSV_NA_VALUES, chunksize=CHUNKSIZE)

    # Collect the filtered pieces and concatenate once at the end; concatenating
    # onto a growing frame inside the loop re-copies all earlier rows per chunk.
    matching_chunks = [chunk[chunk[filter_col].isin(filter_values)] for chunk in reader]
    if matching_chunks:
        df_collected = pd.concat(matching_chunks, ignore_index=True)
    else:
        df_collected = pd.DataFrame()

    if selected_cols and new_col_names:
        columns_dict = dict(zip(selected_cols, new_col_names))
        df_collected = df_collected[selected_cols].rename(columns=columns_dict)

    return df_collected.reset_index(drop=True)

In [4]:
# ========= DOWNLOAD & PREPARE RAW DATA =========

# --- Download, Unzip, and Rename Files ---
# Every step reports success through its return value; failures are recorded
# per dataset so the cell cannot claim success while later cells would read
# missing or stale files.
failed_datasets = []
for dataset in DATASET_FILES:
    file_url = BASE_URL + dataset
    gz_destination_path = os.path.join(RAW_DATA_DIR, dataset)
    tsv_output_path = os.path.join(RAW_DATA_DIR, os.path.splitext(dataset)[0]) # path before renaming

    if not download_file(file_url, gz_destination_path):
        failed_datasets.append(f"{dataset} (download)")
        continue
    if not unzip_gz_file(gz_destination_path, tsv_output_path):
        failed_datasets.append(f"{dataset} (unzip)")
        continue

    os.remove(gz_destination_path) # Clean up .gz file after unzipping
    if rename_imdb_tsv(tsv_output_path, SOURCE_SYSTEM) is None:
        failed_datasets.append(f"{dataset} (rename)")

if failed_datasets:
    raise RuntimeError("Data preparation failed for: " + ", ".join(failed_datasets))

print("--- Finished Data Download and Preparation ---")

--- Finished Data Download and Preparation ---


In [5]:
# ========= PREPARE MOVIE ID LIST =========

file_title_basics_renamed = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM}_title_basics.tsv')

# Read the complete title basics table; every column is kept as a string.
df_basics_raw = pd.read_csv(
    file_title_basics_renamed,
    sep=TSV_SEPARATOR,
    decimal=TSV_DECIMAL_SIGN,
    dtype=str,
    na_values=TSV_NA_VALUES
)

# Keep non-adult entries of type 'movie' that have a genre assigned.
is_candidate_movie = (
    df_basics_raw['titleType'].eq('movie')
    & df_basics_raw['isAdult'].eq('0')
    & df_basics_raw['genres'].notna()
)
df_filtered_movies = df_basics_raw.loc[is_candidate_movie].copy()

# Year and runtime become integers; unparsable or missing values turn into 0.
for numeric_col in ('startYear', 'runtimeMinutes'):
    df_filtered_movies[numeric_col] = (
        pd.to_numeric(df_filtered_movies[numeric_col], errors='coerce')
        .fillna(0)
        .astype(int)
    )

# Restrict to releases from 2000 up to (excluding) the current year
# that run for at least 90 minutes.
current_year = datetime.now().year
released_in_range = df_filtered_movies['startYear'].between(2000, current_year - 1)
is_feature_length = df_filtered_movies['runtimeMinutes'].ge(90)
df_filtered_movies = df_filtered_movies.loc[released_in_range & is_feature_length].copy()

# Single-column frame of IMDb IDs, sorted and re-indexed.
df_imdb_movie_ids = (
    df_filtered_movies[['tconst']]
    .rename(columns={'tconst': 'imdb_movie_id'})
    .sort_values(by='imdb_movie_id')
    .reset_index(drop=True)
)

# Persist the ID list and keep it as a plain list for later 'isin' filters.
imdb_ids_path = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM}_movie_ids.csv')
df_imdb_movie_ids.to_csv(imdb_ids_path, index=False)
imdb_ids = df_imdb_movie_ids['imdb_movie_id'].tolist()

print("--- Finished Preparing Movie ID List ---")

--- Finished Preparing Movie ID List ---


In [6]:
# =========  TRANSFORM DATASET FILES =========

# --- Transform title.basics.tsv ---
# Source column -> output column; the two lists below are derived from it.
basics_column_map = {
    'tconst': 'movie_imdb_id',
    'originalTitle': 'movie_title',
    'startYear': 'movie_release_year',
    'runtimeMinutes': 'movie_duration_minutes',
    'genres': 'movie_categories',
}
selected_cols_basics = list(basics_column_map)
new_col_names_basics = list(basics_column_map.values())

# Keep only the rows of the previously selected movies and store them as CSV.
movie_titles_csv_path = os.path.join(RAW_DATA_DIR, f"{SOURCE_SYSTEM}_movie_titles.csv")
df_movie_titles = read_filtered_tsv_in_chunks(
    file_title_basics_renamed,
    'tconst',
    imdb_ids,
    selected_cols=selected_cols_basics,
    new_col_names=new_col_names_basics,
)
df_movie_titles.to_csv(movie_titles_csv_path, index=False)

# --- Transform title.ratings.tsv ---
file_title_ratings_renamed = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM}_title_ratings.tsv')

# Source column -> output column; the two lists below are derived from it.
ratings_column_map = {
    'tconst': 'movie_imdb_id',
    'averageRating': 'movie_imdb_rating',
    'numVotes': 'movie_imdb_nof_votes',
}
selected_cols_ratings = list(ratings_column_map)
new_col_names_ratings = list(ratings_column_map.values())

# Ratings of the selected movies only, written out as CSV.
movie_ratings_csv_path = os.path.join(RAW_DATA_DIR, f"{SOURCE_SYSTEM}_movie_ratings.csv")
df_title_ratings = read_filtered_tsv_in_chunks(
    file_title_ratings_renamed,
    'tconst',
    imdb_ids,
    selected_cols=selected_cols_ratings,
    new_col_names=new_col_names_ratings,
)
df_title_ratings.to_csv(movie_ratings_csv_path, index=False)

# --- Transform title.principals.tsv ---
file_title_principals_renamed = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM}_title_principals.tsv')
title_principal_categories = ['director', 'actor', 'actress', 'writer', 'producer']

# Gather the filtered pieces in a list and concatenate once afterwards;
# concatenating onto a growing frame inside the loop re-copies all rows
# collected so far on every chunk.
principal_chunks = []
for chunk in pd.read_csv(file_title_principals_renamed, sep=TSV_SEPARATOR, decimal=TSV_DECIMAL_SIGN,
                         dtype=str, na_values=TSV_NA_VALUES, chunksize=CHUNKSIZE):
    # Filter by both tconst and category in one go
    filtered_chunk = chunk[(chunk['tconst'].isin(imdb_ids)) & (chunk['category'].isin(title_principal_categories))]
    principal_chunks.append(filtered_chunk)

if principal_chunks:
    df_t_principals_raw = pd.concat(principal_chunks, ignore_index=True)
else:
    df_t_principals_raw = pd.DataFrame()

selected_cols_principals = ['tconst', 'nconst', 'category']
new_col_names_principals = ['movie_imdb_id', 'movie_person_name_id', 'movie_person_role']
df_title_principals = df_t_principals_raw[selected_cols_principals].rename(columns=dict(zip(selected_cols_principals, new_col_names_principals))).reset_index(drop=True)

# Normalize 'actress' role to 'actor'
df_title_principals['movie_person_role'] = df_title_principals['movie_person_role'].replace({'actress': 'actor'})
df_title_principals.to_csv(os.path.join(RAW_DATA_DIR, f"{SOURCE_SYSTEM}_movie_principals.csv"), index=False)

# Create a list of unique movie person name IDs for filtering name.basics.tsv
movie_person_name_ids = df_title_principals['movie_person_name_id'].unique().tolist()

# --- Transform name.basics.tsv (Movie Persons) ---
file_name_basics_renamed = os.path.join(RAW_DATA_DIR, f'{SOURCE_SYSTEM}_name_basics.tsv')

# Source column -> output column; the two lists below are derived from it.
persons_column_map = {
    'nconst': 'movie_person_name_id',
    'primaryName': 'movie_person_name',
    'birthYear': 'movie_person_birth_year',
    'deathYear': 'movie_person_death_year',
}
selected_cols_persons = list(persons_column_map)
new_col_names_persons = list(persons_column_map.values())

# Keep only persons referenced by the principals table, ordered by their ID.
df_movie_persons = read_filtered_tsv_in_chunks(
    file_name_basics_renamed,
    'nconst',
    movie_person_name_ids,
    selected_cols=selected_cols_persons,
    new_col_names=new_col_names_persons,
)
df_movie_persons = df_movie_persons.sort_values('movie_person_name_id', ignore_index=True)

movie_persons_csv_path = os.path.join(RAW_DATA_DIR, f"{SOURCE_SYSTEM}_movie_persons.csv")
df_movie_persons.to_csv(movie_persons_csv_path, index=False)

print("--- All Dataset Transformations Complete ---")

--- All Dataset Transformations Complete ---
