In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens/movies.csv
/kaggle/input/movielens/ratings.csv
/kaggle/input/movielens/genome-tags.csv
/kaggle/input/movielens/genome-scores.csv
/kaggle/input/movielens/tags.csv
/kaggle/input/movielens/links.csv


In [5]:
import seaborn as sns
from matplotlib import pyplot as plt

# Movies_Full Dataset (Content-Based Filtering Dataset)
## Movies + Genome_tags + Genome_Scores + Links (impute Year)

In [6]:
movies = pd.read_csv("/kaggle/input/movielens/movies.csv")
ratings = pd.read_csv("/kaggle/input/movielens/ratings.csv")
tags = pd.read_csv("/kaggle/input/movielens/tags.csv")
links = pd.read_csv("/kaggle/input/movielens/links.csv")
genome_tags = pd.read_csv("/kaggle/input/movielens/genome-tags.csv")
genome_scores = pd.read_csv("/kaggle/input/movielens/genome-scores.csv")

In [8]:
for name, df in {
    "Movies": movies, "Ratings": ratings, "Tags": tags,
    "Links": links, "Genome Tags": genome_tags, "Genome Scores": genome_scores
}.items():
    print(f"\n{name} shape: {df.shape}")
    print(df.head(3))



Movies shape: (86537, 3)
   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  

Ratings shape: (33832162, 4)
   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503

Tags shape: (2328315, 4)
   userId  movieId            tag   timestamp
0      10      260   good vs evil  1430666558
1      10      260  Harrison Ford  1430666505
2      10      260         sci-fi  1430666538

Links shape: (86537, 3)
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0

Genome Tags shape: (1128, 2)
   tagId           tag
0      1           007
1      2  00

## Check for Null Values

In [9]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
tags.isna().sum()

userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

In [12]:
tags = tags.dropna(subset=["tag"])
print("Remaining null tags:", tags["tag"].isna().sum())

Remaining null tags: 0


In [13]:
links.isna().sum()

movieId      0
imdbId       0
tmdbId     126
dtype: int64

In [14]:
genome_tags.isna().sum()

tagId    0
tag      0
dtype: int64

In [15]:
genome_scores.isna().sum()

movieId      0
tagId        0
relevance    0
dtype: int64

## Timestamp to DateTime

In [16]:
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit='s')
tags["timestamp"] = pd.to_datetime(tags["timestamp"], unit='s')

ratings["timestamp"]
tags["timestamp"]

0         2015-05-03 15:22:38
1         2015-05-03 15:21:45
2         2015-05-03 15:22:18
3         2011-07-25 13:32:36
4         2011-07-25 13:32:26
                  ...        
2328310   2017-10-09 11:11:31
2328311   2012-10-26 19:23:04
2328312   2012-10-26 19:23:09
2328313   2006-07-28 18:21:42
2328314   2006-07-30 14:58:45
Name: timestamp, Length: 2328298, dtype: datetime64[ns]

## Year, Month, Day Columns

In [17]:
ratings["year"] = ratings["timestamp"].dt.year
ratings["month"] = ratings["timestamp"].dt.month
ratings["day"] = ratings["timestamp"].dt.day

In [18]:
ratings.head()[["year", "month", "day"]]

Unnamed: 0,year,month,day
0,2008,11,3
1,2008,11,5
2,2008,11,3
3,2008,11,3
4,2008,11,3


## Year From Title

In [19]:
import re
import numpy as np

def extract_year(title):
    if isinstance(title, str):
        match = re.search(r'\((\d{4})', title)
        if match:
            return int(match.group(1))
    return np.nan

movies["year"] = movies["title"].apply(extract_year)

In [20]:
movies.isna().sum()

movieId      0
title        0
genres       0
year       617
dtype: int64

In [21]:
print("Total Missing Years", movies["year"].isna().sum())
movies[movies["year"].isna()].sample(5)


Total Missing Years 617


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,movieId,title,genres,year
68880,221548,On the Rocks,Comedy|Drama,
38954,155121,Kshanam,(no genres listed),
49780,178267,Palestine Blues,Documentary|Thriller,
72893,236127,India From Above,(no genres listed),
61327,202769,RWBY: Volume 5,(no genres listed),


## Impute Years_Filled from Year, and then if missing: First Timestamp (tag or ratings)

In [23]:
import numpy as np
import pandas as pd

# 1) Ensure datetimes
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")
tags["timestamp"]    = pd.to_datetime(tags["timestamp"],    unit="s")

# 2) Earliest per-movie timestamps from ratings and tags (no filtering)
first_rating_ts = ratings.groupby("movieId")["timestamp"].min().rename("first_rating_ts")
first_tag_ts    = tags.groupby("movieId")["timestamp"].min().rename("first_tag_ts")

# 3) Combine and compute first activity year
activity = pd.concat([first_rating_ts, first_tag_ts], axis=1)
activity["first_activity_ts"]   = activity.min(axis=1)
activity["first_activity_year"] = activity["first_activity_ts"].dt.year

# 4) Merge into movies (always creates the column)
movies = movies.merge(
    activity[["first_activity_year"]],
    left_on="movieId", right_index=True, how="left"
)

# 5) Create a clearly labeled filled year: prefer parsed year, fallback to activity year
if "year" not in movies.columns:
    # if you didn't already parse title years, do it quickly (robust)
    movies["year"] = movies["title"].str.extract(r"\((\d{4})").astype(float)

movies["year_filled"] = movies["year"]
mask = movies["year_filled"].isna() & movies["first_activity_year"].notna()
movies.loc[mask, "year_filled"] = movies.loc[mask, "first_activity_year"].astype(float)

# 6) Sanity checks
print("Movies total:", len(movies))
print("Missing parsed release year:", movies["year"].isna().sum())
print("Imputed with activity year:", mask.sum())
print(movies[["movieId","title","year","first_activity_year","year_filled"]].head())

print("Total Missing Years", movies["year"].isna().sum())

Movies total: 86537
Missing parsed release year: 617
Imputed with activity year: 617
   movieId                               title    year  first_activity_year  \
0        1                    Toy Story (1995)  1995.0                 1996   
1        2                      Jumanji (1995)  1995.0                 1996   
2        3             Grumpier Old Men (1995)  1995.0                 1996   
3        4            Waiting to Exhale (1995)  1995.0                 1996   
4        5  Father of the Bride Part II (1995)  1995.0                 1996   

   year_filled  
0       1995.0  
1       1995.0  
2       1995.0  
3       1995.0  
4       1995.0  
Total Missing Years 617


## Clean titles without the year

In [25]:
# movies["clean_title"] = movies["title"].str.replace(r'\s*\(\d{4}\)', '', regex=True)

movies["clean_title"] = movies["title"].str.replace(r'\s*\(\d{4}.*?\)', '', regex=True)

movies["clean_title"]

0                                          Toy Story
1                                            Jumanji
2                                   Grumpier Old Men
3                                  Waiting to Exhale
4                        Father of the Bride Part II
                            ...                     
86532                  State of Siege: Temple Attack
86533                                    Ouija Japan
86534      The Men Who Made the Movies: Howard Hawks
86535                       Skinford: Death Sentence
86536    UNZIPPED: An Autopsy of American Inequality
Name: clean_title, Length: 86537, dtype: object

## Genres

In [27]:
missing_genres_count = (
    movies["genres"].isna() | (movies["genres"] == "(no genres listed)")
).sum()

present_genres_count = (
    (~movies["genres"].isna()) & (movies["genres"] != "(no genres listed)")
).sum()

print(f"Missing Genres: {missing_genres_count} / {present_genres_count}")

Missing Genres: 7060 / 79477


## Use links to grab the movie genres for (no genres listed)
### 7f3c547002fd1c58b382273a23de3602 (api key)

In [29]:
import os

CACHE_PATH = "/kaggle/working/tmdb_genres_cache.json"
if os.path.exists(CACHE_PATH):
    os.remove(CACHE_PATH)
    print("Deleted old cache — will fetch all genres again.")


In [30]:
# If needed:
# !pip -q install aiohttp orjson

import asyncio, aiohttp, time, json, os, math, uuid, shutil
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from json import JSONDecodeError

API_KEY = "7f3c547002fd1c58b382273a23de3602"
BASE_URL = "https://api.themoviedb.org/3/movie/{tmdb_id}"
CACHE_PATH = "/kaggle/working/tmdb_genres_cache.json"

# ---------- Safe cache helpers ----------
def safe_load_cache(path):
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r") as f:
            return json.load(f)
    except JSONDecodeError:
        bad = f"{path}.bad-{uuid.uuid4().hex[:8]}"
        shutil.move(path, bad)
        print(f"[cache] Corrupted cache moved to: {bad}. Starting fresh.")
        return {}

def atomic_dump_json(path, obj):
    tmp = f"{path}.tmp-{uuid.uuid4().hex[:8]}"
    with open(tmp, "w") as f:
        json.dump(obj, f)
    os.replace(tmp, path)  # atomic on POSIX

# ---------- Ensure tmdbId present & normalized ----------
if "tmdbId" not in movies.columns:
    movies = movies.merge(links[["movieId","tmdbId"]].drop_duplicates("movieId"),
                          on="movieId", how="left", suffixes=("", "_links"))
    if "tmdbId_links" in movies.columns:
        movies["tmdbId"] = movies["tmdbId"].fillna(movies["tmdbId_links"])
        movies.drop(columns=["tmdbId_links"], inplace=True)

movies["tmdbId"] = pd.to_numeric(movies["tmdbId"], errors="coerce").astype("Int64")

# ---------- Build fetch list (only missing genres) ----------
fetch_df = movies[
    ((movies["genres"].isna()) | (movies["genres"] == "(no genres listed)"))
    & (movies["tmdbId"].notna())
][["movieId", "tmdbId"]].drop_duplicates()

fetch_df["tmdbId"] = fetch_df["tmdbId"].astype(int)

cache = safe_load_cache(CACHE_PATH)

to_fetch = [(int(r.tmdbId), int(r.movieId))
            for r in fetch_df.itertuples(index=False)
            if str(int(r.tmdbId)) not in cache]

print(f"Total needing genres: {len(fetch_df)}")
print(f"Cache hits: {len(fetch_df) - len(to_fetch)} | To fetch now: {len(to_fetch)}")

# ---------- Rate limiter (~35 rps to stay below TMDb guidance) ----------
class RateLimiter:
    def __init__(self, rate=35, bucket=35):
        self.rate = rate
        self.bucket = bucket
        self.tokens = bucket
        self.last = time.perf_counter()
        self.lock = asyncio.Lock()

    async def acquire(self):
        async with self.lock:
            now = time.perf_counter()
            elapsed = now - self.last
            self.tokens = min(self.bucket, self.tokens + elapsed * self.rate)
            self.last = now
            if self.tokens < 1:
                wait = (1 - self.tokens) / self.rate
                await asyncio.sleep(wait)
                self.tokens = 0
                self.last = time.perf_counter()
            self.tokens -= 1

limiter = RateLimiter(rate=35, bucket=35)

# ---------- Fetching ----------
async def fetch_one(session, tmdb_id):
    url = BASE_URL.format(tmdb_id=tmdb_id)
    params = {"api_key": API_KEY, "language": "en-US"}
    backoff = 1.0
    for attempt in range(7):
        await limiter.acquire()
        try:
            async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    genres = data.get("genres", [])
                    return "|".join([g["name"] for g in genres]) if genres else None
                if resp.status == 429:
                    ra = resp.headers.get("Retry-After")
                    sleep_s = int(ra) if (ra and ra.isdigit()) else max(2, math.ceil(backoff))
                    await asyncio.sleep(sleep_s)
                    backoff = min(backoff * 2, 30)
                    continue
                if 500 <= resp.status < 600:
                    await asyncio.sleep(backoff)
                    backoff = min(backoff * 2, 30)
                    continue
                return None  # other 4xx: give up
        except asyncio.TimeoutError:
            await asyncio.sleep(backoff); backoff = min(backoff * 2, 30)
        except Exception:
            await asyncio.sleep(backoff); backoff = min(backoff * 2, 30)
    return None

async def fetch_wrapped(session, tmdb):
    genres = await fetch_one(session, tmdb)
    return tmdb, genres

async def run_all(pairs, snapshot_every=1000):
    conn = aiohttp.TCPConnector(limit=200, ttl_dns_cache=300)
    headers = {"Accept-Encoding": "gzip, deflate"}
    async with aiohttp.ClientSession(connector=conn, headers=headers) as session:
        pbar = tqdm(total=len(pairs), desc="Fetching TMDb genres (async)", leave=False, dynamic_ncols=True)
        results_batch = {}
        chunk_size = 2000
        for i in range(0, len(pairs), chunk_size):
            chunk = pairs[i:i+chunk_size]
            tasks = [asyncio.create_task(fetch_wrapped(session, tmdb)) for tmdb, _ in chunk]
            for coro in asyncio.as_completed(tasks):
                tmdb, genres = await coro
                results_batch[str(tmdb)] = genres
                pbar.update(1)
                if pbar.n % snapshot_every == 0:
                    cache.update(results_batch)
                    atomic_dump_json(CACHE_PATH, cache)
                    results_batch.clear()
            if results_batch:
                cache.update(results_batch)
                atomic_dump_json(CACHE_PATH, cache)
                results_batch.clear()
        pbar.close()

# ---------- Run (Top-Level Await Safe) ----------
try:
    loop = asyncio.get_running_loop()
    running = loop.is_running()
except RuntimeError:
    running = False

if to_fetch:
    if running:
        # Notebook / Kaggle → use top-level await
        await run_all(to_fetch)
    else:
        # Script / IDE → standard asyncio.run
        asyncio.run(run_all(to_fetch))

# ---------- Map genres back to movies ----------
def map_genres(row):
    if row["genres"] != "(no genres listed)":
        return row["genres"]
    t = row["tmdbId"]
    if pd.isna(t):
        return row["genres"]
    return cache.get(str(int(t)), row["genres"])

movies["genres"] = movies.apply(map_genres, axis=1)


Total needing genres: 7059
Cache hits: 0 | To fetch now: 7059


Fetching TMDb genres (async):   0%|          | 0/7059 [00:00<?, ?it/s]

In [31]:
updated_movies = movies[movies["movieId"].astype(str).isin([str(v[1]) for v in to_fetch])]
updated_movies = updated_movies[updated_movies["genres"] != "(no genres listed)"]

print(f"Movies successfully updated with TMDb genres: {len(updated_movies)}")
updated_movies[["movieId", "title", "genres"]].head(10)


Movies successfully updated with TMDb genres: 7059


Unnamed: 0,movieId,title,genres
15884,83773,Away with Words (San tiao ren) (1999),Comedy|Drama
16063,84768,Glitterbug (1994),Music|TV Movie|Documentary
16354,86493,"Age of the Earth, The (A Idade da Terra) (1980)",Drama|Fantasy|History
16494,87061,Trails (Veredas) (1978),Drama|Fantasy
17410,91246,Milky Way (Tejút) (2007),
17712,92435,"Dancing Hawk, The (Tanczacy jastrzab) (1978)",Drama
17746,92641,Warsaw Bridge (Pont de Varsòvia) (1990),Drama
18081,94431,"Ella Lola, a la Trilby (1898)",
18296,95750,Promise of the Flesh (Yukcheui yaksok) (1975),Drama
18419,96479,Nocturno 29 (1968),Drama|Horror|Mystery


In [44]:
# movies = movies[movies["genres"] != "(no genres listed)"]

In [34]:
# 1) Global counts
print("NaN genres:", movies["genres"].isna().sum())
print("'(no genres listed)' genres:", (movies["genres"] == "(no genres listed)").sum())
print("Empty-string genres:", (movies["genres"] == "").sum())
print("Whitespace-only genres:", movies["genres"].str.fullmatch(r"\s*").fillna(False).sum())
print("Literal 'nan' string:", (movies["genres"].astype(str).str.lower() == "nan").sum())


NaN genres: 0
'(no genres listed)' genres: 0
Empty-string genres: 0
Whitespace-only genres: 0
Literal 'nan' string: 0


In [33]:
movies["genres"] = movies["genres"].fillna("Unknown")
movies["genres"] = movies["genres"].replace("(no genres listed)", "Unknown")

## Split Genres

In [37]:
movies["genres"] = movies["genres"].apply(lambda x: x.split('|') if isinstance(x, str) else [])

movies["genres"]

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
3                                 [Comedy, Drama, Romance]
4                                                 [Comedy]
                               ...                        
86532                                      [Action, Drama]
86533                                     [Action, Horror]
86534                                        [Documentary]
86535                                    [Crime, Thriller]
86536                                        [Documentary]
Name: genres, Length: 86537, dtype: object

In [38]:
updated_movies = movies[movies["movieId"].astype(str).isin([str(v[1]) for v in to_fetch])]
updated_movies = updated_movies[updated_movies["genres"] != "(no genres listed)"]

print(f"Movies successfully updated with TMDb genres: {len(updated_movies)}")
updated_movies[["movieId", "title", "genres"]].head(10)

Movies successfully updated with TMDb genres: 7059


Unnamed: 0,movieId,title,genres
15884,83773,Away with Words (San tiao ren) (1999),"[Comedy, Drama]"
16063,84768,Glitterbug (1994),"[Music, TV Movie, Documentary]"
16354,86493,"Age of the Earth, The (A Idade da Terra) (1980)","[Drama, Fantasy, History]"
16494,87061,Trails (Veredas) (1978),"[Drama, Fantasy]"
17410,91246,Milky Way (Tejút) (2007),[Unknown]
17712,92435,"Dancing Hawk, The (Tanczacy jastrzab) (1978)",[Drama]
17746,92641,Warsaw Bridge (Pont de Varsòvia) (1990),[Drama]
18081,94431,"Ella Lola, a la Trilby (1898)",[Unknown]
18296,95750,Promise of the Flesh (Yukcheui yaksok) (1975),[Drama]
18419,96479,Nocturno 29 (1968),"[Drama, Horror, Mystery]"


## Saving Movies_Clean

In [None]:
# movies.to_csv("/kaggle/working/movies_clean.csv", index=False)

In [51]:
# movies = pd.read_csv("/kaggle/input/modelingdataset/movies_clean.csv")

In [52]:
from sklearn.preprocessing import MultiLabelBinarizer

# If genres are still strings like "['Comedy', 'Drama']" — convert to actual lists
movies["genres"] = movies["genres"].apply(
    lambda x: eval(x) if isinstance(x, str) and x.startswith("[") else x
)

mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(
    mlb.fit_transform(movies["genres"]),
    columns=mlb.classes_,
    index=movies["movieId"]
)

genre_encoded.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,Unknown,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
genome = genome_scores.merge(genome_tags, on="tagId", how="left")
genome.head()

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.032,007
1,1,2,0.02225,007 (series)
2,1,3,0.07,18th century
3,1,4,0.059,1920s
4,1,5,0.123,1930s


In [54]:
genome_matrix = genome.pivot(
    index="movieId",
    columns="tag",
    values="relevance"
).fillna(0)

In [55]:
genome_matrix.head()

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.032,0.02225,0.07,0.059,0.123,0.131,0.06175,0.1955,0.26625,0.033,...,0.04125,0.0155,0.03775,0.035,0.11525,0.0435,0.0405,0.033,0.077,0.01825
2,0.0325,0.032,0.0405,0.051,0.1005,0.0635,0.19825,0.07525,0.0975,0.06325,...,0.041,0.02025,0.011,0.01875,0.13425,0.0185,0.01825,0.00825,0.0875,0.01375
3,0.0415,0.05525,0.02125,0.07225,0.05125,0.044,0.03,0.10375,0.023,0.0325,...,0.06175,0.01825,0.02375,0.017,0.124,0.02775,0.016,0.01325,0.10075,0.018
4,0.0315,0.034,0.028,0.02725,0.06475,0.03325,0.027,0.069,0.0395,0.02525,...,0.06175,0.0235,0.01825,0.03075,0.17725,0.0395,0.01925,0.011,0.10225,0.01375
5,0.0395,0.052,0.02575,0.04475,0.073,0.05375,0.021,0.0505,0.01875,0.02225,...,0.05575,0.0175,0.0155,0.0165,0.1775,0.01975,0.01875,0.009,0.0905,0.01475


In [57]:
# Merge genre features
movies_full = movies.merge(genre_encoded, on="movieId", how="left")

# Merge Tag Genome features (genome_matrix)
movies_full = movies_full.merge(genome_matrix, on="movieId", how="left")

# # Fill any missing numerical values
movies_full = movies_full.fillna(0)

movies_full.isna().sum()

movieId                0
title                  0
genres                 0
year                   0
first_activity_year    0
                      ..
writing                0
wuxia                  0
wwii                   0
zombie                 0
zombies                0
Length: 1161, dtype: int64

In [63]:
# Replace year with the imputed year_filled
movies_full["year"] = movies_full["year_filled"]

# Drop redundant columns
movies_full = movies_full.drop(columns=["genres", "year_filled", "first_activity_year"], errors="ignore")

# Sanity check
print("Remaining columns:", movies_full.columns.tolist()[:12])  # show first few


Remaining columns: ['movieId', 'title', 'year', 'clean_title', 'tmdbId', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary']


In [64]:
print("Shape before merge:", movies.shape)
print("Shape after merge:", movies_full.shape)

movies_full.head(2)

Shape before merge: (86537, 8)
Shape after merge: (86537, 1158)


Unnamed: 0,movieId,title,year,clean_title,tmdbId,Action,Adventure,Animation,Children,Comedy,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,Toy Story (1995),1995.0,Toy Story,862.0,0,1,1,1,1,...,0.04125,0.0155,0.03775,0.035,0.11525,0.0435,0.0405,0.033,0.077,0.01825
1,2,Jumanji (1995),1995.0,Jumanji,8844.0,0,1,0,1,0,...,0.041,0.02025,0.011,0.01875,0.13425,0.0185,0.01825,0.00825,0.0875,0.01375


In [65]:
movies_full.isna().sum()

movieId        0
title          0
year           0
clean_title    0
tmdbId         0
              ..
writing        0
wuxia          0
wwii           0
zombie         0
zombies        0
Length: 1158, dtype: int64

## Saving Movies_Full

In [66]:
# movies_full.to_csv("/kaggle/working/movies_full.csv", index=False)

# Ratings_Full Dataset (Collaborative Filtering)
## Movies + Ratings + Tags

In [68]:
ratings_full = ratings.merge(movies, on="movieId", how="left")

In [70]:
# 1) Collapse to a single year and title
ratings_full["year"]  = ratings_full["year_filled"]
ratings_full["title"] = ratings_full["clean_title"]

# 2) Drop redundant/merge-artifact columns
ratings_full = ratings_full.drop(
    columns=[
        "year_x", "year_y", "year_filled",
        "genres",
        "clean_title",
        "first_activity_year",
        "month", "day"
    ],
    errors="ignore"
)

cols = ["userId", "movieId", "rating", "timestamp", "year", "title"]
cols = [c for c in cols if c in ratings_full.columns]
ratings_full = ratings_full[cols]

ratings_full.head()


Unnamed: 0,userId,movieId,rating,timestamp,year,title
0,1,1,4.0,2008-11-03 17:52:19,1995.0,Toy Story
1,1,110,4.0,2008-11-05 06:04:46,1995.0,Braveheart
2,1,158,4.0,2008-11-03 17:31:43,1995.0,Casper
3,1,260,4.5,2008-11-03 18:00:04,1977.0,Star Wars: Episode IV - A New Hope
4,1,356,5.0,2008-11-03 17:58:39,1994.0,Forrest Gump


## Saving Ratings_Full

In [71]:
# ratings_full.to_csv("/kaggle/working/ratings_full.csv", index=False)

### EDA

In [None]:
print(ratings["rating"].describe())
sns.histplot(ratings["rating"], bins=20, kde=True)
plt.title("Distribution of Ratings")
plt.show()
movie_ratings = ratings.groupby("movieId")["rating"].agg(["count", "mean"]).reset_index()
top_movies = movie_ratings.merge(movies, on="movieId").sort_values(by="count", ascending=False).head(10)

plt.barh(top_movies["clean_title"], top_movies["count"])
plt.gca().invert_yaxis()
plt.title("Top 10 Most Rated Movies")
plt.xlabel("Number of Ratings")
plt.show()


In [None]:
genre_ratings = (
    movies.explode("genres")
    .merge(ratings, on="movieId")
    .groupby("genres")["rating"]
    .mean()
    .sort_values(ascending=False)
)

genre_ratings.plot(kind="bar", figsize=(10, 5))
plt.title("Average Rating by Genre")
plt.ylabel("Mean Rating")
plt.show()


In [None]:
ratings["year"] = ratings["timestamp"].dt.year
ratings_per_year = ratings.groupby("year")["rating"].mean()

ratings_per_year.plot(marker='o')
plt.title("Average Rating Over Time")
plt.xlabel("Year")
plt.ylabel("Average Rating")
plt.show()


## Genome Dataset

In [None]:
# Merge genome tags and scores
tag_genome = genome_scores.merge(genome_tags, on="tagId")
tag_genome

In [None]:
top_tags = tag_genome.groupby("tag")["relevance"].mean().sort_values(ascending=False).head(15)

top_tags.plot(kind="barh", title="Top Tags by Mean Relevance")
plt.show()

In [None]:
summary = {
    "Movies": movies.shape,
    "Ratings": ratings.shape,
    "Tags": tags.shape,
    "Links": links.shape,
    "Genome Tags": genome_tags.shape,
    "Genome Scores": genome_scores.shape,
}
pd.DataFrame(summary.items(), columns=["Table", "Shape"])


## Ratings Dataset

In [None]:
# Merge ratings with movies metadata
ratings_full = ratings.merge(movies, on="movieId", how="left")

# Drop the redundant ratings year (if present)
ratings_full = ratings_full.drop(columns=["year_x"], errors="ignore")

# Rename the movie year to just "year"
ratings_full = ratings_full.rename(columns={"year_y": "year"})

ratings_full.head()


In [None]:
ratings_full.isna().sum()


## Sparsity Workaround for Modeling

In [None]:
from scipy.sparse import csr_matrix

user_mapper = {u: i for i, u in enumerate(ratings_full['userId'].unique())}
movie_mapper = {m: i for i, m in enumerate(ratings_full['movieId'].unique())}

user_index = [user_mapper[u] for u in ratings_full['userId']]
movie_index = [movie_mapper[m] for m in ratings_full['movieId']]

R = csr_matrix((ratings_full['rating'], (user_index, movie_index)),
               shape=(len(user_mapper), len(movie_mapper)))
R


In [None]:
user_means = user_item_matrix.mean(axis=1)
user_item_centered = user_item_matrix.sub(user_means, axis=0)


In [None]:
sparsity = 1 - (np.count_nonzero(~user_item_matrix.isna()) /
                float(user_item_matrix.size))
print(f"Sparsity: {sparsity:.2%}")


In [None]:
movie_stats = ratings_full.groupby("movieId").agg(
    mean_rating=("rating", "mean"),
    num_ratings=("rating", "count")
).reset_index()

movie_stats = movie_stats.merge(movies, on="movieId", how="left")
movie_stats.sort_values(by="num_ratings", ascending=False).head()


# Modeling

## Collaborative Filtering (CF)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(movies["genres"])
genre_df = pd.DataFrame(genre_features, columns=mlb.classes_, index=movies["movieId"])
genre_df.head()


In [None]:
genome = genome_scores.merge(genome_tags, on="tagId", how="left")

# Pivot to get movieId × tag relevance matrix
genome_matrix = genome.pivot(index="movieId", columns="tag", values="relevance").fillna(0)
genome_matrix.head()


In [None]:
movie_features = genre_df.join(genome_matrix, how="outer").fillna(0)
movie_features.shape


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity between movies using content features
movie_similarity = cosine_similarity(movie_features)
movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=movie_features.index,
    columns=movie_features.index
)


In [None]:
# Find top-10 movies similar to Toy Story (movieId = 1)
movie_similarity_df.loc[1].sort_values(ascending=False).head(10)


In [None]:
def build_user_profiles(ratings_df, feature_df, min_rating=4.0):
    user_profiles = {}
    for user, group in ratings_df.groupby("userId"):
        liked_movies = group[group["rating"] >= min_rating]["movieId"]
        if not liked_movies.empty:
            profile = feature_df.loc[liked_movies].mean()
            user_profiles[user] = profile
    return pd.DataFrame(user_profiles).T

user_profiles = build_user_profiles(ratings_full, movie_features)
user_profiles.head()


In [None]:
user_item_centered.to_parquet("/kaggle/working/user_item_centered.parquet")
movie_features.to_parquet("/kaggle/working/movie_features.parquet")
user_profiles.to_parquet("/kaggle/working/user_profiles.parquet")


In [None]:
plt.figure(figsize=(8,5))
sns.histplot(ratings_full["rating"], bins=10, kde=True)
plt.title("Distribution of Ratings")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()


In [None]:
movie_counts = ratings_full.groupby("movieId")["rating"].count()
plt.figure(figsize=(8,5))
sns.histplot(movie_counts, bins=100, log_scale=True)
plt.title("Distribution of Number of Ratings per Movie (log scale)")
plt.xlabel("Number of Ratings")
plt.ylabel("Count of Movies")
plt.show()


In [None]:
movie_stats = ratings_full.groupby("movieId")["rating"].agg(["mean", "count"])
plt.figure(figsize=(8,5))
sns.scatterplot(data=movie_stats, x="count", y="mean", alpha=0.5)
plt.xscale("log")
plt.title("Average Rating vs. Rating Count")
plt.xlabel("Number of Ratings (log scale)")
plt.ylabel("Average Rating")
plt.show()


In [None]:
ratings_full["year_rated"] = ratings_full["timestamp"].dt.year
ratings_per_year = ratings_full.groupby("year_rated")["rating"].mean()

plt.figure(figsize=(10,5))
sns.lineplot(x=ratings_per_year.index, y=ratings_per_year.values)
plt.title("Average Rating Over Time")
plt.xlabel("Year Rated")
plt.ylabel("Average Rating")
plt.show()


## Content-Based Filtering (CBF)

In [None]:
from collections import Counter
genre_counts = Counter(g for gs in movies["genres"] for g in gs if g != "(no genres listed)")
pd.Series(genre_counts).sort_values(ascending=False).plot(kind="bar", figsize=(10,5))
plt.title("Number of Movies per Genre")
plt.ylabel("Count")
plt.show()


In [None]:
genre_ratings = (
    movies.explode("genres")
    .merge(ratings, on="movieId")
    .groupby("genres")["rating"]
    .mean()
    .sort_values(ascending=False)
)

genre_ratings.plot(kind="bar", figsize=(10,5))
plt.title("Average Rating by Genre")
plt.ylabel("Mean Rating")
plt.show()


In [None]:
user_counts = ratings_full.groupby("userId")["rating"].count()
sns.histplot(user_counts, bins=50, log_scale=True)
plt.title("Distribution of Number of Ratings per User (log scale)")
plt.xlabel("Number of Ratings per User")
plt.ylabel("Count of Users")
plt.show()


In [None]:
yearly_ratings = ratings_full.groupby("year")["rating"].mean()
plt.figure(figsize=(10,5))
sns.lineplot(x=yearly_ratings.index, y=yearly_ratings.values)
plt.title("Average Rating by Release Year")
plt.xlabel("Release Year")
plt.ylabel("Average Rating")
plt.show()


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_matrix = pd.DataFrame(
    mlb.fit_transform(movies["genres"]),
    columns=mlb.classes_,
    index=movies["movieId"]
)

corr_genre = genre_matrix.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr_genre, cmap="coolwarm", center=0)
plt.title("Genre Co-occurrence Heatmap")
plt.show()
