In [1]:
'''
Tasks
- Replace gl movieId with tmbd id
- Add new 2023-2025 movies in from Letterboxd with new uniqueIds and tmdb ids for movies
'''

'\nTasks\n- Replace gl movieId with tmbd id\n- Add new 2023-2025 movies in from Letterboxd with new uniqueIds and tmdb ids for movies\n'

In [2]:
import os
import pandas as pd
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")

Authenticated successfully!


## Existing GL ratings plus tmdb ids

In [3]:
# Define batch size
batch_size = 1_000_000  # Adjust as needed
offset = 0
all_data = []

while True:
    # Define query with LIMIT and OFFSET
    sample_query = f"""
    SELECT userId, movieId, rating
    FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
    LIMIT {batch_size} OFFSET {offset}
    """
    
    # Fetch batch
    batch_df = client.query(sample_query).to_dataframe()
    
    # Break loop if no more data
    if batch_df.empty:
        print("No more data to fetch. Stopping batch process.")
        break
    
    # Append to list
    all_data.append(batch_df)
    
    # Confirm batch retrieval
    print(f"Batch added: {len(batch_df)} rows (Offset: {offset})")
    
    # Move offset
    offset += batch_size

# Concatenate all batches into final DataFrame
gl_df = pd.concat(all_data, ignore_index=True)

# Display final result
gl_df



Batch added: 1000000 rows (Offset: 0)
Batch added: 1000000 rows (Offset: 1000000)
Batch added: 1000000 rows (Offset: 2000000)
Batch added: 1000000 rows (Offset: 3000000)
Batch added: 1000000 rows (Offset: 4000000)
Batch added: 1000000 rows (Offset: 5000000)
Batch added: 1000000 rows (Offset: 6000000)
Batch added: 1000000 rows (Offset: 7000000)
Batch added: 1000000 rows (Offset: 8000000)
Batch added: 1000000 rows (Offset: 9000000)
Batch added: 1000000 rows (Offset: 10000000)
Batch added: 1000000 rows (Offset: 11000000)
Batch added: 1000000 rows (Offset: 12000000)
Batch added: 1000000 rows (Offset: 13000000)
Batch added: 1000000 rows (Offset: 14000000)
Batch added: 1000000 rows (Offset: 15000000)
Batch added: 1000000 rows (Offset: 16000000)
Batch added: 1000000 rows (Offset: 17000000)
Batch added: 1000000 rows (Offset: 18000000)
Batch added: 1000000 rows (Offset: 19000000)
Batch added: 1000000 rows (Offset: 20000000)
Batch added: 1000000 rows (Offset: 21000000)
Batch added: 1000000 rows 

Unnamed: 0,userId,movieId,rating
0,196806,1,0.5
1,115637,1,0.5
2,133978,1,0.5
3,170909,1,0.5
4,106965,1,0.5
...,...,...,...
32000199,85475,288311,5.0
32000200,103008,288513,5.0
32000201,29875,288987,5.0
32000202,188512,290315,5.0


In [4]:
# Fetch links table
sample_query = """
SELECT *
FROM `film-wizard-453315.Grouplens.raw_grouplens_links`
"""
links_df = client.query(sample_query).to_dataframe()
links_df



Unnamed: 0,movieId,imdbId,tmdbId
0,4470,94675,2
1,61724,92149,3
2,18,113101,5
3,479,107286,6
4,260,76759,11
...,...,...,...
87456,290775,10644684,1179468
87457,291873,28769402,1181568
87458,291775,28650765,1181806
87459,291787,11027288,1182286


In [5]:
initial_gl_tmdb_df = pd.merge(gl_df, links_df, on='movieId', how='inner') #163 rows don't have tmdb ids so are dropped if using 'inner'. This is 28 unique movies.
initial_gl_tmdb_df = initial_gl_tmdb_df[["userId", "movieId", "tmdbId", "rating"]]
initial_gl_tmdb_df

## To check how many unique gl movieIds are being lost with left vs inner join
# nan_movie_ids_count = initial_gl_tmdb_df[initial_gl_tmdb_df['tmdbId'].isna()]['movieId'].nunique()
# nan_movie_ids_count

Unnamed: 0,userId,movieId,tmdbId,rating
0,196806,1,862,0.5
1,115637,1,862,0.5
2,133978,1,862,0.5
3,170909,1,862,0.5
4,106965,1,862,0.5
...,...,...,...,...
31995517,85475,288311,961323,5.0
31995518,103008,288513,346698,5.0
31995519,29875,288987,876797,5.0
31995520,188512,290315,176439,5.0


## Letterboxd ratings plus tmdb ids

In [6]:
# Fetch Letterboxd reviews
sample_query = """
SELECT *
FROM `film-wizard-453315.Letterboxd.letterboxed_user_reviews`
"""
letterboxd_df = client.query(sample_query).to_dataframe()
letterboxd_df['rating_val'] = letterboxd_df['rating_val']/2
letterboxd_df



Unnamed: 0,user_id,movie_id,rating_val
0,w0cks,robot-in-the-family,0.5
1,dreyuhsanchez,step-up-2-the-streets,0.5
2,retjgeesxvg,three-colours-white,0.5
3,NaokiiLol,shes-all-that,0.5
4,asilversnake,unfriended,0.5
...,...,...,...
1199995,critconc,an-average-little-man,4.5
1199996,critconc,nest-2022,4.5
1199997,critconc,song-of-the-sea,4.5
1199998,critconc,das-boot-1985,5.0


In [7]:
# Fetch tmdb data
sample_query = """
SELECT tmdbId, title, release_date
FROM `film-wizard-453315.tmdb_metadata.all_movies_combined`
"""
tmdb_df = client.query(sample_query).to_dataframe()

print('')

tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'])

# 1) Get the data types of each column
print("Data types of each column:")
print(tmdb_df.dtypes)

print('')

# 2) Count the non-NaN values
release_date_count = tmdb_df['title'].count()
print("Non-NaN values in title column:", release_date_count)

release_date_count = tmdb_df['release_date'].count()
print("Non-NaN values in release_date column:", release_date_count)

tmdb_df




Data types of each column:
tmdbId                   Int64
title                   object
release_date    datetime64[ns]
dtype: object

Non-NaN values in title column: 108680
Non-NaN values in release_date column: 108631


Unnamed: 0,tmdbId,title,release_date
0,122662,Mardock Scramble: The Third Exhaust,NaT
1,260477,Freedom River,NaT
2,458815,Dhogs,NaT
3,82663,Midnight Man,NaT
4,368224,Time of tears,NaT
...,...,...,...
108686,1241982,Moana 2,2024-11-21
108687,762509,Mufasa: The Lion King,2024-12-18
108688,1064213,Anora,2024-10-14
108689,1126166,Flight Risk,2025-01-22


In [8]:
from rapidfuzz import process, fuzz
import pandas as pd
import re
from tqdm import tqdm  # Import tqdm for progress tracking

# Function to clean movie titles for better fuzzy matching
def clean_title(title):
    """
    Cleans a movie title by:
    - Converting to lowercase
    - Replacing hyphens with spaces
    - Removing all characters except letters, numbers, and spaces
    """
    if not isinstance(title, str):  # Handle None values
        return ""
    title = title.lower()
    title = title.replace(" - ", " ")  # Normalize spaces around hyphens
    title = title.replace("-", " ")    # Convert remaining hyphens to spaces
    title = re.sub(r'[^a-z0-9 ]', '', title)  # Remove punctuation
    return title.strip()

# Function to extract the release year from the title
def extract_year(title):
    """
    Extracts a 4-digit year from a movie title, if present.
    Returns the year as an integer or None if no year is found.
    """
    if not isinstance(title, str):  # Handle None values
        return None
    match = re.search(r'\b(19|20)\d{2}\b', title)  # Look for years between 1900-2099
    return int(match.group()) if match else None

# Apply preprocessing to clean movie titles
letterboxd_df['clean_movie_id'] = letterboxd_df['movie_id'].apply(clean_title)
tmdb_df['clean_title'] = tmdb_df['title'].apply(clean_title)

# Extract years from both dataframes
letterboxd_df['year'] = letterboxd_df['movie_id'].apply(extract_year)
tmdb_df['year'] = tmdb_df['title'].apply(extract_year)

# Convert TMDB titles into a list for faster lookup
tmdb_titles = tmdb_df['clean_title'].tolist()

# Lists to store match results
matched_titles = []
fuzzy_scores = []
matched_tmdb_ids = []

# Perform fuzzy matching for each movie in the Letterboxd dataset with a progress bar
for index, row in tqdm(letterboxd_df.iterrows(), total=len(letterboxd_df), desc="Matching titles"):
    movie_id = row['clean_movie_id']
    movie_year = row['year']  # Extract year for additional filtering

    if not movie_id:  # Skip empty movie IDs
        matched_titles.append(None)
        fuzzy_scores.append(None)
        matched_tmdb_ids.append(None)
        continue

    # Get top 5 matches using a strict ratio-based scoring system
    matches = process.extract(movie_id, tmdb_titles, scorer=fuzz.ratio, limit=5)

    # Filter to only matches that score above 90 (strong match)
    valid_matches = [match for match in matches if match[1] >= 90]

    # If a release year is available, prefer matches with the same year
    if movie_year:
        year_matched_df = tmdb_df[
            (tmdb_df['clean_title'].isin([match[0] for match in valid_matches])) & 
            (tmdb_df['year'] == movie_year)
        ]
        if not year_matched_df.empty:
            best_match = (year_matched_df.iloc[0]['clean_title'], 100)  # Assign perfect score
        else:
            best_match = valid_matches[0] if valid_matches else matches[0]
    else:
        best_match = valid_matches[0] if valid_matches else matches[0]

    # Retrieve the original TMDB title and ID
    match_row = tmdb_df.loc[tmdb_df['clean_title'] == best_match[0]]
    matched_titles.append(match_row['title'].values[0] if not match_row.empty else None)
    fuzzy_scores.append(best_match[1])
    matched_tmdb_ids.append(match_row['tmdbId'].values[0] if not match_row.empty else None)

# Add match results to DataFrame
letterboxd_df['matched_title'] = matched_titles
letterboxd_df['fuzzy_score'] = fuzzy_scores
letterboxd_df['matched_tmdbId'] = matched_tmdb_ids  # Add tmdbId to final DataFrame

# Display the first 50 results
letterboxd_df.head(50)

Matching titles: 100%|█████████████| 1200000/1200000 [10:37:56<00:00, 31.35it/s]


Unnamed: 0,user_id,movie_id,rating_val,clean_movie_id,year,matched_title,fuzzy_score,matched_tmdbId
0,w0cks,robot-in-the-family,0.5,robot in the family,,In the Family,81.25,82929
1,dreyuhsanchez,step-up-2-the-streets,0.5,step up 2 the streets,,Step Up 2: The Streets,100.0,8328
2,retjgeesxvg,three-colours-white,0.5,three colours white,,Three Colors: White,97.297297,109
3,NaokiiLol,shes-all-that,0.5,shes all that,,She's All That,100.0,10314
4,asilversnake,unfriended,0.5,unfriended,,Unfriended,100.0,277685
5,jimmyjone,acrimony,0.5,acrimony,,Acrimony,100.0,464502
6,Maartendehaan,amsterdam-2022,0.5,amsterdam 2022,2022.0,Amsterdam,78.26087,664469
7,charmofhappines,the-drop-2014,0.5,the drop 2014,2014.0,The Drop,76.190476,85792
8,Kettik,your-name,0.5,your name,,Your Name.,100.0,372058
9,giovannafiorio,the-munsters,0.5,the munsters,,The Munsters,100.0,804413


In [9]:
letterboxd_df = letterboxd_df[['user_id', 'matched_tmdbId', 'rating_val']]

In [10]:
# Example: Get the maximum existing userId from gl_df
max_user_id = gl_df['userId'].max()

# Create a mapping of unique user_id values to new sequential numbers
unique_users = letterboxd_df['user_id'].unique()
user_id_mapping = {user: idx for idx, user in enumerate(unique_users, start=max_user_id + 1)}

# Apply the mapping to the user_id column
letterboxd_df['new_user_id'] = letterboxd_df['user_id'].map(user_id_mapping)

# Display the updated dataframe
letterboxd_df = letterboxd_df[['new_user_id', 'matched_tmdbId', 'rating_val']]
letterboxd_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  letterboxd_df['new_user_id'] = letterboxd_df['user_id'].map(user_id_mapping)


Unnamed: 0,new_user_id,matched_tmdbId,rating_val
0,200949,82929,0.5
1,200950,8328,0.5
2,200951,109,0.5
3,200952,10314,0.5
4,200953,277685,0.5
...,...,...,...
1199995,228240,54561,4.5
1199996,228240,1418384,4.5
1199997,228240,110416,4.5
1199998,228240,387,5.0


In [11]:
letterboxd_df = letterboxd_df.rename(columns={'new_user_id': 'userId', 
                                      'matched_tmdbId': 'movieId', 
                                      'rating_val': 'rating'})
master_df = pd.concat([gl_df, letterboxd_df], ignore_index=True)
master_df

Unnamed: 0,userId,movieId,rating
0,196806,1,0.5
1,115637,1,0.5
2,133978,1,0.5
3,170909,1,0.5
4,106965,1,0.5
...,...,...,...
33200199,228240,54561,4.5
33200200,228240,1418384,4.5
33200201,228240,110416,4.5
33200202,228240,387,5.0


In [12]:
master_df.to_csv("master_df.csv", index=False)

In [13]:
!pwd

/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/notebooks
