In [7]:
import pandas as pd
from datetime import datetime
# Define a function to load a gzipped TSV file in chunks
def load_gzipped_tsv_in_chunks(file_path, chunk_size: int = 10000) -> pd.DataFrame:
    """Read a tab-separated file in fixed-size row chunks and return one DataFrame.

    The file is parsed ``chunk_size`` rows at a time and the pieces are
    concatenated at the end, so the returned frame still holds the entire
    file in memory.

    Parameters
    ----------
    file_path
        Path of the tab-separated file, passed straight to ``pd.read_csv``.
        No ``compression`` argument is given, so pandas' default handling
        applies (inferring gzip from a ``.gz`` file name).
    chunk_size : int, default 10000
        Number of rows parsed per chunk.

    Returns
    -------
    pd.DataFrame
        All rows of the file, with a fresh 0..n-1 index
        (``ignore_index=True``).
    """
    # Use the chunked reader as a context manager so the underlying file
    # handle is closed even if parsing fails part-way through the file.
    with pd.read_csv(
        file_path,
        delimiter='\t',
        encoding='utf-8',
        chunksize=chunk_size,
    ) as reader:
        chunks = list(reader)
    return pd.concat(chunks, ignore_index=True)



In [3]:
# Load the pre-filtered title metadata and preview its first rows.
title_basics = load_gzipped_tsv_in_chunks(
    'imdb_flat_files/filtered_title_basics.tsv.gz'
)
display(title_basics.head())

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
# Load the per-title ratings (averageRating, numVotes) and preview them.
title_ratings = load_gzipped_tsv_in_chunks(
    'imdb_flat_files/title.ratings.tsv.gz'
)
display(title_ratings.head())

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2786


In [10]:
# Select recent movies from the title metadata.
#
# Step 1: make startYear a usable integer column.
# Values that cannot be parsed as numbers become NaN (presumably the '\N'
# placeholder seen in other columns of this data), those rows are dropped,
# and the remaining years are cast to int.
# This is done as one chain that builds a new frame, rather than assigning
# columns into the raw frame and into the result of dropna() (a pattern
# pandas may flag with SettingWithCopyWarning).
title_basics = (
    title_basics
    .assign(startYear=lambda df: pd.to_numeric(df['startYear'], errors='coerce'))
    .dropna(subset=['startYear'])
    .astype({'startYear': int})
)

# Step 2: anchor the window on today's date.
# NOTE: the selection therefore depends on when the notebook is run.
current_year = datetime.now().year

# Step 3: keep movies whose startYear lies in [current_year - 3, current_year].
# Both bounds are inclusive, so the window covers the current calendar year
# plus the three years before it.
is_recent = title_basics['startYear'].between(current_year - 3, current_year)
is_movie = title_basics['titleType'] == 'movie'
last_three_years = title_basics[is_recent & is_movie]

# Preview the filtered list of movies
display(last_three_years.head())

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13079,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,\N,94,Documentary
69146,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,\N,78,Drama
76047,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,\N,46,Documentary
94097,tt0096235,movie,Taxi Killer,Taxi Killer,0,2022,\N,106,"Action,Crime,Drama"
95587,tt0097767,movie,Loading Ludwig,Loading Ludwig,0,2022,\N,65,\N


In [12]:
# Attach ratings to the recent movies, matching rows on tconst.
# A left join keeps every movie; titles with no ratings row end up with
# NaN in averageRating and numVotes.
merged_df = last_three_years.merge(title_ratings, how='left', on='tconst')

# Persist the merged frame as a pickle file for later use.
merged_df.to_pickle('last_three_years_movies_with_ratings.pkl')

In [14]:
# Show the first merged rows, then the non-null count of every column.
# A lower count for averageRating/numVotes than for the other columns
# means that many movies had no matching ratings row.
display(merged_df.head(), merged_df.count())

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,\N,94,Documentary,6.6,71.0
1,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,\N,78,Drama,7.5,59.0
2,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,\N,46,Documentary,,
3,tt0096235,movie,Taxi Killer,Taxi Killer,0,2022,\N,106,"Action,Crime,Drama",5.6,73.0
4,tt0097767,movie,Loading Ludwig,Loading Ludwig,0,2022,\N,65,\N,7.0,6.0


tconst            68006
titleType         68006
primaryTitle      68006
originalTitle     68006
isAdult           68006
startYear         68006
endYear           68006
runtimeMinutes    68006
genres            68006
averageRating     32777
numVotes          32777
dtype: int64