In [8]:
import pandas as pd
import gzip
from datetime import datetime
import shutil
import os

def load_gzipped_tsv_in_chunks(file_path, chunk_size=10000, **read_csv_kwargs):
    """Read a tab-separated file into a single DataFrame, parsing it in chunks.

    The file is parsed ``chunk_size`` rows at a time and the pieces are
    concatenated, so the full table still ends up in memory; chunking only
    bounds the size of each individual parse step.

    Parameters
    ----------
    file_path : str or path-like
        Path to the TSV file. No explicit ``compression`` is passed, so
        pandas' default handling applies (a ``.gz`` suffix is read as gzip).
    chunk_size : int, default 10000
        Number of rows parsed per chunk.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``; they
        override the defaults used here (tab delimiter, UTF-8 encoding).
        Useful examples for the IMDb dumps: ``na_values='\\N'`` to turn the
        ``\\N`` placeholder into real missing values, ``dtype=...`` to pin
        column types, ``usecols=...`` to load fewer columns.
        Do not pass ``chunksize`` here; use ``chunk_size`` instead.

    Returns
    -------
    pandas.DataFrame
        All rows of the file with a fresh 0..n-1 index.

    Notes
    -----
    Without an explicit ``dtype`` each chunk has its column types inferred
    on its own, so a column can come back as ``object`` holding a mix of
    numbers and strings once the chunks are concatenated.
    """
    options = {'encoding': 'utf-8'}
    options.update(read_csv_kwargs)
    # pandas rejects `sep` and `delimiter` together, so only apply the tab
    # default when the caller has not chosen a separator themselves.
    if 'sep' not in options:
        options.setdefault('delimiter', '\t')

    reader = pd.read_csv(file_path, chunksize=chunk_size, **options)
    return pd.concat(list(reader), ignore_index=True)

# Note: The large input file should be downloaded from https://datasets.imdbws.com/
input_file = 'imdb_flat_files/title.basics.tsv.gz'
output_file = 'imdb_flat_files/filtered_title_basics.tsv.gz'

# Read the compressed TSV file
df = load_gzipped_tsv_in_chunks(input_file)

# Keep only titles whose start year is within ten years of the current year.
current_year = datetime.now().year
threshold_year = current_year - 10

# Convert 'startYear' to nullable Int64; values that cannot be parsed as a
# number become <NA> instead of raising.
df['startYear'] = pd.to_numeric(df['startYear'], errors='coerce').astype('Int64')
# The explicit notna() guard keeps rows with a missing year out of the result.
filtered_df = df[(df['startYear'] >= threshold_year) & (df['startYear'].notna())]
display(filtered_df.head())

# Write the filtered rows straight to a gzip-compressed TSV. Letting pandas
# compress avoids the uncompressed intermediate file in the working directory
# (and the risk of leaving it behind if a later step fails).
filtered_df.to_csv(
    output_file,
    sep='\t',
    index=False,
    encoding='utf-8',
    compression='gzip',
)

print(f"Filtered and compressed file saved as {output_file}")

# Reuse the already-filtered in-memory frame for merging (nothing is re-read
# from disk here; `title_basics` is just another name for `filtered_df`).
title_basics = filtered_df
title_ratings = load_gzipped_tsv_in_chunks('imdb_flat_files/title.ratings.tsv.gz')
display(title_ratings.head())

# Merge the filtered titles with the ratings data on tconst. A left join keeps
# titles that have no rating; their rating columns are NaN.
merged_df = pd.merge(filtered_df, title_ratings, on='tconst', how='left')

# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
merged_df.info()

# Filter to include only movies
merged_df = merged_df[merged_df['titleType'] == 'movie']
merged_df.info()

# Narrow down to recent releases.
# NOTE(review): `current_year - 3` with `>=` spans four calendar years
# (current year plus the three before it) - confirm this matches the
# "last three years" intent of the output filename.
threshold_year = current_year - 3
merged_df = merged_df[(merged_df['startYear'] >= threshold_year) & (merged_df['startYear'].notna())]
display(merged_df.head())

# Save the resulting DataFrame as a pickle file
merged_df.to_pickle('last_three_years_movies_with_ratings.pkl')

          tconst titleType                      primaryTitle  \
11634  tt0011801     movie                  Tötet nicht mehr   
13079  tt0013274     movie       Istoriya grazhdanskoy voyny   
55749  tt0056840     short                          Aufsätze   
56272  tt0057369     short  Number 14: Late Superimpositions   
59194  tt0060361     short                          EMS nr 1   

                          originalTitle isAdult  startYear endYear  \
11634                  Tötet nicht mehr       0       2019      \N   
13079       Istoriya grazhdanskoy voyny       0       2021      \N   
55749                          Aufsätze       0       2021      \N   
56272  Number 14: Late Superimpositions       0       2023      \N   
59194                          EMS nr 1       0       2016      \N   

      runtimeMinutes        genres  
11634             \N  Action,Crime  
13079             94   Documentary  
55749             10         Short  
56272             30         Short  
59194    

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2786


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4479672 entries, 0 to 4479671
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         object 
 5   startYear       Int64  
 6   endYear         object 
 7   runtimeMinutes  object 
 8   genres          object 
 9   averageRating   float64
 10  numVotes        float64
dtypes: Int64(1), float64(2), object(8)
memory usage: 414.4+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195091 entries, 0 to 4479654
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          195091 non-null  object 
 1   titleType       195091 non-null  object 
 2   primaryTitle    195091 non-null  object 
 3   originalTitle   195091 non-null  object 
 4   isAdult         195091 non-null  object 
 5   startYear       195091 non-null  Int64  
 6   endYear         195091 non-null  object 
 7   runtimeMinutes  195091 non-null  object 
 8   genres          195091 non-null  object 
 9   averageRating   101899 non-null  float64
 10  numVotes        101899 non-null  float64
dtypes: Int64(1), float64(2), object(8)
memory usage: 18.0+ MB


None

       tconst titleType                       primaryTitle  \
1   tt0013274     movie        Istoriya grazhdanskoy voyny   
9   tt0070596     movie                  Socialist Realism   
10  tt0077684     movie  Histórias de Combóios em Portugal   
11  tt0096235     movie                        Taxi Killer   
12  tt0097767     movie                     Loading Ludwig   

                        originalTitle isAdult  startYear endYear  \
1         Istoriya grazhdanskoy voyny       0       2021      \N   
9              El realismo socialista       0       2023      \N   
10  Histórias de Combóios em Portugal       0       2022      \N   
11                        Taxi Killer       0       2022      \N   
12                     Loading Ludwig       0       2022      \N   

   runtimeMinutes              genres  averageRating  numVotes  
1              94         Documentary            6.6      71.0  
9              78               Drama            7.5      59.0  
10             46      

In [7]:
# Print a summary of the final merged frame: index range, per-column non-null
# counts and dtypes, and memory usage.
# NOTE(review): this cell's execution count (7) is lower than the preceding
# cell's (8), so the output below may predate the latest run of that cell -
# re-run top to bottom on a fresh kernel to confirm.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68917 entries, 1 to 4479219
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          68917 non-null  object 
 1   titleType       68917 non-null  object 
 2   primaryTitle    68917 non-null  object 
 3   originalTitle   68917 non-null  object 
 4   isAdult         68917 non-null  object 
 5   startYear       68917 non-null  Int64  
 6   endYear         68917 non-null  object 
 7   runtimeMinutes  68917 non-null  object 
 8   genres          68917 non-null  object 
 9   averageRating   32777 non-null  float64
 10  numVotes        32777 non-null  float64
dtypes: Int64(1), float64(2), object(8)
memory usage: 6.4+ MB
