In [1]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import gzip
import requests
import os
display(HTML("<style>.container { width:90% !important; }</style>"))

Downloadd and prepare film data for recommendations

In [2]:
def download_data(url, name_to_save):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(name_to_save, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

In [3]:
#download all imdb datasets
download_data('https://datasets.imdbws.com/title.basics.tsv.gz', 'data/film_data/movie_basics.gz') #film basic metadata
download_data('https://datasets.imdbws.com/title.ratings.tsv.gz', 'data/film_data/movie_ratings.gz') #film avg rating and score count
download_data('https://datasets.imdbws.com/title.crew.tsv.gz', 'data/film_data/movie_crew.gz') #film director and other crew (id's only)
download_data('https://datasets.imdbws.com/name.basics.tsv.gz', 'data/film_data/names_basics.gz')#director names


In [4]:
# read Gzip TSV into dataframe function
def read_gzipped_tsv_to_dataframe(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        dataframe = pd.read_csv(file, sep='\t', low_memory=False)
    return dataframe

In [5]:
# read Gzip TSV into dataframe function
def read_gzipped_tsv_to_dataframe(file_path):
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        dataframe = pd.read_csv(file, sep='\t', low_memory=False)
    return dataframe

In [18]:
movie_basics_df = read_gzipped_tsv_to_dataframe('data/film_data/movie_basics.gz')
movie_ratings_df = read_gzipped_tsv_to_dataframe('data/film_data/movie_ratings.gz')
movie_crew_df = read_gzipped_tsv_to_dataframe('data/film_data/movie_crew.gz')
names_basics_df = read_gzipped_tsv_to_dataframe('data/film_data/names_basics.gz')

In [10]:
# Basic info function

def generate_info(dataframes, names):
    for df, name in zip(dataframes, names):
        print(f"EDA for {name}")        
        print(f"Number of duplicate rows: {df.duplicated().sum()}")  
        print("\nNumber of N/A values")
        print(df.isna().sum())
        print("\nBasic Information:")
        print(df.info())        
        print("\nFirst 5 rows:")
        print(df.head())        
        print("\nSummary Statistics:")
        print(df.describe())
        
        


dataframes = [movie_crew_df, movie_ratings_df, movie_basics_df, names_basics_df]
names = ['Movie Crew', 'Movie Ratings', 'Movie Basics', 'Names Basics']

generate_info(dataframes, names)



EDA for Movie Crew
Number of duplicate rows: 0

Number of N/A values
tconst       0
directors    0
writers      0
dtype: int64

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9818209 entries, 0 to 9818208
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   tconst     object
 1   directors  object
 2   writers    object
dtypes: object(3)
memory usage: 224.7+ MB
None

First 5 rows:
      tconst  directors writers
0  tt0000001  nm0005690      \N
1  tt0000002  nm0721526      \N
2  tt0000003  nm0721526      \N
3  tt0000004  nm0721526      \N
4  tt0000005  nm0005690      \N

Summary Statistics:
           tconst directors  writers
count     9818209   9818209  9818209
unique    9818209    872725  1260406
top     tt0000001        \N       \N
freq            1   4178478  4745814
EDA for Movie Ratings
Number of duplicate rows: 0

Number of N/A values
tconst           0
averageRating    0
numVotes         0
dtype: int64

Basic Information:
<cla

In [28]:
dataframes = [movie_basics_df, movie_ratings_df, movie_crew_df, names_basics_df]

def count_n_per_column(df):
    return (df == '\\N').sum()

for i, df in enumerate(dataframes, start=1):
    counts = count_n_per_column(df)
    print(f"Dataframe {i}:")
    print(counts)
    print()

Dataframe 1:
tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear          90355
endYear           643809
runtimeMinutes         0
genres             71964
dtype: int64

Dataframe 2:
tconst           0
averageRating    0
numVotes         0
dtype: int64

Dataframe 3:
tconst             0
directors    4178478
writers      4745814
dtype: int64

Dataframe 4:
nconst                      0
primaryName                 0
birthYear            11917791
deathYear            12280516
primaryProfession           0
knownForTitles        2177791
dtype: int64



In [15]:
def replace_n_with_na(df):
    return df.replace('\\N', np.nan)

for i, df in enumerate(dataframes, start=1):
    dataframes[i-1] = replace_n_with_na(df)


In [20]:
#only gonna recommend movies for movie recommendation system
movie_basics_df = movie_basics_df[movie_basics_df['titleType']=='movie']
movie_basics_df['titleType'].value_counts()

movie    643809
Name: titleType, dtype: int64

In [23]:
#over 200k runtimes are missing, going to fill in with genres median

movie_basics_df['runtimeMinutes'] = pd.to_numeric(movie_basics_df['runtimeMinutes'], errors='coerce') #convert obj to numeric
movie_basics_df_expanded = movie_basics_df.explode('genres') #split genres
median_runtimes_by_genre = movie_basics_df_expanded.groupby('genres')['runtimeMinutes'].median() #get median of each genre
movie_basics_df = movie_basics_df.merge(median_runtimes_by_genre, on='genres', suffixes=('', '_median')) #merge both df
movie_basics_df['runtimeMinutes'].fillna(movie_basics_df['runtimeMinutes_median'], inplace=True) #fill na values with median
movie_basics_df.drop('runtimeMinutes_median', axis=1, inplace=True) #drop median column

movie_basics_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_basics_df['runtimeMinutes'] = pd.to_numeric(movie_basics_df['runtimeMinutes'], errors='coerce')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 643809 entries, 0 to 643808
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          643809 non-null  object 
 1   titleType       643809 non-null  object 
 2   primaryTitle    643809 non-null  object 
 3   originalTitle   643809 non-null  object 
 4   isAdult         643809 non-null  object 
 5   startYear       643809 non-null  object 
 6   endYear         643809 non-null  object 
 7   runtimeMinutes  643690 non-null  float64
 8   genres          643809 non-null  object 
dtypes: float64(1), object(8)
memory usage: 49.1+ MB


In [29]:
#data cleanup
movie_basics_df['startYear'] = pd.to_numeric(movie_basics_df['startYear'], errors='coerce')
movie_basics_df.drop(['originalTitle', 'endYear', 'titleType'], axis=1, inplace=True) #not useful columns
movie_basics_df.dropna(inplace=True) # drow n/a rows based on the assumption that movies with a lot of missing data arent good movies to recommend anyway
movie_crew_df.drop(['writers'], axis=1, inplace=True) #not using this
movie_crew_df.dropna(inplace=True) #no point in keeping if there are no values
names_basics_df = names_basics_df[['nconst', 'primaryName']] #drop useless column
names_basics_df.dropna(inplace=True) # drow n/a rows

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_basics_df.dropna(inplace=True) # drow n/a rows


In [34]:
#join all dfs
movie_crew_df['first_director'] = movie_crew_df['directors'].str.split(',').str[0]
temp1 = movie_basics_df.merge(movie_ratings_df, on='tconst', how='left')
temp2 = temp1.merge(movie_crew_df, on='tconst', how='left')
merged_data = temp2.merge(names_basics_df, left_on='first_director', right_on='nconst', how='left')

In [39]:
# merged_data.isna().sum()
#since over half of the movies seem to be missing avarage rating i'll just drop those, because recomendations on them would be fairly random
merged_data.dropna(inplace=True)
merged_data.drop(['directors', 'first_director', 'nconst'], axis=1, inplace=True) #name is enough if i decide to use it


#genres are in 1 column, lets 1-hot encode them
genres_one_hot = merged_data['genres'].str.get_dummies(sep=',')
merged_data_encoded = pd.concat([merged_data, genres_one_hot], axis=1)
merged_data_encoded.drop('genres', axis=1, inplace=True)

merged_data_encoded

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes,primaryName,Action,Adult,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,\N
0,tt0000009,Miss Jerry,0,1894.0,45.0,5.3,204.0,Alexander Black,0,0,...,0,1,0,0,0,0,0,0,0,0
4,tt0006456,La vie de Bohème,0,1916.0,50.0,7.1,31.0,Albert Capellani,0,0,...,0,1,0,0,0,0,0,0,0,0
12,tt0007834,The Dancer's Peril,0,1917.0,50.0,6.0,20.0,Travers Vale,0,0,...,0,1,0,0,0,0,0,0,0,0
17,tt0008658,Tavasz a télben,0,1918.0,96.0,4.2,25.0,Michael Curtiz,0,0,...,0,1,0,0,0,0,0,0,0,0
18,tt0008930,Broadway Love,0,1918.0,60.0,5.8,175.0,Ida May Park,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553389,tt8363914,No Safe Spaces,0,2019.0,100.0,7.3,2591.0,Justin Folk,0,0,...,0,0,0,0,0,0,0,0,0,0
553390,tt8413518,Commanding Cue,0,2019.0,86.0,7.1,17.0,Dominic Andreozzi,0,0,...,0,0,1,0,0,0,0,0,0,0
553393,tt8896632,Max Winslow and the House of Secrets,0,2019.0,98.0,5.1,2254.0,Sean Olson,0,0,...,0,0,1,0,0,0,1,0,0,0
553395,tt9081788,"Hi, A.I.",0,2019.0,90.0,6.4,144.0,Isabella Willinger,0,0,...,0,0,1,0,0,0,0,0,0,0


In [44]:
merged_data_encoded['\\N'].value_counts() #almost 10k movies have no genre, bUt having no genre maybe useful as well

0    278152
1      9367
Name: \N, dtype: int64

In [46]:
# Data should be prety much done, so we can save it to csv and move on to the recommendation systems
merged_data_encoded.to_csv('data/film_data/prepared_film_data.csv', index=False)