In [1]:
# imports
import json
import os
import urllib
import urllib.request
from zipfile import ZipFile

import pandas as pd
from tqdm.auto import tqdm


In [2]:
# function to download the data in csv format
def download_data(link, folder):
    """Download a zip archive into ``folder`` and extract it there.

    Args:
        link: URL of the archive. The text after the last '/' is used as the
            local file name.
        folder: Destination directory for both the zip file and its extracted
            contents. It is created (including parents) when missing.

    Errors from the download or from opening/extracting the archive are not
    caught here and propagate to the caller.
    """
    # urlretrieve does not create missing directories, so make sure the
    # destination exists before downloading into it.
    os.makedirs(folder, exist_ok=True)

    zip_file_name = link.rsplit('/', 1)[-1]
    zip_full_path = os.path.join(folder, zip_file_name)
    urllib.request.urlretrieve(link, filename=zip_full_path)

    with ZipFile(zip_full_path, 'r') as z_object:
        z_object.extractall(path=folder)


In [3]:
# MovieLens 25M archive; download_data saves the zip into `folder` and
# extracts it there (later cells read data/ml-25m/movies.csv).
link = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
folder = 'data'
download_data(link, folder)


In [4]:
def read_secrets() -> dict:
    """Load the contents of ``secrets.json`` from the working directory.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist.
    """
    secrets_path = 'secrets.json'
    try:
        handle = open(secrets_path, mode='r')
    except FileNotFoundError:
        # no secrets file present: callers get an empty mapping
        return {}
    with handle:
        return json.load(handle)


In [5]:
# Parsed secrets.json (empty dict if the file is missing); the download
# cells below read secrets['api_key'] from it.
secrets = read_secrets()


In [6]:
# movies table from the extracted MovieLens archive
df_movies = pd.read_csv('data/ml-25m/movies.csv')


In [7]:
# ids that the download loops below request one by one
# NOTE(review): these are MovieLens `movieId` values, but they are later
# placed into TMDB URLs as the movie id. MovieLens appears to ship a separate
# links.csv with a `tmdbId` column for that mapping -- confirm whether the
# downloaded records really correspond to the MovieLens titles.
movie_id_list = df_movies['movieId'].to_list()


In [8]:
# function to generate the link to download metadata json file
def generate_metadata_link(movie_id, api_key):
    """Build the TMDB API URL for one movie's metadata.

    Args:
        movie_id: Id placed in the URL path.
        api_key: API key sent as the ``api_key`` query parameter.

    Returns:
        The request URL as a string.
    """
    # Use the key that was passed in rather than reaching for the global
    # `secrets`, so the function works on its own.
    link = 'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}'.format(
        movie_id=movie_id, api_key=api_key)
    return link


In [9]:
# function to generate file path to write the downloaded json file to
def generate_write_file_path(movie_id, folder):
    """Return ``<folder>/<movie_id>.json``, the path a download is saved to."""
    json_name = str(movie_id) + '.json'
    return '/'.join((folder, json_name))
    

In [10]:
# download metadata from TMDB for each movie in movies.csv
folder_name = 'data/TMD-api/movies'
# urlretrieve cannot create missing directories; without this every request
# would fail (and be recorded as 0) when the folder does not exist yet.
os.makedirs(folder_name, exist_ok=True)

# One entry per id in movie_id_list, in the same order:
# 1 = the JSON file is on disk, 0 = the download failed.
movies_api_success_list = []
for movie_id in tqdm(movie_id_list):
    link = generate_metadata_link(movie_id, secrets['api_key'])
    file_path = generate_write_file_path(movie_id, folder_name)

    if os.path.isfile(file_path):
        # file exists from an earlier run; skip the network call
        movies_api_success_list.append(1)
        continue

    # file does not exist
    try:
        urllib.request.urlretrieve(link, filename=file_path)
    except Exception:
        # Best effort: record the failure and keep going. Catching Exception
        # (instead of a bare except) still lets Ctrl-C stop a long run.
        # Drop any partially written file so a re-run retries this id
        # instead of counting the leftover as a finished download.
        if os.path.isfile(file_path):
            os.remove(file_path)
        movies_api_success_list.append(0)
    else:
        movies_api_success_list.append(1)


  0%|          | 0/62423 [00:00<?, ?it/s]

In [11]:
# Pair every requested id with its download flag (1 = file on disk,
# 0 = request failed); both lists are in the same order.
df_movies_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': movies_api_success_list})


In [12]:
# persist the per-id download status (row index is not written)
df_movies_api_download.to_csv(
    'data/TMD-api/movies-api-download-success.csv', index=False)


In [13]:
# get list of all json files in movies folder
path = 'data/TMD-api/movies'
# Keep only *.json entries (anything else in the folder would break the
# json.load step later) and sort them, since os.listdir returns entries in
# arbitrary order.
movies_json_files = sorted(
    name for name in os.listdir(path) if name.endswith('.json'))


In [14]:
# number of entries found in the movies folder
len(movies_json_files)


33618

In [15]:
# function to generate file path to read the json file from
def generate_read_file_path(file, folder):
    """Return ``<folder>/<file>``, the path a downloaded JSON is read from."""
    return '/'.join((folder, file))


In [16]:
# folder the movie JSON files are read from in the next cell
folder_name = 'data/TMD-api/movies'


In [17]:
# Collect one normalized frame per JSON file and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
movies_metadata_frames = []

for file in tqdm(movies_json_files):
    file_path = generate_read_file_path(file, folder_name)
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as json_data:
        data = json.load(json_data)
    movies_metadata_frames.append(pd.json_normalize(data))

# pd.concat raises on an empty list, so fall back to an empty frame.
if movies_metadata_frames:
    df_movies_metadata = pd.concat(movies_metadata_frames)
else:
    df_movies_metadata = pd.DataFrame()


  0%|          | 0/33618 [00:00<?, ?it/s]

In [18]:
# row count of the combined metadata frame
len(df_movies_metadata)


33618

In [19]:
# preview the first rows of the combined metadata
df_movies_metadata.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,Released,A Disgrace to Criminals Everywhere.,"Lock, Stock and Two Smoking Barrels",False,8.1,5798,,,,
0,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,Released,,Punish Me,False,4.6,16,,,,
0,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,Released,"There is no safe harbor, there is no escape......",The Great Los Angeles Earthquake,False,6.9,13,,,,
0,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,Released,,The Worst Horror Movie Ever Made,False,3.0,8,,,,
0,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,Released,,Meshuggah - Nothing,False,4.0,2,,,,


In [20]:
# write the combined metadata to CSV; index=False leaves out the row index
df_movies_metadata.to_csv('data/movies-metadata.csv', index=False)


In [21]:
# function to generate the link to download credits json file
def generate_credits_link(movie_id, api_key):
    """Build the TMDB API URL for one movie's credits.

    Args:
        movie_id: Id placed in the URL path.
        api_key: API key sent as the ``api_key`` query parameter.

    Returns:
        The request URL as a string.
    """
    # Use the key that was passed in rather than reaching for the global
    # `secrets`, so the function works on its own.
    link = 'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}'.format(
        movie_id=movie_id, api_key=api_key)
    return link


In [22]:
# download credits from TMDB for each movie in movies.csv
folder_name = 'data/TMD-api/credits'
# urlretrieve cannot create missing directories; without this every request
# would fail (and be recorded as 0) when the folder does not exist yet.
os.makedirs(folder_name, exist_ok=True)

# One entry per id in movie_id_list, in the same order:
# 1 = the JSON file is on disk, 0 = the download failed.
credits_api_success_list = []
for movie_id in tqdm(movie_id_list):
    link = generate_credits_link(movie_id, secrets['api_key'])
    file_path = generate_write_file_path(movie_id, folder_name)

    if os.path.isfile(file_path):
        # file exists from an earlier run; skip the network call
        credits_api_success_list.append(1)
        continue

    # file does not exist
    try:
        urllib.request.urlretrieve(link, filename=file_path)
    except Exception:
        # Best effort: record the failure and keep going. Catching Exception
        # (instead of a bare except) still lets Ctrl-C stop a long run.
        # Drop any partially written file so a re-run retries this id
        # instead of counting the leftover as a finished download.
        if os.path.isfile(file_path):
            os.remove(file_path)
        credits_api_success_list.append(0)
    else:
        credits_api_success_list.append(1)


  0%|          | 0/62423 [00:00<?, ?it/s]

In [23]:
# Pair every requested id with its download flag (1 = file on disk,
# 0 = request failed); both lists are in the same order.
df_credits_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': credits_api_success_list})


In [24]:
# persist the per-id download status (row index is not written)
df_credits_api_download.to_csv(
    'data/TMD-api/credits-api-download-success.csv', index=False)


In [25]:
# get list of all json files in credits folder
path = 'data/TMD-api/credits'
# Keep only *.json entries (anything else in the folder would break the
# json.load step later) and sort them, since os.listdir returns entries in
# arbitrary order.
credits_json_files = sorted(
    name for name in os.listdir(path) if name.endswith('.json'))


In [26]:
# number of entries found in the credits folder
len(credits_json_files)


33618

In [27]:
# folder the credits JSON files are read from in the next cell
folder_name = 'data/TMD-api/credits'


In [28]:
# Collect one normalized frame per JSON file and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
credits_frames = []

for file in tqdm(credits_json_files):
    file_path = generate_read_file_path(file, folder_name)
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as json_data:
        data = json.load(json_data)
    credits_frames.append(pd.json_normalize(data))

# pd.concat raises on an empty list, so fall back to an empty frame.
if credits_frames:
    df_credits = pd.concat(credits_frames)
else:
    df_credits = pd.DataFrame()


  0%|          | 0/33618 [00:00<?, ?it/s]

In [29]:
# row count of the combined credits frame
len(df_credits)


33618

In [30]:
# preview the first rows of the combined credits
df_credits.head()


Unnamed: 0,id,cast,crew
0,100,"[{'adult': False, 'gender': 2, 'id': 973, 'kno...","[{'adult': False, 'gender': 2, 'id': 960, 'kno..."
0,100017,"[{'adult': False, 'gender': 2, 'id': 5202, 'kn...","[{'adult': False, 'gender': 1, 'id': 2338, 'kn..."
0,100032,"[{'adult': False, 'gender': 1, 'id': 87038, 'k...","[{'adult': False, 'gender': 2, 'id': 36116, 'k..."
0,100034,"[{'adult': False, 'gender': 0, 'id': 1022808, ...","[{'adult': False, 'gender': 2, 'id': 99005, 'k..."
0,100038,[],[]


In [31]:
# write the combined credits to CSV; index=False leaves out the row index
df_credits.to_csv('data/credits.csv', index=False)
