In [1]:
# imports
import json
import os
import urllib
import urllib.request
from zipfile import ZipFile

import pandas as pd


In [2]:
# function to download the data in csv format
def download_data(link, folder):
    zip_file_name = link.rsplit('/', 1)[-1]
    zip_full_path = folder + '/' + zip_file_name
    urllib.request.urlretrieve(link, filename=zip_full_path)

    with ZipFile(zip_full_path, 'r') as z_object:
        z_object.extractall(path=folder + '/')


In [3]:
link = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
folder = 'data'
download_data(link, folder)


In [4]:
def read_secrets() -> dict:
    filename = os.path.join('secrets.json')
    try:
        with open(filename, mode='r') as f:
            return json.loads(f.read())
    except FileNotFoundError:
        return {}


In [5]:
# Load credentials once. read_secrets() returns {} when secrets.json is absent,
# in which case the later secrets['api_key'] lookup raises KeyError.
secrets = read_secrets()


In [6]:
# Movie table read from the data folder (presumably unpacked from ml-25m.zip
# by the download cell — confirm the archive layout).
df_movies = pd.read_csv('data/ml-25m/movies.csv')


In [7]:
# Ids that drive the per-movie metadata download below.
# NOTE(review): these are the movieId values of movies.csv, and the download
# loop puts them straight into the api.themoviedb.org /3/movie/{id} URL.
# Presumably that endpoint expects TMDB ids (MovieLens ships a links.csv that
# is said to map movieId -> tmdbId) — confirm the ids really line up.
movie_id_list = df_movies['movieId'].to_list()


In [8]:
# function to generate the link to download metadata json file
def generate_link(movie_id, api_key):
    link = 'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}'.format(
        movie_id=movie_id, api_key=secrets['api_key'])
    return link


In [9]:
# function to generate file path to read the downloaded json file
def generate_file_path(movie_id, folder):
    file_path = folder + '/' + str(movie_id) + '.json'
    return file_path
    

In [10]:
# download metadata from TMD for each movie in movies.csv
folder_name = 'data/TMD-api'
api_success_list = []
for movie_id in movie_id_list:
    link = generate_link(movie_id, secrets['api_key'])
    file_path = generate_file_path(movie_id, folder_name)

    if not os.path.isfile(file_path):
        # file does not exist
        try:
            urllib.request.urlretrieve(link, filename=file_path)
            api_success_list.append(1)
        except:
            api_success_list.append(0)
    else:
        # file exists
        api_success_list.append(1)


In [11]:
df_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': api_success_list})


In [12]:
# Persist the per-movie download status; the DataFrame index is not written.
df_api_download.to_csv('data/api-download-success.csv', index=False)


In [13]:
# get list of all json files
path = 'data/TMD-api'
json_files = os.listdir(path)


In [14]:
# Number of entries collected in json_files.
len(json_files)


33618

In [15]:
# function to generate file path for each json file
def generate_file_path(file, folder):
    file_path = folder + '/' + file
    return file_path


In [16]:
# Directory holding the per-movie JSON files (same value as `path` used for the listing).
folder_name = 'data/TMD-api'


In [17]:
df_movies_metadata = pd.DataFrame()

for file in json_files:
    file_path = generate_file_path(file, folder_name)
    with open(file_path, 'r') as json_data:
        data = json.load(json_data)
        df_new = pd.json_normalize(data)
        df_movies_metadata = pd.concat([df_movies_metadata, df_new])


In [18]:
# Row count of the combined metadata table.
len(df_movies_metadata)


33618

In [19]:
# Preview the first rows of the combined metadata table.
df_movies_metadata.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,Released,A Disgrace to Criminals Everywhere.,"Lock, Stock and Two Smoking Barrels",False,8.1,5798,,,,
0,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,Released,,Punish Me,False,4.6,16,,,,
0,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,Released,"There is no safe harbor, there is no escape......",The Great Los Angeles Earthquake,False,6.9,13,,,,
0,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,Released,,The Worst Horror Movie Ever Made,False,3.0,8,,,,
0,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,Released,,Meshuggah - Nothing,False,4.0,2,,,,


In [20]:
# Persist the combined metadata table; the DataFrame index is not written.
df_movies_metadata.to_csv('data/movies-metadata.csv', index=False)
