In [1]:
# imports
import concurrent.futures
import itertools
import json
import os
import urllib
from zipfile import ZipFile

import numpy as np
import pandas as pd
from tqdm.auto import tqdm


In [2]:
# function to download the data in csv format
def download_data(link, folder):
    zip_file_name = link.rsplit('/', 1)[-1]
    zip_full_path = folder + '/' + zip_file_name
    urllib.request.urlretrieve(link, filename=zip_full_path)

    with ZipFile(zip_full_path, 'r') as z_object:
        z_object.extractall(path=folder + '/')


In [3]:
link = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
folder = 'data'
download_data(link, folder)


In [4]:
def read_secrets() -> dict:
    filename = os.path.join('secrets.json')
    try:
        with open(filename, mode='r') as f:
            return json.loads(f.read())
    except FileNotFoundError:
        return {}


In [5]:
secrets = read_secrets()


In [6]:
df_movies = pd.read_csv('data/ml-25m/movies.csv')


In [7]:
movie_id_list = df_movies['movieId'].to_list()


In [8]:
# function to generate the link to download metadata json file
def generate_metadata_link(movie_id, api_key):
    link = 'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}'.format(
        movie_id=movie_id, api_key=secrets['api_key'])
    return link


In [9]:
# function to generate file path to write the downloaded json file to
def generate_write_file_path(movie_id, folder):
    file_path = folder + '/' + str(movie_id) + '.json'
    return file_path
    

In [10]:
folder_name = 'data/TMD-api/movies'
api_success_list = []


In [11]:
# function that checks if json file exists, else try to retrieve it from TMD api
def do_operation(movie_id, folder_name=folder_name):
    try:
        link = generate_metadata_link(movie_id, secrets['api_key'])
        file_path = generate_write_file_path(movie_id, folder_name)

        if not os.path.isfile(file_path):
            # file does not exist
            try:
                urllib.request.urlretrieve(link, filename=file_path)
                api_success_list.append(1)
            except:
                api_success_list.append(0)
        else:
            # file exists
            api_success_list.append(1)
    except:
        print(str(movie_id) + 'failed')
        

In [12]:
# function to perform above function for a list of movie ids
def do_multiple_operations(movie_id_many, folder_name=folder_name):
    for movie_id in movie_id_many:
        do_operation(movie_id, folder_name)
        

In [13]:
# function to create groups of n size
def grouper(iterable, n, *, incomplete='fill', fillvalue=None):
    "Collect data into non-overlapping fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, fillvalue='x') --> ABC DEF Gxx
    # grouper('ABCDEFG', 3, incomplete='strict') --> ABC DEF ValueError
    # grouper('ABCDEFG', 3, incomplete='ignore') --> ABC DEF
    args = [iter(iterable)] * n
    if incomplete == 'fill':
        return itertools.zip_longest(*args, fillvalue=fillvalue)
    if incomplete == 'strict':
        return zip(*args, strict=True)
    if incomplete == 'ignore':
        return zip(*args)
    else:
        raise ValueError('Expected fill, strict, or ignore')


In [14]:
# create executor with 16 threads
executor = concurrent.futures.ThreadPoolExecutor(16)

# perform operations for batches
futures = [executor.submit(do_multiple_operations(group), group)
           for group in grouper(movie_id_list, 2000)]
concurrent.futures.wait(futures)


DoneAndNotDoneFutures(done={<Future at 0x20f9619f010 state=finished raised TypeError>, <Future at 0x20f96828040 state=finished raised TypeError>, <Future at 0x20f9619e860 state=finished raised TypeError>, <Future at 0x20f9619f0a0 state=finished raised TypeError>, <Future at 0x20f96828130 state=finished raised TypeError>, <Future at 0x20f9619f940 state=finished raised TypeError>, <Future at 0x20f9619f970 state=finished raised TypeError>, <Future at 0x20f9619e9e0 state=finished raised TypeError>, <Future at 0x20f96828220 state=finished raised TypeError>, <Future at 0x20f96828a60 state=finished raised TypeError>, <Future at 0x20f9619f2e0 state=finished raised TypeError>, <Future at 0x20f96828370 state=finished raised TypeError>, <Future at 0x20f9619fb80 state=finished raised TypeError>, <Future at 0x20f968283a0 state=finished raised TypeError>, <Future at 0x20f9619ebc0 state=finished raised TypeError>, <Future at 0x20f93cf1bd0 state=finished raised TypeError>, <Future at 0x20f9619ebf0 sta

In [15]:
api_success_list = api_success_list[:len(movie_id_list)]


In [16]:
df_movies_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': api_success_list})


In [17]:
df_movies_api_download.to_csv(
    'data/TMD-api/movies-api-download-success-multiprocess.csv', index=False)


In [18]:
# get list of all json files in movies folder
path = 'data/TMD-api/movies'
movies_json_files = os.listdir(path)


In [19]:
len(movies_json_files)


33618

In [20]:
# function to generate file path to read the json file from
def generate_read_file_path(file, folder):
    file_path = folder + '/' + file
    return file_path


In [21]:
folder_name = 'data/TMD-api/movies'


In [22]:
df_movies_metadata = pd.DataFrame()

for file in tqdm(movies_json_files):
    file_path = generate_read_file_path(file, folder_name)
    with open(file_path, 'r') as json_data:
        data = json.load(json_data)
        df_new = pd.json_normalize(data)
        df_movies_metadata = pd.concat([df_movies_metadata, df_new])


  0%|          | 0/33618 [00:00<?, ?it/s]

In [23]:
len(df_movies_metadata)


33618

In [24]:
df_movies_metadata.head()


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/cXQH2u7wUIX1eoIdEj51kHXoWhX.jpg,,1350000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.universalstudiosentertainment.com/l...,100,tt0120735,en,"Lock, Stock and Two Smoking Barrels",...,Released,A Disgrace to Criminals Everywhere.,"Lock, Stock and Two Smoking Barrels",False,8.1,5798,,,,
0,False,/cbTvuGya7E1PnL8t95AWzumjqHg.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,100017,tt0488903,de,Verfolgt,...,Released,,Punish Me,False,4.6,16,,,,
0,False,,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,100032,tt0099137,en,The Great Los Angeles Earthquake,...,Released,"There is no safe harbor, there is no escape......",The Great Los Angeles Earthquake,False,6.9,13,,,,
0,False,,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,100034,tt0462634,en,The Worst Horror Movie Ever Made,...,Released,,The Worst Horror Movie Ever Made,False,3.0,8,,,,
0,False,,,0,"[{'id': 10402, 'name': 'Music'}]",,100038,,en,Meshuggah - Nothing,...,Released,,Meshuggah - Nothing,False,4.0,2,,,,


In [25]:
df_movies_metadata.to_csv('data/movies-metadata.csv', index=False)


In [26]:
# function to generate the link to download credits json file
def generate_credits_link(movie_id, api_key):
    link = 'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}'.format(
        movie_id=movie_id, api_key=secrets['api_key'])
    return link


In [27]:
folder_name = 'data/TMD-api/credits'
api_success_list = []


In [28]:
# create executor with 16 threads
executor = concurrent.futures.ThreadPoolExecutor(16)

# perform operations for batches
futures = [executor.submit(do_multiple_operations(group), group)
           for group in grouper(movie_id_list, 2000)]
concurrent.futures.wait(futures)


DoneAndNotDoneFutures(done={<Future at 0x20f986d7820 state=finished raised TypeError>, <Future at 0x20f986d5030 state=finished raised TypeError>, <Future at 0x20f986d6050 state=finished raised TypeError>, <Future at 0x20f986d7850 state=finished raised TypeError>, <Future at 0x20f986d6890 state=finished raised TypeError>, <Future at 0x20f986d60e0 state=finished raised TypeError>, <Future at 0x20f986d4940 state=finished raised TypeError>, <Future at 0x20f986d49d0 state=finished raised TypeError>, <Future at 0x20f986d69e0 state=finished raised TypeError>, <Future at 0x20f986d6a10 state=finished raised TypeError>, <Future at 0x20f986d7220 state=finished raised TypeError>, <Future at 0x20f986d6ad0 state=finished raised TypeError>, <Future at 0x20f986d5b70 state=finished raised TypeError>, <Future at 0x20f986d6380 state=finished raised TypeError>, <Future at 0x20f986d63e0 state=finished raised TypeError>, <Future at 0x20f986d5420 state=finished raised TypeError>, <Future at 0x20f9619fc40 sta

In [29]:
api_success_list = api_success_list[:len(movie_id_list)]


In [30]:
df_credits_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': api_success_list})


In [31]:
df_credits_api_download.to_csv(
    'data/TMD-api/credits-api-download-success.csv', index=False)


In [32]:
# get list of all json files in credits folder
path = 'data/TMD-api/credits'
credits_json_files = os.listdir(path)


In [33]:
len(credits_json_files)


33618

In [34]:
folder_name = 'data/TMD-api/credits'


In [35]:
df_credits = pd.DataFrame()

for file in tqdm(credits_json_files):
    file_path = generate_read_file_path(file, folder_name)
    with open(file_path, 'r') as json_data:
        data = json.load(json_data)
        df_new = pd.json_normalize(data)
        df_credits = pd.concat([df_credits, df_new])


  0%|          | 0/33618 [00:00<?, ?it/s]

In [36]:
len(df_credits)


33618

In [37]:
df_credits.head()


Unnamed: 0,id,cast,crew
0,100,"[{'adult': False, 'gender': 2, 'id': 973, 'kno...","[{'adult': False, 'gender': 2, 'id': 960, 'kno..."
0,100017,"[{'adult': False, 'gender': 2, 'id': 5202, 'kn...","[{'adult': False, 'gender': 1, 'id': 2338, 'kn..."
0,100032,"[{'adult': False, 'gender': 1, 'id': 87038, 'k...","[{'adult': False, 'gender': 2, 'id': 36116, 'k..."
0,100034,"[{'adult': False, 'gender': 0, 'id': 1022808, ...","[{'adult': False, 'gender': 2, 'id': 99005, 'k..."
0,100038,[],[]


In [38]:
df_credits.to_csv('data/credits.csv', index=False)


In [39]:
# function to generate the link to download keywords json file
def generate_keywords_link(movie_id, api_key):
    link = 'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}'.format(
        movie_id=movie_id, api_key=secrets['api_key'])
    return link


In [40]:
folder_name = 'data/TMD-api/keywords'
api_success_list = []


In [41]:
# create executor with 16 threads
executor = concurrent.futures.ThreadPoolExecutor(16)

# perform operations for batches
futures = [executor.submit(do_multiple_operations(group), group)
           for group in grouper(movie_id_list, 2000)]
concurrent.futures.wait(futures)


DoneAndNotDoneFutures(done={<Future at 0x20fa16b5810 state=finished raised TypeError>, <Future at 0x20f9619e830 state=finished raised TypeError>, <Future at 0x20fa16b58a0 state=finished raised TypeError>, <Future at 0x20fa16b58d0 state=finished raised TypeError>, <Future at 0x20fa16b60e0 state=finished raised TypeError>, <Future at 0x20fa16b59c0 state=finished raised TypeError>, <Future at 0x20fa16b49d0 state=finished raised TypeError>, <Future at 0x20fa16b59f0 state=finished raised TypeError>, <Future at 0x20fa16b5a20 state=finished raised TypeError>, <Future at 0x20fa16b5a80 state=finished raised TypeError>, <Future at 0x20fa16b5ab0 state=finished raised TypeError>, <Future at 0x20fa16b5b40 state=finished raised TypeError>, <Future at 0x20fa16b43a0 state=finished raised TypeError>, <Future at 0x20fa16b5bd0 state=finished raised TypeError>, <Future at 0x20fa16b43d0 state=finished raised TypeError>, <Future at 0x20fa16b5c00 state=finished raised TypeError>, <Future at 0x20fa16b4490 sta

In [42]:
api_success_list = api_success_list[:len(movie_id_list)]


In [43]:
df_keywords_api_download = pd.DataFrame(
    {'movie_id': movie_id_list, 'api_success': api_success_list})


In [44]:
df_keywords_api_download.to_csv(
    'data/TMD-api/keywords-api-download-success.csv', index=False)


In [45]:
# get list of all json files in keywords folder
path = 'data/TMD-api/keywords'
keywords_json_files = os.listdir(path)


In [46]:
len(keywords_json_files)


33618

In [47]:
folder_name = 'data/TMD-api/keywords'


In [48]:
df_keywords = pd.DataFrame()

for file in tqdm(keywords_json_files):
    file_path = generate_read_file_path(file, folder_name)
    with open(file_path, 'r') as json_data:
        data = json.load(json_data)
        df_new = pd.json_normalize(data)
        df_keywords = pd.concat([df_keywords, df_new])


  0%|          | 0/33618 [00:00<?, ?it/s]

In [49]:
len(df_keywords)


33618

In [50]:
df_keywords.head()


Unnamed: 0,id,keywords
0,100,"[{'id': 502, 'name': 'ambush'}, {'id': 567, 'n..."
0,100017,"[{'id': 2843, 'name': 'fetishism'}, {'id': 326..."
0,100032,"[{'id': 2708, 'name': 'hitman'}, {'id': 3521, ..."
0,100034,[]
0,100038,[]


In [51]:
df_keywords.to_csv('data/keywords.csv', index=False)
