In [15]:
import numpy as np
import pandas as pd
import requests
import json
import re
from datetime import datetime as dt
from tqdm import tqdm

In [2]:
api_key = 'api_key=69d03443c6dda74d5729a2be5debd168'
tmdb_api_url = 'https://api.themoviedb.org/3/'

# Genres

In [3]:
get_genres = 'genre/movie/list'
response = requests.get(tmdb_api_url + '?'.join([get_genres, api_key]))
genres = response.json()['genres']

In [4]:
genres_ls = [d['name'] for d in genres]
genres_ls.append('Foreign')
genres_ls = sorted(genres_ls)
n_genres = len(genres_ls)

# Movies and Reviews

In [5]:
tmdb = pd.read_csv('data/tmdb_5000_movies.csv')

tmdb['goodness'] = tmdb['vote_average'] * tmdb['vote_count'] / tmdb['vote_count'].max()
# threshold -> 80th percentile
threshold = np.around(np.percentile(tmdb['goodness'], 80), 2)
# encoding 1 <-> goodness >= threshold
tmdb['goodness_enc'] = 0
tmdb.loc[tmdb['goodness'] >= threshold, 'goodness_enc'] = 1

In [6]:
cols_to_keep = ['id', 'title', 'status', 'release_date', 'budget', 'revenue',
                'original_language', 'runtime', 'popularity', 'goodness_enc']
# keeping only movies that have a positive vote_count
tmdb_clean = tmdb.loc[tmdb['vote_count'] > 0, cols_to_keep].dropna(axis=0).reset_index(drop=True)

In [7]:
reviews = pd.Series(name='reviews')
df_genres = pd.DataFrame(columns=genres_ls)

In [8]:
for i, movie_id in enumerate(tqdm(tmdb_clean['id'])):
    get_reviews = f'movie/{movie_id}/reviews'
    response_reviews = requests.get(tmdb_api_url + '?'.join([get_reviews, api_key]))
    if response_reviews.status_code == 200:
        reviews.loc[i] = [d['content'] for d in response_reviews.json()['results']]
    else:
        reviews.loc[i] = []
    
    df_genres.loc[i] = np.zeros(n_genres, dtype=int)
    movie_genres = re.findall(r'\"name\": \"([A-Za-z\s]+)\"', tmdb['genres'][i])
    df_genres.loc[i, movie_genres] = 1

100%|██████████| 4740/4740 [1:09:08<00:00,  1.14it/s]  


In [9]:
display(tmdb_clean)
display(df_genres)
display(reviews)

Unnamed: 0,id,title,status,release_date,budget,revenue,original_language,runtime,popularity,goodness_enc
0,19995,Avatar,Released,2009-12-10,237000000,2787965087,en,162.0,150.437577,1
1,285,Pirates of the Caribbean: At World's End,Released,2007-05-19,300000000,961000000,en,169.0,139.082615,1
2,206647,Spectre,Released,2015-10-26,245000000,880674609,en,148.0,107.376788,1
3,49026,The Dark Knight Rises,Released,2012-07-16,250000000,1084939099,en,165.0,112.312950,1
4,49529,John Carter,Released,2012-03-07,260000000,284139100,en,132.0,43.926995,1
5,559,Spider-Man 3,Released,2007-05-01,258000000,890871626,en,139.0,115.699814,1
6,38757,Tangled,Released,2010-11-24,260000000,591794936,en,100.0,48.681969,1
7,99861,Avengers: Age of Ultron,Released,2015-04-22,280000000,1405403694,en,141.0,134.279229,1
8,767,Harry Potter and the Half-Blood Prince,Released,2009-07-07,250000000,933959197,en,153.0,98.885637,1
9,209112,Batman v Superman: Dawn of Justice,Released,2016-03-23,250000000,873260194,en,151.0,155.790452,1


Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


0       [Avatar is an overwhelming, immersive spectacl...
1                                                      []
2       [<a href="http://www.cutprintfilm.com/reviews/...
3       [I felt like this was a tremendous end to Nola...
4       [Totally forgettable and full of stereotypes, ...
5       [After the success of the first two Spider-Man...
6       [Tangled main characters really remember Aladd...
7       [There is a lot going on in this movie but it ...
8       [Hormones over excitement as part six is merel...
9       [Awesome moview. Best Action sequence.\r\n\r\n...
10                                                     []
11      [**It's a shame that they did this to the Bond...
12                                                     []
13      [Wrong Brother.\r\n\r\nThe early signs were no...
14      [Overall this movie is great! It has flaws but...
15                                                     []
16      [With a movie like this you wonder how all of ...
17      [More 

In [53]:
df = pd.concat([tmdb_clean, df_genres, reviews], axis=1).sort_values('id').reset_index(drop=True)

# turning dtype of release_date to datetime and extracting only the year
df['release_date'] = pd.to_datetime(df['release_date'], infer_datetime_format=True).dt.year
df = df.rename(columns={'release_date': 'release_year'})

# keeping only movies that have been released
df = df[df['status'] == 'Released'].reset_index(drop=True)
details = df.drop(['status', 'reviews', 'Foreign', 'TV Movie'], axis=1)

details.to_csv('data/details.csv', index=False)

ValueError: The column label 'id' is not unique.

In [59]:
reviews = df[['id', 'title', 'reviews']]

reviews.to_csv('data/reviews.csv', index=False)