In [105]:
import pandas as pd
import os
import requests
import json
import time

In [106]:
def get_token():
    with open("token.json", "r") as f:
        data = json.load(f)
    return data["token"]

In [107]:
TOKEN = get_token()

In [108]:
def api_request(url):
    headers = {
        "accept": "application/json",
        "Authorization": TOKEN
    }
    
    cnt = 0
    while cnt <= 5:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == requests.codes.ok:
                return response.text
        except requests.exceptions.RequestException as e:
            time.sleep(0.5)
            cnt += 1
    raise Exception("request failed!")

In [109]:
def get_tmdb_info(imdbId):
    data_raw = api_request(f"https://api.themoviedb.org/3/find/{imdbId}?external_source=imdb_id")
    data = json.loads(data_raw)
    try:
        tmdbId = data['movie_results'][0]['id']
    except:
        return (0, 0, 0, 0, 0)
    
    data_raw = api_request(f"https://api.themoviedb.org/3/movie/{tmdbId}?language=en-US") 
    data = json.loads(data_raw)
    try:
        print(data['original_title'])
        
        adult = data['adult']
        budget = data['budget']
        revenue = data['revenue']
        voteAverage = data['vote_average']
        voteCount = data['vote_count']
    except:
        return (0, 0, 0, 0, 0)
    
    return adult, budget, revenue, voteAverage, voteCount

In [110]:
get_tmdb_info("tt0152267")

(0, 0, 0, 0, 0)

In [111]:
# do IMDb stuff
imdb_ratings = pd.read_csv(f'raw/title.ratings.tsv', sep='\t', na_values="\\N")
imdb_titles = pd.read_csv(f'raw/title.basics.tsv', sep='\t', na_values="\\N")

imdb_merged = pd.merge(
    imdb_titles[imdb_titles['titleType'] == 'movie'],
    imdb_ratings,
    on='tconst',
).fillna({'averageRating': -1, 'numVotes': -1, 'runtimeMinutes': -1, 'genres': ''})

  imdb_titles = pd.read_csv(f'raw/title.basics.tsv', sep='\t', na_values="\\N")


In [112]:
imdb_merged = imdb_merged[
    (imdb_merged['averageRating'] != -1) &
    (imdb_merged["numVotes"] != -1) &
    (imdb_merged["runtimeMinutes"] != 1)
]

In [113]:
imdb_merged['startYear'] = imdb_merged['startYear'].astype(str)
imdb_merged['startYear'] = imdb_merged['startYear'].str[:4]

In [114]:
imdb_merged = imdb_merged.rename(columns={"tconst": "imdbId", "averageRating": "imdbScore", "numVotes": "imdbNumVotes"})

In [136]:
imdb_merged.head()

Unnamed: 0,imdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,imdbScore,imdbNumVotes
0,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894,,45.0,Romance,5.4,218
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897,,100.0,"Documentary,News,Sport",5.3,549
2,tt0000502,movie,Bohemios,Bohemios,0.0,1905,,100.0,,3.8,20
3,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906,,70.0,"Action,Adventure,Biography",6.0,969
4,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907,,90.0,Drama,5.6,30


In [177]:
# Rotten Tomatoes stuff
reviews_df = pd.read_csv(f"raw/rotten_tomatoes_movie_reviews.csv")
movies_df = pd.read_csv(f"raw/rotten_tomatoes_movies.csv")

tomatoes_merged = pd.merge(
    movies_df[['id', 'title', 'audienceScore', 'tomatoMeter', 'releaseDateTheaters']],
    reviews_df[['id', 'creationDate', 'reviewState']],
    on='id'
).fillna({'tomatoMeter': -1, 'audienceScore': -1, 'releaseDateTheaters': -1})

In [178]:
tomatoes_merged['releaseYear'] = tomatoes_merged['releaseDateTheaters'].str[:4]

In [179]:
tomatoes_merged = tomatoes_merged[
    (tomatoes_merged['tomatoMeter'] != -1) &
    (tomatoes_merged['audienceScore'] != -1) &
    (tomatoes_merged['releaseDateTheaters'] != -1)
]

In [180]:
# Convert reviewState to numerical values
tomatoes_merged["score"] = tomatoes_merged["reviewState"].map({"fresh": 10, "rotten": 0})

# Extract year from creationDate
tomatoes_merged["year"] = pd.to_datetime(tomatoes_merged["creationDate"]).dt.year

# Group by id and year, compute the average score and count reviews
yearly_data = tomatoes_merged.groupby(["id", "year"]).agg(
    avg_score=("score", "mean"),
    num_reviews=("score", "count")
).reset_index()

# Round scores and convert to integers
yearly_data["avg_score"] = yearly_data["avg_score"].round().astype(int)

# Convert to dictionary format
score_dicts = yearly_data.groupby("id").apply(
    lambda x: json.dumps(dict(zip(x["year"], x["avg_score"])), separators=(",", ":"))
).reset_index()

review_count_dicts = yearly_data.groupby("id").apply(
    lambda x: json.dumps(dict(zip(x["year"], x["num_reviews"])), separators=(",", ":"))
).reset_index()

# Compute total number of reviews for each movie
total_reviews = tomatoes_merged.groupby("id")["score"].count().reset_index()
total_reviews.rename(columns={"score": "rottenTomatoesNumVotes"}, inplace=True)

# Merge with unique movie data
unique_movies = tomatoes_merged.drop_duplicates(subset=["id"])[
    ["id", "title", "audienceScore", "tomatoMeter", "releaseDateTheaters", "releaseYear"]
]

tomatoes_new = unique_movies.merge(score_dicts, on="id").merge(review_count_dicts, on="id").merge(total_reviews, on="id")
tomatoes_new.rename(columns={0: "yearly_scores", 1: "yearlyNumReviews"}, inplace=True)

  score_dicts = yearly_data.groupby("id").apply(
  review_count_dicts = yearly_data.groupby("id").apply(


In [181]:
tomatoes_new = tomatoes_new.rename(columns={"0_x": "rottenTomatoesReviews", "0_y": "rottenTomatoesReviewsNumVotes"})

In [182]:
tomatoes_new.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,releaseDateTheaters,releaseYear,rottenTomatoesReviews,rottenTomatoesReviewsNumVotes,rottenTomatoesNumVotes
0,adrift_2018,Adrift,65.0,69.0,2018-06-01,2018,"{""2018"":0,""2019"":7,""2020"":5,""2021"":5,""2022"":10}","{""2018"":4,""2019"":19,""2020"":11,""2021"":4,""2022"":2}",40
1,1035316-born_to_kill,Born to Kill,74.0,83.0,1947-04-30,1947,"{""2003"":0,""2005"":10,""2006"":10,""2016"":10,""2020""...","{""2003"":1,""2005"":1,""2006"":2,""2016"":1,""2020"":1}",6
2,1221483-paa,Paa,67.0,50.0,2009-12-04,2009,"{""2009"":6,""2017"":0,""2019"":0,""2020"":10}","{""2009"":5,""2017"":1,""2019"":1,""2020"":1}",8
3,sarah_palin_you_betcha,Sarah Palin: You Betcha!,61.0,32.0,2011-09-30,2011,"{""2011"":3}","{""2011"":31}",31
4,a_state_of_mind_2005,A State of Mind,92.0,89.0,2005-08-10,2005,"{""2005"":10,""2006"":10,""2007"":3}","{""2005"":22,""2006"":2,""2007"":3}",27


In [183]:
filtered_df = pd.merge(
    imdb_merged,
    tomatoes_new,
    left_on=['primaryTitle', 'startYear'],
    right_on=['title', 'releaseYear'],
    how='inner'
)

In [184]:
filtered_df = filtered_df.drop(columns=['primaryTitle', 'releaseYear', 'titleType', 'isAdult', 'endYear', 'id', 'originalTitle'])

In [185]:
filtered_df = filtered_df.rename(columns={"tconst": "imdbId"})

In [186]:
filtered_df.head()

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,tomatoMeter,releaseDateTheaters,rottenTomatoesReviews,rottenTomatoesReviewsNumVotes,rottenTomatoesNumVotes
0,tt0004707,1914,82.0,Comedy,6.2,3791,Tillie's Punctured Romance,42.0,90.0,1914-11-14,"{""2007"":10,""2008"":10,""2009"":5,""2010"":10,""2020""...","{""2007"":2,""2008"":1,""2009"":2,""2010"":1,""2020"":3,...",10
1,tt0004972,1915,195.0,"Drama,War",6.1,27113,The Birth of a Nation,54.0,91.0,1915-03-03,"{""2000"":10,""2001"":10,""2002"":10,""2003"":8,""2004""...","{""2000"":1,""2001"":2,""2002"":6,""2003"":5,""2004"":5,...",42
2,tt0005078,1915,59.0,"Drama,Romance",6.5,2913,The Cheat,47.0,90.0,1915-12-12,"{""2004"":10,""2005"":10,""2006"":0,""2008"":10,""2019""...","{""2004"":1,""2005"":2,""2006"":1,""2008"":1,""2019"":4,...",10
3,tt0005960,1915,72.0,"Biography,Crime,Drama",6.8,1484,The Regeneration,69.0,100.0,1915-09-13,"{""2005"":10,""2013"":10}","{""2005"":1,""2013"":2}",3
4,tt0006864,1916,163.0,"Drama,History",7.7,17240,Intolerance,77.0,97.0,1916-09-05,"{""2000"":10,""2002"":7,""2003"":10,""2004"":10,""2005""...","{""2000"":2,""2002"":3,""2003"":4,""2004"":6,""2005"":2,...",38


In [124]:
filtered_df.to_csv("temp.csv")

In [125]:
# TMDB STUFF

In [None]:
# tmdb = filtered_df[['imdbId']]

# print(f"fetching api data for {tmdb.shape[0]} movies")
# tmdb['raw'] = tmdb['imdbId'].apply(get_tmdb_info)

# tmdb['isAdult'], tmdb['budget'], tmdb['revenue'], tmdb['tmdbAverageScore'], tmdb['tmdbNumVotes'] = zip(*tmdb.raw)
# tmdb = tmdb.drop(columns=['raw'])

# tmdb = tmdb[
#     (tmdb['budget'] != 0) &
#     (tmdb['revenue'] != 0)
# ]

# print(f"tmdb shape: {tmdb.shape}")
# tmdb.head()
# tmdb.to_csv("tmbd.csv")

tmdb = pd.read_csv("processed/tmbd.csv")

fetching api data for 12268 movies
Tillie's Punctured Romance
The Birth of a Nation
The Cheat
The Regeneration
Intolerance: Love's Struggle Throughout the Ages
Shoes
Tarzan of the Apes
Blind Husbands
Broken Blossoms
True Heart Susie
Dr. Jekyll and Mr. Hyde
The Last of the Mohicans
The Penalty
The Saphead
Way Down East
Within Our Gates
The Blot
The Kid
Orphans of the Storm
The Sheik
The Three Musketeers
Beyond the Rocks
Blood and Sand
Foolish Wives
Grandma's Boy
Nanook of the North
Sherlock Holmes
The Hunchback of Notre Dame
Our Hospitality
The Ten Commandments
America
The Iron Horse
Der letzte Mann
The Marriage Circle
The Navigator
Стачка
The Thief of Bagdad
Die Abenteuer des Prinzen Achmed
The Big Parade
Body and Soul
The Freshman
Go West
The Gold Rush
Grass: A Nation's Battle for Life
Greed
Lady Windermere's Fan
The Lost World
The Phantom of the Opera
Sally of the Sawdust
Seven Chances
The Unholy Three
Faust – Eine deutsche Volkssage
Flesh and the Devil
Metropolis
Moana
The Cat and t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmdb['raw'] = tmdb['imdbId'].apply(get_tmdb_info)


In [187]:
final_df = pd.merge(
    filtered_df,
    tmdb,
    how="inner",
    on="imdbId",
)

In [188]:
print(f"final shape: {final_df.shape}")
final_df.head()

final shape: (5607, 18)


Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,tomatoMeter,releaseDateTheaters,rottenTomatoesReviews,rottenTomatoesReviewsNumVotes,rottenTomatoesNumVotes,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes
0,tt0004972,1915,195.0,"Drama,War",6.1,27113,The Birth of a Nation,54.0,91.0,1915-03-03,"{""2000"":10,""2001"":10,""2002"":10,""2003"":8,""2004""...","{""2000"":1,""2001"":2,""2002"":6,""2003"":5,""2004"":5,...",42,False,100000,11000000,6.0,542
1,tt0005078,1915,59.0,"Drama,Romance",6.5,2913,The Cheat,47.0,90.0,1915-12-12,"{""2004"":10,""2005"":10,""2006"":0,""2008"":10,""2019""...","{""2004"":1,""2005"":2,""2006"":1,""2008"":1,""2019"":4,...",10,False,17311,137365,5.9,64
2,tt0006864,1916,163.0,"Drama,History",7.7,17240,Intolerance,77.0,97.0,1916-09-05,"{""2000"":10,""2002"":7,""2003"":10,""2004"":10,""2005""...","{""2000"":2,""2002"":3,""2003"":4,""2004"":6,""2005"":2,...",38,False,385907,1750000,7.1,341
3,tt0012349,1921,68.0,"Comedy,Drama,Family",8.2,139644,The Kid,95.0,100.0,1921-01-21,"{""2003"":10,""2004"":10,""2005"":10,""2006"":10,""2008...","{""2003"":1,""2004"":10,""2005"":3,""2006"":3,""2008"":3...",50,False,250000,2500000,8.2,2137
4,tt0013140,1922,117.0,"Drama,Thriller",7.0,4163,Foolish Wives,70.0,89.0,1922-01-11,"{""2000"":10,""2003"":10,""2004"":10,""2005"":10,""2006...","{""2000"":1,""2003"":1,""2004"":3,""2005"":2,""2006"":1,...",18,False,1100000,400200,6.6,90


In [None]:
final_df.to_csv('processed/movies.csv')