In [39]:
import pandas as pd
import os
import requests
import json
import time
import re
import unicodedata

In [40]:
def get_token():
    with open("token.json", "r") as f:
        data = json.load(f)
    return data["token"]

In [41]:
TOKEN = get_token()

In [42]:
def make_request(url, headers):
    cnt = 0
    while cnt <= 5:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            if response.status_code == requests.codes.ok:
                return response.text
        except:
            time.sleep(0.5)
        cnt += 1
    raise Exception("request failed!")

In [43]:
def get_tmdb_data(imdbId):
    print(imdbId)
    headers = {
        "accept": "application/json",
        "Authorization": TOKEN
    }
    data_raw = make_request(f"https://api.themoviedb.org/3/find/{imdbId}?external_source=imdb_id", headers)
    data = json.loads(data_raw)
    try:
        tmdbId = data['movie_results'][0]['id']
    except:
        return (0, 0, 0, 0, 0, 0, 0, 0)
    
    data_raw = make_request(f"https://api.themoviedb.org/3/movie/{tmdbId}?language=en-US", headers) 
    data = json.loads(data_raw)
    try:
        print(data['original_title'])
        
        productionCountries = ",".join([country['name'] for country in data["production_countries"]])
        studios = ",".join([company['name'] for company in data['production_companies']])
        originalLanguage = data['original_language']
        isAdult = data['adult']
        budget = data['budget']
        revenue = data['revenue']
        voteAverage = data['vote_average']
        voteCount = data['vote_count']
    except:
        return (0, 0, 0, 0, 0, 0, 0, 0)
    
    return productionCountries, studios, originalLanguage, isAdult, budget, revenue, voteAverage, voteCount

In [44]:
def get_letterboxd_data(imdbId):
    print(imdbId)
    headers = {
        "accept": "application/json"
    }
    data_raw = make_request(f"https://letterboxd.com/imdb/{imdbId}", headers)
    
    json_pattern = r'(\{.*?"productionCompany".*?"ratingValue".*?\}\})'
    json_match = re.search(json_pattern, data_raw)
    
    if json_match:
        try:
            json_data = json.loads(json_match.group(1))

            rating = json_data.get("aggregateRating", {}).get("ratingValue")
            numVotes = json_data.get("aggregateRating", {}).get("ratingCount")
            directors = ",".join([director['name'] for director in json_data.get("director")])
            
            return directors, rating, numVotes
        except:
            print(f"json loading failed")
            return (0, 0, 0)
    return (0, 0, 0)

In [45]:
def get_rotten_tomatoes_data(title):
    id = movie_title_to_id(title)
    
    print(f"scraping for movie {title} (id: {id})")
    
    headers = {
        "accept": "application/json"
    }
    
    try:
    
        text = make_request(f"https://www.rottentomatoes.com/m/{id}", headers)
        
        # Regex patterns to capture the average ratings and review count
        audience_rating_pattern = r'"audienceScore":\{"certifiedFresh":".*?","averageRating":"([\d\.]+)"'
        review_count_pattern = r'"criticsScore":\{.*?"reviewCount":(\d+)'

        # Search for the values in the text
        audience_match = re.search(audience_rating_pattern, text)
        review_count_match = re.search(review_count_pattern, text)

        # Extract values if matches are found
        audience_rating = audience_match.group(1) if audience_match else None
        review_count = review_count_match.group(1) if review_count_match else None
        
        audience_rating = str(20 * float(audience_rating))
        
        return id, audience_rating, review_count

    except:
        return 0, 0, 0

In [46]:
def movie_title_to_id(title):
    title = unicodedata.normalize('NFKD', title).encode('ASCII', 'ignore').decode('utf-8')
    title = title.lower()
    title = title.replace('&', 'and')
    title = title.replace('-', ' ')
    title = re.sub(r'[^a-z0-9 ]', '', title)
    title = title.replace(' ', '_')
    return title

In [47]:
get_tmdb_data("tt0152267")

tt0152267


(0, 0, 0, 0, 0, 0, 0, 0)

In [48]:
get_letterboxd_data("tt3907674")

tt3907674


(0, 0, 0)

In [49]:
get_rotten_tomatoes_data("Dune: Part Two")

scraping for movie Dune: Part Two (id: dune_part_two)


('dune_part_two', '94.0', '456')

# IMDb

In [50]:
imdb_ratings = pd.read_csv(f'raw/title.ratings.tsv', sep='\t', na_values="\\N")
imdb_titles = pd.read_csv(f'raw/title.basics.tsv', sep='\t', na_values="\\N")

imdb_merged = pd.merge(
    imdb_titles[imdb_titles['titleType'] == 'movie'],
    imdb_ratings,
    on='tconst',
).fillna({'averageRating': -1, 'numVotes': -1, 'runtimeMinutes': -1, 'genres': ''})

  imdb_titles = pd.read_csv(f'raw/title.basics.tsv', sep='\t', na_values="\\N")


In [51]:
imdb_merged = imdb_merged[
    (imdb_merged['averageRating'] != -1) &
    (imdb_merged["numVotes"] != -1) &
    (imdb_merged["runtimeMinutes"] != 1)
]

print(f"imdb titles total: {imdb_ratings.shape[0]}")
print(f"imdb movies with complete information (rating and runtime): {imdb_merged.shape[0]}")

imdb titles total: 1530505
imdb movies with complete information (rating and runtime): 324690


In [52]:
imdb_merged['startYear'] = imdb_merged['startYear'].astype(str)
imdb_merged['startYear'] = imdb_merged['startYear'].str[:4]

In [53]:
imdb_merged = imdb_merged.rename(columns={"tconst": "imdbId", "averageRating": "imdbScore", "numVotes": "imdbNumVotes"})

In [54]:
imdb_merged.head()

Unnamed: 0,imdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,imdbScore,imdbNumVotes
0,tt0000009,movie,Miss Jerry,Miss Jerry,0.0,1894,,45.0,Romance,5.4,218
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897,,100.0,"Documentary,News,Sport",5.3,549
2,tt0000502,movie,Bohemios,Bohemios,0.0,1905,,100.0,,3.8,20
3,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906,,70.0,"Action,Adventure,Biography",6.0,969
4,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0.0,1907,,90.0,Drama,5.6,30


In [55]:
# IMDb movies from 2024
imdb_2024_movies = imdb_merged[imdb_merged['startYear'] == "2024"].sort_values('imdbNumVotes', ascending=False)[0:200]

In [56]:
imdb_2024_movies = imdb_2024_movies[['primaryTitle', 'startYear']]
imdb_2024_movies['raw'] = imdb_2024_movies['primaryTitle'].apply(get_rotten_tomatoes_data)

scraping for movie Dune: Part Two (id: dune_part_two)
scraping for movie Deadpool & Wolverine (id: deadpool_and_wolverine)
scraping for movie Furiosa: A Mad Max Saga (id: furiosa_a_mad_max_saga)
scraping for movie The Substance (id: the_substance)
scraping for movie Civil War (id: civil_war)
scraping for movie Alien: Romulus (id: alien_romulus)
scraping for movie The Fall Guy (id: the_fall_guy)
scraping for movie Inside Out 2 (id: inside_out_2)
scraping for movie Gladiator II (id: gladiator_ii)
scraping for movie Road House (id: road_house)
scraping for movie Longlegs (id: longlegs)
scraping for movie Twisters (id: twisters)
scraping for movie Joker: Folie à Deux (id: joker_folie_a_deux)
scraping for movie The Beekeeper (id: the_beekeeper)
scraping for movie Carry-On (id: carry_on)
scraping for movie Kingdom of the Planet of the Apes (id: kingdom_of_the_planet_of_the_apes)
scraping for movie Challengers (id: challengers)
scraping for movie A Quiet Place: Day One (id: a_quiet_place_day_

In [57]:
imdb_2024_movies.head()
imdb_2024_movies['id'], imdb_2024_movies['audienceScore'], imdb_2024_movies['rottenTomatoesNumVotes'] = zip(*imdb_2024_movies.raw)
imdb_2024_movies = imdb_2024_movies.drop(columns=['raw'])
imdb_2024_movies = imdb_2024_movies.rename(columns={"primaryTitle": "title", "startYear": "releaseYear"})

In [58]:
imdb_2024_movies.head()

Unnamed: 0,title,releaseYear,id,audienceScore,rottenTomatoesNumVotes
194498,Dune: Part Two,2024,dune_part_two,94.0,456
297617,Deadpool & Wolverine,2024,deadpool_and_wolverine,94.0,414
168336,Furiosa: A Mad Max Saga,2024,furiosa_a_mad_max_saga,86.0,419
207531,The Substance,2024,the_substance,78.0,363
206489,Civil War,2024,0,0.0,0


In [59]:
imdb_2024_movies = imdb_2024_movies[
    (imdb_2024_movies['audienceScore'] != 0) &
    (imdb_2024_movies['rottenTomatoesNumVotes'] != 0)
]

# Rotten Tomatoes

In [60]:
reviews_df = pd.read_csv(f"raw/rotten_tomatoes_movie_reviews.csv")
movies_df = pd.read_csv(f"raw/rotten_tomatoes_movies.csv")

tomatoes_merged = pd.merge(
    movies_df[['id', 'title', 'audienceScore', 'releaseDateTheaters']],
    reviews_df[['id', 'creationDate', 'reviewState']],
    on='id'
).fillna({'audienceScore': -1, 'releaseDateTheaters': -1})

In [61]:
tomatoes_merged['releaseYear'] = tomatoes_merged['releaseDateTheaters'].str[:4]

In [63]:
tomatoes_merged = tomatoes_merged[
    (tomatoes_merged['audienceScore'] != -1) &
    (tomatoes_merged['releaseDateTheaters'] != -1)
]

In [64]:
print(f"rotten tomatoes movies: {movies_df.shape[0]}")
print(f"rotten tomatoes movies with ratings: {len(tomatoes_merged['id'].unique())}")

rotten tomatoes movies: 143258
rotten tomatoes movies with ratings: 22613


In [65]:
# Convert reviewState to numerical values
tomatoes_merged["score"] = tomatoes_merged["reviewState"].map({"fresh": 10, "rotten": 0})

# Extract year from creationDate
tomatoes_merged["year"] = pd.to_datetime(tomatoes_merged["creationDate"]).dt.year

# # Group by id and year, compute the average score and count reviews
# yearly_data = tomatoes_merged.groupby(["id", "year"]).agg(
#     avg_score=("score", "mean"),
#     num_reviews=("score", "count")
# ).reset_index()

# # Round scores and convert to integers
# yearly_data["avg_score"] = yearly_data["avg_score"].round().astype(int)

# # Convert to dictionary format
# score_dicts = yearly_data.groupby("id").apply(
#     lambda x: json.dumps(dict(zip(x["year"], x["avg_score"])), separators=(",", ":"))
# ).reset_index()

# review_count_dicts = yearly_data.groupby("id").apply(
#     lambda x: json.dumps(dict(zip(x["year"], x["num_reviews"])), separators=(",", ":"))
# ).reset_index()

# Compute total number of reviews for each movie
total_reviews = tomatoes_merged.groupby("id")["score"].count().reset_index()
total_reviews.rename(columns={"score": "rottenTomatoesNumVotes"}, inplace=True)

# Merge with unique movie data
unique_movies = tomatoes_merged.drop_duplicates(subset=["id"])[
    ["id", "title", "audienceScore", "releaseDateTheaters", "releaseYear"]
]

tomatoes_new = unique_movies.merge(total_reviews, on="id")
tomatoes_new.rename(columns={0: "yearly_scores", 1: "yearlyNumReviews"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tomatoes_merged["score"] = tomatoes_merged["reviewState"].map({"fresh": 10, "rotten": 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tomatoes_merged["year"] = pd.to_datetime(tomatoes_merged["creationDate"]).dt.year


In [66]:
tomatoes_new = tomatoes_new.rename(columns={"0_x": "rottenTomatoesReviews", "0_y": "rottenTomatoesReviewsNumVotes"})

In [67]:
tomatoes_new = tomatoes_new.drop(columns='releaseDateTheaters')

In [68]:
# tomatoes_new = tomatoes_merged

In [69]:
tomatoes_new.head()

Unnamed: 0,id,title,audienceScore,releaseYear,rottenTomatoesNumVotes
0,adrift_2018,Adrift,65.0,2018,40
1,1035316-born_to_kill,Born to Kill,74.0,1947,6
2,1221483-paa,Paa,67.0,2009,8
3,sarah_palin_you_betcha,Sarah Palin: You Betcha!,61.0,2011,31
4,a_state_of_mind_2005,A State of Mind,92.0,2005,27


In [70]:
tomatoes_new.shape[0]

22613

In [71]:
tomatoes_new = pd.concat([tomatoes_new, imdb_2024_movies])

In [72]:
tomatoes_new[tomatoes_new['title'] == "Dune: Part Two"]

Unnamed: 0,id,title,audienceScore,releaseYear,rottenTomatoesNumVotes
194498,dune_part_two,Dune: Part Two,94.0,2024,456


In [73]:
filtered_df = pd.merge(
    imdb_merged,
    tomatoes_new,
    left_on=['primaryTitle', 'startYear'],
    right_on=['title', 'releaseYear'],
    how='inner'
)

In [74]:
filtered_df = filtered_df.drop(columns=['primaryTitle', 'releaseYear', 'titleType', 'isAdult', 'endYear', 'id', 'originalTitle'])

In [75]:
filtered_df = filtered_df.rename(columns={"tconst": "imdbId"})

In [76]:
print(f"number of movies from rotten tomatoes and IMDb merge data: {filtered_df.shape[0]}")

number of movies from rotten tomatoes and IMDb merge data: 15311


In [77]:
#filtered_df = filtered_df[0:10]

In [78]:
filtered_df.head()

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes
0,tt0004099,1914,59.0,"Adventure,Comedy,Family",5.3,547,"His Majesty, the Scarecrow of Oz",53.0,1
1,tt0004457,1914,81.0,"Adventure,Comedy,Family",5.4,595,The Patchwork Girl of Oz,33.0,4
2,tt0004707,1914,82.0,Comedy,6.2,3791,Tillie's Punctured Romance,42.0,10
3,tt0004972,1915,195.0,"Drama,War",6.1,27113,The Birth of a Nation,54.0,42
4,tt0005078,1915,59.0,"Drama,Romance",6.5,2913,The Cheat,47.0,10


In [79]:
filtered_df.to_csv("raw/temp.csv", index=False)

In [114]:
filtered_df[filtered_df['title'] == "Dune: Part Two"]

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes
10880,tt15239678,2024,166.0,"Action,Adventure,Drama",8.5,591800,Dune: Part Two,94.0,456


# TMDb

In [98]:
if os.path.isfile("processed/tmdb.csv"):
    print("found existing csv file")
    tmdb = pd.read_csv("processed/tmdb.csv")       
else:
    tmdb = filtered_df[['imdbId']]

    print(f"fetching api data for {tmdb.shape[0]} movies")
    tmdb['raw'] = tmdb['imdbId'].apply(get_tmdb_data)

    tmdb['productionCountries'], tmdb['studios'], tmdb['originalLanguage'], tmdb['isAdult'], tmdb['budget'], tmdb['revenue'], tmdb['tmdbAverageScore'], tmdb['tmdbNumVotes'] = zip(*tmdb.raw)
    tmdb = tmdb.drop(columns=['raw'])

    tmdb = tmdb[
        (tmdb['budget'] != 0) &
        (tmdb['revenue'] != 0)
    ]

    print(f"tmdb shape: {tmdb.shape}")
    tmdb.to_csv("processed/tmdb.csv", index=False)

found existing csv file


In [121]:
if os.path.isfile("processed/tmdb_2024.csv"):
    print("found existing csv file")
    tmdb_2024 = pd.read_csv("processed/tmdb_2024.csv")       
else:
    tmdb_2024 = filtered_df[filtered_df['startYear'] == '2024']
    tmdb_2024 = tmdb_2024[['imdbId']]

    print(f"fetching api data for {tmdb_2024.shape[0]} movies")
    tmdb_2024['raw'] = tmdb_2024['imdbId'].apply(get_tmdb_data)

    tmdb_2024['productionCountries'], tmdb_2024['studios'], tmdb_2024['originalLanguage'], tmdb_2024['isAdult'], tmdb_2024['budget'], tmdb_2024['revenue'], tmdb_2024['tmdbAverageScore'], tmdb_2024['tmdbNumVotes'] = zip(*tmdb_2024.raw)
    tmdb_2024 = tmdb_2024.drop(columns=['raw'])

    tmdb_2024 = tmdb_2024[
        (tmdb_2024['budget'] != 0) &
        (tmdb_2024['revenue'] != 0)
    ]

    print(f"tmdb shape: {tmdb_2024.shape}")
    tmdb_2024.to_csv("processed/tmdb_2024.csv", index=False)

found existing csv file


In [122]:
tmdb.head()

Unnamed: 0,imdbId,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes
0,tt0004972,United States of America,Epoch Film Co.,en,False,100000,11000000,6.0,542
1,tt0005078,United States of America,"Jesse L. Lasky Feature Play Company,Paramount ...",en,False,17311,137365,5.9,64
2,tt0006864,United States of America,"Triangle Film Corporation,Wark Producing Corp.",en,False,385907,1750000,7.1,341
3,tt0012349,United States of America,Charles Chaplin Productions,en,False,250000,2500000,8.2,2139
4,tt0013140,United States of America,Universal Film Manufacturing Company,en,False,1100000,400200,6.6,90


In [123]:
tmdb_2024.head()

Unnamed: 0,imdbId,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes
0,tt10128846,United States of America,"American Zoetrope,Caesar Film",en,False,120000000,13857002,5.302,718
1,tt10655524,United States of America,"Columbia Pictures,Wayfarer Studios,Saks Pictur...",en,False,25000000,350993761,7.052,1505
2,tt10720352,"Canada,United States of America","Entertainment One,Tucker Tooley Entertainment,...",en,False,19000000,40829138,7.6,757
3,tt11057302,United States of America,"Columbia Pictures,di Bonaventura Pictures",en,False,80000000,100498764,5.376,2141
4,tt11152168,United States of America,"Paramount Pictures,Sunday Night Productions,Ma...",en,False,110000000,190309707,7.092,1298


In [124]:
tmdb = pd.concat([tmdb, tmdb_2024])

In [125]:
tmdb.shape

(5805, 9)

In [126]:
print(f"number of movies with information about budget and revenue from TMDb: {tmdb.shape[0]}")

number of movies with information about budget and revenue from TMDb: 5805


In [127]:
tmdb.head()

Unnamed: 0,imdbId,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes
0,tt0004972,United States of America,Epoch Film Co.,en,False,100000,11000000,6.0,542
1,tt0005078,United States of America,"Jesse L. Lasky Feature Play Company,Paramount ...",en,False,17311,137365,5.9,64
2,tt0006864,United States of America,"Triangle Film Corporation,Wark Producing Corp.",en,False,385907,1750000,7.1,341
3,tt0012349,United States of America,Charles Chaplin Productions,en,False,250000,2500000,8.2,2139
4,tt0013140,United States of America,Universal Film Manufacturing Company,en,False,1100000,400200,6.6,90


In [128]:
filtered_df.head()

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes
0,tt0004099,1914,59.0,"Adventure,Comedy,Family",5.3,547,"His Majesty, the Scarecrow of Oz",53.0,1
1,tt0004457,1914,81.0,"Adventure,Comedy,Family",5.4,595,The Patchwork Girl of Oz,33.0,4
2,tt0004707,1914,82.0,Comedy,6.2,3791,Tillie's Punctured Romance,42.0,10
3,tt0004972,1915,195.0,"Drama,War",6.1,27113,The Birth of a Nation,54.0,42
4,tt0005078,1915,59.0,"Drama,Romance",6.5,2913,The Cheat,47.0,10


In [129]:
third_df = pd.merge(
    filtered_df,
    tmdb,
    how="inner",
    on="imdbId",
)

In [130]:
third_df.shape

(5715, 17)

In [131]:
third_df[third_df['title'] == "Dune: Part Two"]

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes
4306,tt15239678,2024,166.0,"Action,Adventure,Drama",8.5,591800,Dune: Part Two,94.0,456,United States of America,Legendary Pictures,en,False,190000000,714444358,8.149,6425


# LETTERBOXD

In [107]:
if os.path.isfile("processed/letterboxd.csv"):
    letterboxd = pd.read_csv("processed/letterboxd.csv")
else:
    letterboxd = filtered_df[['imdbId']]

    print(f"scraping letterboxd for {tmdb.shape[0]} movies")
    letterboxd['raw'] = letterboxd['imdbId'].apply(get_letterboxd_data)

    letterboxd['directors'], letterboxd['letterboxdAverageScore'], letterboxd['letterboxdNumVotes'] = zip(*letterboxd.raw)
    letterboxd = letterboxd.drop(columns=['raw'])
    
    letterboxd = letterboxd[
        (letterboxd['letterboxdAverageScore'] != 0) &
        (letterboxd['letterboxdNumVotes'] != 0)
    ]

    print(f"letterboxd shape: {letterboxd.shape}")
    letterboxd.to_csv("processed/letterboxd.csv", index=False)

In [None]:
if os.path.isfile("processed/letterboxd_2024.csv"):
    letterboxd_2024 = pd.read_csv("processed/letterboxd_2024.csv")
else:
    letterboxd_2024 = filtered_df[filtered_df['startYear'] == '2024']
    letterboxd_2024 = letterboxd_2024[['imdbId']]

    print(f"scraping letterboxd for {letterboxd_2024.shape[0]} movies")
    letterboxd_2024['raw'] = letterboxd_2024['imdbId'].apply(get_letterboxd_data)

    letterboxd_2024['directors'], letterboxd_2024['letterboxdAverageScore'], letterboxd_2024['letterboxdNumVotes'] = zip(*letterboxd_2024.raw)
    letterboxd_2024 = letterboxd_2024.drop(columns=['raw'])
    
    letterboxd_2024 = letterboxd_2024[
        (letterboxd_2024['letterboxdAverageScore'] != 0) &
        (letterboxd_2024['letterboxdNumVotes'] != 0)
    ]

    print(f"letterboxd shape: {letterboxd_2024.shape}")
    letterboxd_2024.to_csv("processed/letterboxd_2024.csv", index=False)

In [109]:
letterboxd = pd.concat([letterboxd, letterboxd_2024])

In [110]:
fourth_df = pd.merge(
    third_df,
    letterboxd,
    how="inner",
    on="imdbId",
)

In [111]:
print(f"final shape: {fourth_df.shape}")
fourth_df.head()

final shape: (5574, 20)


Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes,directors,letterboxdAverageScore,letterboxdNumVotes
0,tt0004972,1915,195.0,"Drama,War",6.1,27113,The Birth of a Nation,54.0,42,United States of America,Epoch Film Co.,en,False,100000,11000000,6.0,542,D.W. Griffith,2.31,32086
1,tt0005078,1915,59.0,"Drama,Romance",6.5,2913,The Cheat,47.0,10,United States of America,"Jesse L. Lasky Feature Play Company,Paramount ...",en,False,17311,137365,5.9,64,Cecil B. DeMille,2.99,3745
2,tt0006864,1916,163.0,"Drama,History",7.7,17240,Intolerance,77.0,38,United States of America,"Triangle Film Corporation,Wark Producing Corp.",en,False,385907,1750000,7.1,341,D.W. Griffith,3.69,18974
3,tt0012349,1921,68.0,"Comedy,Drama,Family",8.2,139644,The Kid,95.0,50,United States of America,Charles Chaplin Productions,en,False,250000,2500000,8.2,2139,Charlie Chaplin,4.21,102205
4,tt0013140,1922,117.0,"Drama,Thriller",7.0,4163,Foolish Wives,70.0,18,United States of America,Universal Film Manufacturing Company,en,False,1100000,400200,6.6,90,Erich von Stroheim,3.67,4822


In [113]:
fourth_df[fourth_df['title'] == "Dune: Part Two"]

Unnamed: 0,imdbId,startYear,runtimeMinutes,genres,imdbScore,imdbNumVotes,title,audienceScore,rottenTomatoesNumVotes,productionCountries,studios,originalLanguage,isAdult,budget,revenue,tmdbAverageScore,tmdbNumVotes,directors,letterboxdAverageScore,letterboxdNumVotes


In [112]:
fourth_df.to_csv('processed/movies.csv')