In [1]:
import pandas as pd
import numpy as np
import requests
import dotenv
import os
import time
import nltk
import tqdm
# nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

In [5]:
# Setup for scraping: load the TMDB token from the environment / .env file
# and read the list of movies whose details will be fetched.
dotenv.load_dotenv()
api_key = os.getenv('tmdb_read_key')
if not api_key:
    # os.getenv returns None for a missing variable; the request helpers would
    # then send "Bearer None", which the API is expected to reject. Warn up
    # front instead of discovering it after a long scrape. Not fatal, because
    # the cells that only combine already-scraped data do not need the token.
    print("Warning: environment variable 'tmdb_read_key' is not set; TMDB requests will not be authorized.")
ids_df = pd.read_csv("../data/movies_15_to_19.csv")
print(f"Number of movies to scrape: {ids_df.shape[0]}")

Number of movies to scrape: 152774


In [9]:
def get_cast_info(id, max_retries=5):
    """Fetch the TMDB credits for one movie and summarise its cast.

    Parameters
    ----------
    id : int or str
        Movie id interpolated into the ``/3/movie/{id}/credits`` URL. The name
        shadows the builtin ``id`` but is kept so existing callers still work.
    max_retries : int, default 5
        Maximum number of request attempts. An attempt is consumed by an
        HTTP 429 response or by a network-level failure; each is followed by
        an exponentially growing sleep (1, 2, 4, ... seconds).

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with the columns ``movie_id``, ``gender_split``,
        ``max_popularity``, ``min_popularity``, ``avg_popularity``,
        ``median_popularity`` and ``num_cast``. ``None`` is returned for any
        non-200 status other than 429 and when all attempts are used up.
    """
    # Bound before the loop so the "Max retries" message below also works when
    # max_retries <= 0 and the loop body never runs.
    movie_id = id
    base_wait_time = 1
    # Seconds. Without a timeout a stalled connection would block indefinitely.
    request_timeout = 30

    # Loop-invariant request pieces.
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
    params = {"language": "en-US"}

    retry_count = 0
    while retry_count < max_retries:
        # api_key is the notebook-level token loaded in the setup cell.
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # Transient network problems (timeouts, dropped connections) are
            # retried with the same backoff as rate limiting instead of
            # aborting the whole scrape.
            wait_time = base_wait_time * (2 ** retry_count)
            print(f"Request failed for movie {movie_id} ({exc}). Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            retry_count += 1
            continue

        if response.status_code == 429:
            wait_time = base_wait_time * (2 ** retry_count)
            print(f"Rate limited on movie {movie_id}. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            retry_count += 1
            continue

        if response.status_code != 200:
            return None

        # One row per cast member.
        row_df = pd.json_normalize(response.json(), record_path=['cast'])

        if "gender" in row_df.columns:
            # Share of cast members per gender code, indexed by that code.
            gender_split = row_df['gender'].value_counts(normalize=True)
        else:
            gender_split = pd.Series(dtype=float)

        if "popularity" in row_df.columns:
            popularity = row_df['popularity']
            max_popularity = popularity.max()
            min_popularity = popularity.min()
            avg_popularity = popularity.mean()
            median_popularity = popularity.median()
        else:
            max_popularity = np.nan
            min_popularity = np.nan
            avg_popularity = np.nan
            median_popularity = np.nan

        num_cast = row_df.shape[0]

        # Because the frame is built with index=[0], pandas aligns the
        # gender_split Series on label 0: the stored value is the share of
        # cast members whose gender code is 0, or NaN when no member has it.
        # NOTE(review): confirm that gender code 0 is the intended category.
        return pd.DataFrame({
            'movie_id': movie_id,
            'gender_split': gender_split,
            'max_popularity': max_popularity,
            'min_popularity': min_popularity,
            'avg_popularity': avg_popularity,
            'median_popularity': median_popularity,
            'num_cast': num_cast
        }, index=[0])

    print(f"Max retries reached for movie {movie_id}")
    return None

# Keys of the score dict produced by SentimentIntensityAnalyzer.polarity_scores.
_SENTIMENT_KEYS = ("neg", "neu", "pos", "compound")


def _get_sentiment_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    analyzer = getattr(_get_sentiment_analyzer, "_instance", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._instance = analyzer
    return analyzer


def _keyword_sentiment_row(means=None):
    """Build the one-row result frame; every column is NaN when means is None."""
    return pd.DataFrame({
        f"review_mean_{key}_sentiment": (np.nan if means is None else means[key])
        for key in _SENTIMENT_KEYS
    }, index=[0])


def get_keyword_info(movie_id, max_retries=5):
    """Fetch a movie's TMDB keywords and average their VADER sentiment scores.

    Parameters
    ----------
    movie_id : int or str
        Movie id interpolated into the ``/3/movie/{movie_id}/keywords`` URL.
    max_retries : int, default 5
        Maximum number of request attempts. An attempt is consumed by an
        HTTP 429 response or by a network-level failure; each is followed by
        an exponentially growing sleep (1, 2, 4, ... seconds).

    Returns
    -------
    pandas.DataFrame
        Always a one-row frame with the columns
        ``review_mean_neg_sentiment``, ``review_mean_neu_sentiment``,
        ``review_mean_pos_sentiment`` and ``review_mean_compound_sentiment``.
        Despite the ``review_`` prefix, the values are means over the movie's
        keyword names. All four are NaN when the response is not 200, when
        the movie has no keywords, or when all attempts are used up.
    """
    base_wait_time = 1
    # Seconds. Without a timeout a stalled connection would block indefinitely.
    request_timeout = 30

    # Loop-invariant request pieces.
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords"
    params = {"language": "en-US"}

    retry_count = 0
    while retry_count < max_retries:
        # api_key is the notebook-level token loaded in the setup cell.
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # Transient network problems are retried with the same backoff as
            # rate limiting instead of aborting the whole scrape.
            wait_time = base_wait_time * (2 ** retry_count)
            print(f"Request failed for movie {movie_id} ({exc}). Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            retry_count += 1
            continue

        if response.status_code == 429:
            wait_time = base_wait_time * (2 ** retry_count)
            print(f"Rate limited on movie {movie_id}. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            retry_count += 1
            continue

        if response.status_code != 200:
            return _keyword_sentiment_row()

        keywords = response.json().get('keywords', [])
        keyword_names = [keyword['name'] for keyword in keywords]

        # Reuse one analyzer across calls rather than constructing a new one
        # for every movie.
        sia = _get_sentiment_analyzer()
        # Building the frame from the score dicts names the columns from the
        # dict keys rather than relying on the order of dict.values().
        scores = pd.DataFrame([sia.polarity_scores(kw) for kw in keyword_names])

        if scores.empty:
            return _keyword_sentiment_row()
        return _keyword_sentiment_row(scores[list(_SENTIMENT_KEYS)].mean())

    print(f"Max retries reached for movie {movie_id}")
    return _keyword_sentiment_row()

In [None]:
# Initialize lists to store results
# cast_results = []
# keyword_results = []

# # Iterate through each movie in ids_df
# for idx, row in tqdm.tqdm(ids_df.iterrows(), total=ids_df.shape[0]):
#     movie_id = row['id']
    
#     # Get cast information
#     cast_info = get_cast_info(movie_id)
#     if cast_info is not None:
#         cast_results.append(cast_info)
    
#     # Get keyword information
#     keyword_info = get_keyword_info(movie_id)
#     keyword_results.append(keyword_info)
    

# # Concatenate all results into dataframes
# cast_df = pd.concat(cast_results, ignore_index=True)
# keywords_df = pd.concat(keyword_results, ignore_index=True)

100%|██████████| 152774/152774 [10:15:54<00:00,  4.13it/s]  


KeyError: 'movie_id'

In [None]:
# Attach the keyword sentiment columns to the original ids_df.
# The frames returned by get_keyword_info() have no 'movie_id' column, so
# merging with right_on='movie_id' raises KeyError: 'movie_id'. The join is
# therefore done on the row index instead, starting from ids_df so the cell
# does not depend on an enriched_df created by a later cell.
# NOTE(review): this assumes keywords_df holds one row per ids_df row, in the
# same order and with a fresh 0..n-1 index - confirm how keywords_df was built.
if len(keywords_df) != len(ids_df):
    raise ValueError(
        f"keywords_df has {len(keywords_df)} rows but ids_df has {len(ids_df)}; "
        "cannot align them by position"
    )
enriched_df = ids_df.join(keywords_df)

print(f"\nEnriched dataframe shape: {enriched_df.shape}")
print(enriched_df.head())

Unnamed: 0,movie_id,gender_split,max_popularity,min_popularity,avg_popularity,median_popularity,num_cast
0,271039,0.428571,1.0347,0.0193,0.354657,0.03790,7
1,296917,,3.1799,0.1421,1.360221,1.13675,24
2,312849,0.125000,1.7608,0.0168,0.654975,0.55140,8
3,207703,0.176471,8.6556,0.0214,1.167616,0.52930,51
4,336806,0.466667,1.2933,0.0143,0.183733,0.07190,15
...,...,...,...,...,...,...,...
152713,644022,,,,,,0
152714,636066,,0.1683,0.1683,0.168300,0.16830,1
152715,622240,1.000000,0.0168,0.0000,0.004775,0.00000,8
152716,622218,,,,,,0


In [25]:
# Left-join keywords_df onto ids_df by row index, then discard the
# "Unnamed: 0" column from the combined frame.
enriched_df = ids_df.join(keywords_df, how="left").drop("Unnamed: 0", axis="columns")

In [26]:
# Left-merge the cast statistics: enriched_df 'id' is matched against
# cast_df 'movie_id', keeping every row of enriched_df.
enriched_df = pd.merge(
    enriched_df,
    cast_df,
    how='left',
    left_on='id',
    right_on='movie_id',
)

In [27]:
# Show the combined frame as the cell's output.
enriched_df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,...,review_mean_neu_sentiment,review_mean_pos_sentiment,review_mean_compound_sentiment,movie_id,gender_split,max_popularity,min_popularity,avg_popularity,median_popularity,num_cast
0,False,/7kGhq8nROnNGw6uZoUMFSe389VL.jpg,"[10749, 18]",271039,en,"Something, Anything",When a tragedy shatters her plans for domestic...,12.0941,/xmqqgTz5ceMjoFMe831WXtEDKsc.jpg,2015-01-09,...,1.000000,0.000000,0.000000,271039.0,0.428571,1.0347,0.0193,0.354657,0.03790,7.0
1,False,/8jnHGZ4vguLcctDHlMvaAyG7mjb.jpg,"[16, 878, 28]",296917,ja,劇場版 PSYCHO-PASS サイコパス,"In a futuristic Japan, the Sibyl System is cha...",9.3462,/hUlhPosXp62uuTS0c2aINdg8cvV.jpg,2015-01-09,...,0.457714,0.000000,-0.339829,296917.0,,3.1799,0.1421,1.360221,1.13675,24.0
2,False,/pw9oCxdHISiiWYjthGO9iXXcVtw.jpg,"[18, 14, 53]",312849,tr,Sarmaşık,"After the owner's bankruptcy, the crew is stra...",8.6511,/jsqgoFLDX4o0dNkdjB8UG83Rzng.jpg,2015-01-26,...,,,,312849.0,0.125000,1.7608,0.0168,0.654975,0.55140,8.0
3,False,/qzUIOTk0E3F1zjvYjcBRTKUTgf9.jpg,"[80, 35, 28, 12]",207703,en,Kingsman: The Secret Service,The story of a super-secret spy organization t...,8.1795,/r6q9wZK5a2K51KFj4LWVID6Ja1r.jpg,2015-01-24,...,0.606556,0.393444,0.200878,207703.0,0.176471,8.6556,0.0214,1.167616,0.52930,51.0
4,False,/2KBm2Q6NzwdtOH5Gav9WgkdHlpw.jpg,[18],336806,sv,Efterskalv,When John returns home to his father after ser...,7.7409,/taRuclzevcoTBNlT72rlfdILxsd.jpg,2015-01-20,...,0.725000,0.000000,-0.230267,336806.0,0.466667,1.2933,0.0143,0.183733,0.07190,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152769,False,,[],644022,fr,Premiers pas dans la forêt,,0.0071,/3o1rCUdSA7rh9kp0vbAqH1lWIo9.jpg,2019-12-04,...,,,,644022.0,,,,,,0.0
152770,False,,[99],636066,en,Hong Kong: Fight For Freedom!,Stefan Molyneux goes to the streets of Hong Ko...,0.0000,/5E5Vstbrj6ZNrijoLiZ0ZD4oCH9.jpg,2019-12-09,...,,,,636066.0,,0.1683,0.1683,0.168300,0.16830,1.0
152771,False,,"[18, 28]",622240,en,Great Job,The tale of a lepidopterist and his encounter ...,0.0071,,2019-12-20,...,,,,622240.0,1.000000,0.0168,0.0000,0.004775,0.00000,8.0
152772,False,,[],622218,es,O verde,It snows in the image. The light samples recor...,0.0000,/1mfn9Ic8VsBitMikTOuubPt51j7.jpg,2019-12-18,...,,,,622218.0,,,,,,0.0


In [28]:
# Write the enriched table to disk without the index column.
enriched_df.to_csv(path_or_buf="../data/movies_15_to_19_enriched.csv", index=False)