# GET DATA ID FILES

In [None]:
import json
import os
from collections import Counter

import pandas as pd
import requests

In [None]:
# TMDB API key. The TMDB_API_KEY environment variable takes precedence so the
# secret does not have to live in the notebook; the literal is only a fallback
# that keeps existing runs working.
# NOTE(review): this key is stored in plain text here - rotate it and drop the fallback.
API_KEY = os.environ.get('TMDB_API_KEY') or 'ae8f44a0db772e7e8f2d45b9fad24017'

In [None]:
#Genre Mapping Dictionary ; to be used later
GENRE_LIST_URL = f"https://api.themoviedb.org/3/genre/movie/list?api_key={API_KEY}&language=en-US"

def get_genre_mapping():
    """Fetch TMDB's movie genre list and return it as an id -> name dict.

    Returns:
        dict: Genre id mapped to genre name, built from the `genres` entries of
        the response. Empty when the payload has no `genres` key, or when the
        request does not return HTTP 200 (the status code is printed in that case).
    """
    reply = requests.get(GENRE_LIST_URL)
    if reply.status_code != 200:
        print(f"Error fetching genres: Status Code {reply.status_code}")
        return {}

    lookup = {}
    for entry in reply.json().get('genres', []):
        lookup[entry['id']] = entry['name']
    return lookup

# Fetch the mapping
genre_dict = get_genre_mapping()

In [None]:
genre_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [None]:
# Extract all movies for Marvel Studios
START_YEAR = '2008-01-01'
END_YEAR = '2019-12-31'
company_id = 420

# Discover Movies helper: pages through TMDB's /discover/movie endpoint for one
# production company, limited to the START_YEAR..END_YEAR primary-release window.
def discover_movies(company_id):
    """Return every TMDB discover result for a production company.

    Args:
        company_id: TMDB production-company id, sent as `with_companies`.

    Returns:
        list: The `results` entries of every page fetched, in the order the API
        returned them (the query sorts by release date, newest first). If a page
        request does not return HTTP 200 the status is printed and the movies
        collected so far are returned.
    """
    page = 1
    movies = []
    while True:
        url = (
            f'https://api.themoviedb.org/3/discover/movie?api_key={API_KEY}'
            f'&with_companies={company_id}'
            f'&primary_release_date.gte={START_YEAR}'
            f'&primary_release_date.lte={END_YEAR}'
            f'&sort_by=release_date.desc'
            f'&page={page}'
        )
        response = requests.get(url)
        if response.status_code != 200:
            # An error payload has no 'results'/'total_pages'; stop here rather
            # than fail with a KeyError that hides the real cause.
            print(f"Error fetching movies (page {page}): Status Code {response.status_code}")
            break

        data = response.json()
        movies.extend(data.get('results', []))

        # A missing 'total_pages' is treated as "this was the last page".
        if page >= data.get('total_pages', page):
            break
        page += 1
    return movies

In [None]:
# Fetch the company's catalogue only when a company id is configured.
# NOTE(review): the else branch only prints, leaving `movies` undefined, so the
# next cell (which builds a DataFrame from `movies`) would fail with NameError.
if company_id:
    movies = discover_movies(company_id)
else:
    print("Production company not found.")

In [None]:
# Tabulate the discover results, keeping only these fields in this column order.
kept_columns = [
    'poster_path', 'genre_ids', 'id', 'title', 'original_language',
    'overview', 'popularity', 'release_date', 'vote_average', 'vote_count',
]
all_movie_df = pd.DataFrame(movies)[kept_columns]

In [None]:
#Code to filter out only MCU movies
import itertools

def get_frequency_of_values_in_lists(data):
    """
    Counts the frequency of values in a list of lists.

    Args:
        data: An iterable of iterables (e.g. a list of lists, or a pandas
            Series whose cells are lists). The inner items must be hashable.

    Returns:
        A plain dictionary where keys are values from the lists and values are
        their counts, with keys in first-seen order.
    """
    # Counter tallies straight off the chained iterator, so no flattened copy
    # of the data is built; dict() keeps the return type a plain dict.
    return dict(Counter(itertools.chain.from_iterable(data)))

In [None]:
pd.DataFrame({genre_dict[k]:{"id":k,"count":v} for k,v in get_frequency_of_values_in_lists(all_movie_df['genre_ids']).items()})

Unnamed: 0,Documentary,Science Fiction,Action,Adventure,Comedy,Fantasy,Drama,TV Movie,History,Animation,Crime,Family
id,99,878,28,12,35,14,18,10770,36,16,80,10751
count,3,31,33,29,5,13,1,1,1,3,1,1


In [None]:
# Define genres to exclude : Discard all documentaries (99),Animation(16), Crime(80), History(36)
excluded_genres = {99, 16, 80, 36}

#And also filter out movies which are one-shot or tie-ins
# Regex alternation, matched against the lower-cased title.
exclusion_string = "marvel one|peter's|holiday special|team"

# isdisjoint is True when a movie carries none of the excluded genre ids.
has_allowed_genres = all_movie_df['genre_ids'].apply(excluded_genres.isdisjoint)

# na=False counts a missing title as "no match"; without it the mask would
# contain NaN, which the `~` below cannot invert.
is_tie_in = all_movie_df['title'].str.lower().str.contains(exclusion_string, regex=True, na=False)

# .copy() makes movie_df an independent frame rather than a slice of all_movie_df.
movie_df = all_movie_df[has_allowed_genres & ~is_tie_in].copy()

In [None]:
movie_df

Unnamed: 0,poster_path,genre_ids,id,title,original_language,overview,popularity,release_date,vote_average,vote_count
2,/4q2NNj4S5dG2RLF9CpXsej7yXl.jpg,"[28, 12, 878]",429617,Spider-Man: Far From Home,en,Peter Parker and his friends go on a summer tr...,31.1556,2019-06-28,7.41,16050
3,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,"[12, 878, 28]",299534,Avengers: Endgame,en,After the devastating events of Avengers: Infi...,24.3324,2019-04-24,8.238,26302
4,/AtsgWhDnHTq68L0lLsUrCnM7TjG.jpg,"[28, 12, 878]",299537,Captain Marvel,en,The story follows Carol Danvers as she becomes...,233.0808,2019-03-06,6.802,16010
5,/cFQEO687n1K6umXbInzocxcnAQz.jpg,"[28, 12, 878]",363088,Ant-Man and the Wasp,en,Just when his time under house arrest is about...,8.488,2018-07-04,6.92,13541
6,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,"[12, 28, 878]",299536,Avengers: Infinity War,en,As the Avengers and their allies have continue...,36.2056,2018-04-25,8.236,30490
8,/uxzzxijgPIY7slzFvMotPv8wjKA.jpg,"[28, 12, 878]",284054,Black Panther,en,"King T'Challa returns home to the reclusive, t...",16.3835,2018-02-13,7.373,22539
9,/rzRwTcFvttcN1ZpX2xv4j3tSdJu.jpg,"[28, 12, 878]",284053,Thor: Ragnarok,en,Thor is imprisoned on the other side of the un...,12.5682,2017-10-02,7.584,21104
10,/c24sv2weTHPsmDa7jEMN0m2P3RT.jpg,"[28, 12, 878, 18]",315635,Spider-Man: Homecoming,en,Following the events of Captain America: Civil...,18.1736,2017-07-05,7.331,22178
11,/y4MBh0EjBlMuOzv9axM4qJlmhzz.jpg,"[878, 12, 28]",283995,Guardians of the Galaxy Vol. 2,en,The Guardians must fight to keep their newfoun...,13.5264,2017-04-19,7.612,21940
13,/uGBVj3bEbCoZbDjjl9wTxcygko1.jpg,"[28, 12, 14]",284052,Doctor Strange,en,"After his career is destroyed, a brilliant but...",21.3453,2016-10-25,7.418,22509


In [None]:
movie_df['id'].unique()

array([429617, 299534, 299537, 363088, 299536, 284054, 284053, 315635,
       283995, 284052, 271110, 102899,  99861, 118340, 100402,  76338,
        68721,  24428,   1771,  10195,  10138,   1724,   1726])

In [None]:
import time
from tqdm import tqdm

In [None]:
#Movie Cast API
def get_movie_cast(movie_ids, pause_seconds=1):
    """Fetch the filtered acting cast of each movie from TMDB's credits endpoint.

    Args:
        movie_ids: Iterable of TMDB movie ids.
        pause_seconds: Seconds to sleep after every request (default 1).

    Returns:
        tuple: `(cast_data, response)`.
        `cast_data` maps each movie id to a list of
        `(name, actor_id, popularity, character, movie_id)` tuples, one per cast
        entry whose popularity is above 1.0 and whose `known_for_department` is
        'Acting'. A movie whose request did not return HTTP 200 maps to `[]`
        (and the failure is printed).
        `response` is the last HTTP response received, or None when
        `movie_ids` is empty.
    """
    base_url = 'https://api.themoviedb.org/3/movie/'
    cast_data = {}
    # Bound up front so an empty `movie_ids` returns ({}, None) instead of
    # raising UnboundLocalError at the return statement.
    response = None

    for movie_id in tqdm(movie_ids):
        url = f"{base_url}{movie_id}/credits?api_key={API_KEY}"
        response = requests.get(url)

        if response.status_code == 200:
            cast_list = response.json().get('cast', [])
            # .get() on the optional fields: an entry lacking them is skipped
            # (or gets character=None) rather than aborting the whole run.
            cast_data[movie_id] = [
                (actor['name'], actor['id'], actor['popularity'], actor.get('character'), movie_id)
                for actor in cast_list
                if (actor.get('popularity') or 0) > 1.0
                and actor.get('known_for_department') == 'Acting'
            ]
        else:
            print(f"❌ Failed to fetch cast for movie ID {movie_id} (Status {response.status_code})")
            cast_data[movie_id] = []
        time.sleep(pause_seconds)

    return cast_data, response

In [None]:
cast_data,response = get_movie_cast(movie_df['id'])

100%|██████████| 23/23 [00:26<00:00,  1.15s/it]


# Get the cast list for every movie and keep only unique actors

In [None]:
movie_cast_list = pd.DataFrame.from_dict(cast_data,orient='index').values.tolist()

In [None]:
unique_actors = set([j for i in movie_cast_list for j in i])

In [None]:
actors = pd.DataFrame(unique_actors,columns=['name','id','popularity','character','movie_id'])
actors.dropna(axis=0,inplace=True)
actors['id'] = actors['id'].astype(int)

In [None]:
# Turn movie_id into a categorical whose categories are listed in movie_df's row
# order, so the sort below follows that order rather than numeric id order.
# Sorting descending puts rows for the movie listed last in movie_df at the top,
# which decides which row a later keep-first de-duplication by name retains.
# NOTE(review): movie_df looks newest-first (the discover query sorts by
# release_date.desc), so this would favour each actor's earliest film - confirm.
actors['movie_id'] = actors['movie_id'].astype("category")
actors['movie_id'] = actors['movie_id'].cat.set_categories(movie_df['id'])
actors.sort_values(["movie_id"],ascending=False,inplace=True)

In [None]:
actors.drop_duplicates(['name'],keep='first',inplace=True)

In [None]:
lead_filter = actors['popularity'].mean()

In [None]:
lead_filter

np.float64(2.910201393728223)

# Get lead-actor vs. non-lead-actor dataframes: used for scalability in further applications

In [None]:
#Number of lead actors
lead_actors_df = actors[actors['popularity']>=lead_filter]

In [None]:
#Number of not so lead actors
actors[actors['popularity']<lead_filter]

Unnamed: 0,name,id,popularity,character,movie_id
650,Shaun Toub,17857,1.9773,Yinsen,1726
655,Joshua Harto,34544,1.4475,CAOC Analyst,1726
699,Kevin Foster,95698,1.0430,Jimmy,1726
506,Bill Smitrovich,17200,2.1970,General Gabriel,1726
706,Micah A. Hauptman,150669,1.0197,CAOC Analyst,1726
...,...,...,...,...,...
118,JB Smoove,65920,2.4921,Mr. Dell,429617
127,Anjana Vasan,1503076,1.1538,Queens Reporter,429617
520,Meagan Holder,1378683,1.3179,Pretty Tourist (uncredited),429617
221,Massi Furlan,1010873,1.1199,Flight Attendant (uncredited),429617


In [None]:
lead_actors_df

Unnamed: 0,name,id,popularity,character,movie_id
93,Gwyneth Paltrow,12052,4.1431,Pepper Potts,1726
525,Robert Downey Jr.,3223,11.2703,Tony Stark,1726
513,Terrence Howard,18288,4.6701,Rhodey,1726
82,Jeff Bridges,1229,4.6122,Obadiah Stane,1726
485,Samuel L. Jackson,2231,10.9680,Nick Fury (uncredited),1726
...,...,...,...,...,...
793,Ken Jeong,83586,4.8592,Security Guard,299534
775,James D'Arcy,19655,3.1687,Jarvis,299534
63,Hiroyuki Sanada,9195,4.9882,Akihiko,299534
621,Jake Gyllenhaal,131,8.2200,Quentin Beck / Mysterio,429617


In [None]:
import time
from tqdm import tqdm

# Get revenue and review data for each actor

In [None]:
def get_actor_movies_with_money_info(actor_name, actor_id, start_year=2003, end_year=2024):
    """Collect financial and billing details for an actor's movies from TMDB.

    Reads the actor's movie credits, keeps the movies whose release year lies in
    `start_year..end_year` (inclusive), and for each one fetches the movie
    details and the movie's cast list.

    Args:
        actor_name: Display name, used only for the progress message.
        actor_id: TMDB person id.
        start_year: First release year to include.
        end_year: Last release year to include.

    Returns:
        list: One tuple per kept movie:
        `(movie_id, title, budget, revenue, runtime, release_date, genres, cast_order)`.
        `genres` is the raw `genres` value from the details payload and
        `cast_order` is the actor's `order` in the movie's cast, or None when the
        cast request failed or the actor is not listed. Movies without a
        parsable release year, or whose details request failed, are skipped.
        Returns `[]` when the actor's credits cannot be fetched.
    """
    print(f"For {actor_name}")
    base_url = "https://api.themoviedb.org/3"
    filmography_url = f"{base_url}/person/{actor_id}/movie_credits?api_key={API_KEY}"

    response = requests.get(filmography_url)
    if response.status_code != 200:
        print(f"Failed to fetch credits for actor {actor_id}")
        return []

    movies = response.json().get("cast", [])
    selected_movies = []
    num_req = 0

    for movie in tqdm(movies):
        movie_id = movie.get("id")
        movie_name = movie.get("title")
        release_date = movie.get("release_date", "")

        # Skip if no valid release date. Only the two expected failures are
        # caught: a non-string value (TypeError) or a non-numeric/empty year
        # (ValueError); anything else, including Ctrl-C, propagates.
        try:
            year = int(release_date[:4])
        except (TypeError, ValueError):
            continue

        if not (start_year <= year <= end_year):
            continue

        # Fetch movie details
        movie_details_url = f"{base_url}/movie/{movie_id}?api_key={API_KEY}"
        details_response = requests.get(movie_details_url)
        if details_response.status_code != 200:
            continue

        details = details_response.json()
        budget = details.get("budget", 0)
        revenue = details.get("revenue", 0)
        runtime = details.get("runtime", 0)
        release_date = details.get("release_date", "")
        genres = details.get("genres", [])

        # Fetch cast order
        movie_credits_url = f"{base_url}/movie/{movie_id}/credits?api_key={API_KEY}"
        credits_response = requests.get(movie_credits_url)
        if credits_response.status_code != 200:
            cast_order = None
        else:
            cast_data = credits_response.json().get("cast", [])
            # .get() so one malformed cast entry cannot abort the whole actor.
            cast_order = next(
                (c.get('order') for c in cast_data if c.get('id') == actor_id),
                None,
            )

        selected_movies.append(
            (movie_id, movie_name, budget, revenue, runtime, release_date, genres, cast_order)
        )

        # Crude throttle: pause one second after every 48 counted requests.
        num_req += 2  # 2 requests per movie (details + credits)
        if num_req >= 48:
            time.sleep(1)
            num_req = 0

    return selected_movies

In [None]:
#Treatment group specified : for this causal analysis
main_actors_str = '''Robert Downey Jr.
Chris Evans
Scarlett Johansson
Mark Ruffalo
Jeremy Renner
Tom Holland
Benedict Cumberbatch
Chadwick Boseman
Paul Rudd
Tom Hiddleston
Chris Pratt
Chris Hemsworth'''

In [None]:
main_actors_str.split("\n")

['Robert Downey Jr.',
 'Chris Evans',
 'Scarlett Johansson',
 'Mark Ruffalo',
 'Jeremy Renner',
 'Tom Holland',
 'Benedict Cumberbatch',
 'Chadwick Boseman',
 'Paul Rudd',
 'Tom Hiddleston',
 'Chris Pratt',
 'Chris Hemsworth']

In [None]:
avengers = actors[actors['name'].isin(main_actors_str.split("\n"))]

In [None]:
import numpy as np

In [None]:
# For each treatment-group actor: fetch their filmography, tidy it, and write the
# movies with positive revenue to avenger_<actor_id>.json.
credit_columns = ['movie_id', 'title', 'budget', 'revenue', 'runtime', 'release_date', 'genres', 'cast_order']

for actor_name, actor_id in zip(avengers['name'], avengers['id']):
    acting_list = get_actor_movies_with_money_info(actor_name, actor_id)
    acted_df = pd.DataFrame(acting_list, columns=credit_columns)

    # Keep only the names of non-excluded genres; a movie left with no genre
    # becomes NaN so that the dropna below discards it.
    acted_df['genres'] = acted_df['genres'].apply(
        lambda entries: [entry['name'] for entry in entries if entry['id'] not in excluded_genres] or np.nan
    )

    # Drop every row that has any missing field, then tag the rows with the actor.
    acted_df = acted_df.dropna(how='any', axis=0).assign(actor_name=actor_name)

    acted_df.loc[acted_df['revenue'] > 0].to_json(f"avenger_{actor_id}.json")

For Robert Downey Jr.


100%|██████████| 127/127 [00:11<00:00, 10.67it/s]


For Scarlett Johansson


100%|██████████| 101/101 [00:22<00:00,  4.57it/s]


For Jeremy Renner


100%|██████████| 56/56 [00:12<00:00,  4.47it/s]


For Tom Hiddleston


100%|██████████| 60/60 [00:15<00:00,  3.86it/s]


For Chris Hemsworth


100%|██████████| 59/59 [00:12<00:00,  4.58it/s]


For Chris Evans


100%|██████████| 66/66 [00:15<00:00,  4.35it/s]


For Mark Ruffalo


100%|██████████| 89/89 [00:17<00:00,  5.17it/s]


For Chris Pratt


100%|██████████| 65/65 [00:16<00:00,  3.84it/s]


For Paul Rudd


100%|██████████| 112/112 [00:26<00:00,  4.23it/s]


For Chadwick Boseman


100%|██████████| 31/31 [00:08<00:00,  3.70it/s]


For Tom Holland


100%|██████████| 36/36 [00:07<00:00,  4.89it/s]


For Benedict Cumberbatch


100%|██████████| 109/109 [00:27<00:00,  4.02it/s]


In [None]:
import numpy as np
import glob

# Export data for the treatment and control groups

In [None]:
all_data = []  # not read anywhere in the visible cells; kept in case later code expects the name

# Load every per-actor export into one frame. Frames are collected in a list and
# concatenated once: cheaper than concatenating inside the loop, and it avoids
# seeding the result with an empty DataFrame.
avenger_frames = []
# sorted() makes the row order independent of the filesystem's listing order.
for file_path in sorted(glob.glob('avenger_*.json')):
    try:
        with open(file_path, 'r') as f:
            avenger_frames.append(pd.DataFrame(json.load(f)))
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in file: {file_path}")

# With no readable files the result is an empty DataFrame, as before.
avenger_df = pd.concat(avenger_frames, ignore_index=True) if avenger_frames else pd.DataFrame()

In [None]:
avenger_df

Unnamed: 0,movie_id,title,budget,revenue,runtime,release_date,genres,cast_order,actor_name
0,4964,Knocked Up,30000000,219900000,129,2007-06-01,"[Comedy, Romance, Drama]",2,Paul Rudd
1,22958,The Shape of Things,0,735992,96,2003-07-24,"[Comedy, Drama, Romance]",0,Paul Rudd
2,6575,Walk Hard: The Dewey Cox Story,35000000,18317151,96,2007-12-21,"[Comedy, Music]",68,Paul Rudd
3,6957,The 40 Year Old Virgin,26000000,177400000,116,2005-08-11,"[Comedy, Romance]",2,Paul Rudd
4,8699,Anchorman: The Legend of Ron Burgundy,26000000,90574188,95,2004-06-28,[Comedy],2,Paul Rudd
...,...,...,...,...,...,...,...,...,...
351,455980,Tag,28000000,75100000,100,2018-05-30,"[Comedy, Action]",2,Jeremy Renner
352,345914,The House,40000000,34200000,88,2017-06-29,[Comedy],16,Jeremy Renner
353,299534,Avengers: Endgame,356000000,2799439100,181,2019-04-24,"[Adventure, Science Fiction, Action]",5,Jeremy Renner
354,497698,Black Widow,200000000,379751131,134,2021-07-07,"[Action, Adventure, Science Fiction]",89,Jeremy Renner


In [None]:
# OMDb API key. The OMDB_API_KEY environment variable takes precedence so the
# secret does not have to live in the notebook; the literal is only a fallback
# that keeps existing runs working.
# NOTE(review): this key is stored in plain text here - rotate it and drop the fallback.
OMDB_API_KEY = os.environ.get('OMDB_API_KEY') or "4280db16"

In [None]:
def get_rotten_tomatoes_rating(movie_title):
    """Look a title up on OMDb and return its vote count, IMDb id and ratings.

    Despite the name, every rating source in the response is returned, not only
    Rotten Tomatoes.

    Args:
        movie_title: Title to search for (sent as OMDb's `t` parameter).

    Returns:
        tuple: Always a 3-tuple `(imdb_votes, imdb_id, ratings)` so callers can
        unzip the results. `ratings` is a list of single-entry
        `{source: value}` dicts. A field missing from the payload comes back as
        `[]`. `(None, None, None)` is returned when the request does not return
        HTTP 200 or the ratings entries are malformed.
    """
    # `params` lets requests URL-encode the title; interpolating it into the URL
    # broke titles containing characters such as '&' or '#'.
    response = requests.get(
        "http://www.omdbapi.com/",
        params={"t": movie_title, "apikey": OMDB_API_KEY},
    )
    if response.status_code != 200:
        return None, None, None

    payload = response.json()  # parse the body once
    ratings = payload.get("Ratings", [])
    imdb_id = payload.get("imdbID", [])
    imdb_votes = payload.get("imdbVotes", [])
    try:
        return imdb_votes, imdb_id, [{i['Source']: i["Value"]} for i in ratings]
    except (KeyError, TypeError) as e:
        # Malformed ratings entry: report it and fall back to "nothing found".
        print(e)
        return None, None, None

In [None]:
get_rotten_tomatoes_rating("Anchorman: The Legend of Ron Burgundy")

('388,982',
 'tt0357413',
 [{'Internet Movie Database': '7.1/10'},
  {'Rotten Tomatoes': '66%'},
  {'Metacritic': '63/100'}])

In [None]:
tqdm.pandas()

In [None]:
avenger_df['imdb_votes'],avenger_df['imdb_id'],avenger_df['movie_ratings'] = zip(*list(avenger_df['title'].progress_apply(get_rotten_tomatoes_rating)))

100%|██████████| 356/356 [00:41<00:00,  8.49it/s]


In [None]:
avenger_df['movie_ratings'][0]

[{'Internet Movie Database': '6.9/10'},
 {'Rotten Tomatoes': '90%'},
 {'Metacritic': '85/100'}]

In [None]:
# Function to flatten ratings
def extract_ratings(rating_list):
    """Merge a list of single-entry rating dicts into one dict.

    Args:
        rating_list: List of `{source: value}` dicts, or None/empty when the
            lookup produced no ratings.

    Returns:
        dict: All entries merged into one mapping (a later duplicate source
        overwrites an earlier one); `{}` for None or an empty list.
    """
    rating_dict = {}
    # A failed lookup stores None instead of a list; treat it as "no ratings".
    for item in rating_list or []:
        rating_dict.update(item)
    return rating_dict

# Convert ratings to proper numeric formats
def convert_imdb(val):
    """Convert an 'x.y/10' rating string to a float; None if it cannot be parsed."""
    try:
        return float(val.split('/')[0])
    # Non-string input (None, NaN) has no usable .split; a non-numeric prefix
    # makes float() fail. Other exceptions are left to propagate.
    except (AttributeError, TypeError, ValueError):
        return None

def convert_rt(val):
    """Convert an 'NN%' rating string to an int; None if it cannot be parsed."""
    try:
        return int(val.strip('%'))
    # Non-string input (None, NaN) has no usable .strip; a non-numeric remainder
    # makes int() fail. Other exceptions are left to propagate.
    except (AttributeError, TypeError, ValueError):
        return None

def convert_mc(val):
    """Convert an 'NN/100' rating string to an int; None if it cannot be parsed."""
    try:
        return int(val.split('/')[0])
    # Non-string input (None, NaN) has no usable .split; a non-numeric prefix
    # makes int() fail. Other exceptions are left to propagate.
    except (AttributeError, TypeError, ValueError):
        return None

In [None]:
# Flatten each movie's rating list into a dict, then spread the dicts into one
# column per rating source.
ratings_expanded = avenger_df['movie_ratings'].apply(extract_ratings).apply(pd.Series)

# Parse each source's text ratings into numbers with its own converter.
for source_name, converter in (
    ('Internet Movie Database', convert_imdb),
    ('Rotten Tomatoes', convert_rt),
    ('Metacritic', convert_mc),
):
    ratings_expanded[source_name] = ratings_expanded[source_name].apply(converter)

# Swap the raw ratings column for the numeric per-source columns.
avenger_df_processed = avenger_df.drop(columns=['movie_ratings']).join(ratings_expanded)

In [None]:
avenger_df_processed.head()

Unnamed: 0,movie_id,title,budget,revenue,runtime,release_date,genres,cast_order,actor_name,imdb_votes,imdb_id,Internet Movie Database,Rotten Tomatoes,Metacritic
0,4964,Knocked Up,30000000,219900000,129,2007-06-01,"[Comedy, Romance, Drama]",2,Paul Rudd,392030,tt0478311,6.9,90.0,85.0
1,22958,The Shape of Things,0,735992,96,2003-07-24,"[Comedy, Drama, Romance]",0,Paul Rudd,11962,tt0308878,6.6,64.0,59.0
2,6575,Walk Hard: The Dewey Cox Story,35000000,18317151,96,2007-12-21,"[Comedy, Music]",68,Paul Rudd,78396,tt0841046,6.8,74.0,63.0
3,6957,The 40 Year Old Virgin,26000000,177400000,116,2005-08-11,"[Comedy, Romance]",2,Paul Rudd,482299,tt0405422,7.1,85.0,73.0
4,8699,Anchorman: The Legend of Ron Burgundy,26000000,90574188,95,2004-06-28,[Comedy],2,Paul Rudd,388982,tt0357413,7.1,66.0,63.0


In [None]:
avenger_df_processed.groupby('actor_name').agg({"budget":"mean","revenue":"mean","runtime":"mean","Internet Movie Database":"mean","Rotten Tomatoes":"mean","Metacritic":"mean"})

Unnamed: 0_level_0,budget,revenue,runtime,Internet Movie Database,Rotten Tomatoes,Metacritic
actor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Benedict Cumberbatch,99309380.0,477676200.0,123.5625,7.253125,72.387097,65.2
Chadwick Boseman,119666700.0,644572300.0,130.416667,7.166667,70.583333,61.333333
Chris Evans,111356800.0,454828900.0,116.324324,6.889189,62.2,56.513514
Chris Hemsworth,130925900.0,497878300.0,122.444444,6.911111,67.115385,59.769231
Chris Pratt,111934400.0,498414900.0,119.0625,6.69375,61.21875,56.548387
Jeremy Renner,90387930.0,398120200.0,121.185185,7.011111,70.888889,63.555556
Mark Ruffalo,75556410.0,352555800.0,119.205128,6.984615,69.810811,62.615385
Paul Rudd,70595260.0,234423100.0,108.263158,6.505263,64.027778,59.694444
Robert Downey Jr.,104287900.0,473345600.0,121.515152,6.966667,66.424242,61.69697
Scarlett Johansson,83657450.0,380980400.0,116.930233,6.995349,70.756098,63.418605


In [None]:
from bs4 import BeautifulSoup

In [None]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; AcademicBot/1.0; +http://youruniversity.edu)"
}

def get_bom_opening_weekend(imdb_id):
    """Scrape a Box Office Mojo title page for its 'Domestic Opening' amount.

    Args:
        imdb_id: IMDb title id, inserted into the page URL.

    Returns:
        The stripped text of the first `span.money` found in a matching section
        (one whose first <span> contains 'Domestic Opening'), or None when no
        section yields one.
    """
    page = requests.get(
        f"https://www.boxofficemojo.com/title/{imdb_id}/?ref_=bo_hm_rd",
        headers=HEADERS,
    )
    soup = BeautifulSoup(page.text, 'html.parser')

    # Scan every candidate section, since the page layout can vary.
    for block in soup.find_all('div', class_='a-section a-spacing-none'):
        spans = block.find_all('span')
        if not spans or 'Domestic Opening' not in spans[0].text:
            continue
        amount = block.find('span', class_='money')
        if amount:
            return amount.text.strip()

    return None

def get_opening_weekend_bom(imdb_id, title, year=None):
    """Return the opening-weekend text for a movie, or "$0" without an IMDb id.

    Args:
        imdb_id: IMDb title id; any falsy value counts as missing.
        title: Movie title, used only in the "not found" message.
        year: Accepted but not used.

    Returns:
        Whatever get_bom_opening_weekend returns for `imdb_id`, or the string
        "$0" when `imdb_id` is falsy.
    """
    if imdb_id:
        return get_bom_opening_weekend(imdb_id)

    print(f"❌ IMDb ID not found for {title}")
    return "$0"

In [None]:
avenger_df_processed['opening_weekend'] = avenger_df_processed.progress_apply(lambda x: get_opening_weekend_bom(x['imdb_id'],x['title']),axis=1)

 90%|█████████ | 322/356 [03:22<00:25,  1.36it/s]

❌ IMDb ID not found for Tinker Bell and the Pirate Fairy


100%|██████████| 356/356 [03:43<00:00,  1.59it/s]


In [None]:
def parse_money(money_str):
    """Convert a dollar string such as '$30,690,990' to an int.

    Values that are already numeric are accepted too, so re-running the
    conversion on an already-converted column is harmless.

    Args:
        money_str: A '$'-prefixed, comma-grouped amount, a number, None or ''.

    Returns:
        int, or None for None, an empty string or NaN.
    """
    if money_str is None:
        return None
    if isinstance(money_str, (int, float)):
        # NaN is the only float unequal to itself; it marks a missing value.
        return None if money_str != money_str else int(money_str)
    if not money_str:
        return None
    return int(money_str.replace('$', '').replace(',', ''))

avenger_df_processed['opening_weekend'] = avenger_df_processed['opening_weekend'].apply(parse_money)

In [None]:
avenger_df_processed.head()

Unnamed: 0,movie_id,title,budget,revenue,runtime,release_date,genres,cast_order,actor_name,imdb_votes,imdb_id,Internet Movie Database,Rotten Tomatoes,Metacritic,opening_weekend
0,4964,Knocked Up,30000000,219900000,129,2007-06-01,"[Comedy, Romance, Drama]",2,Paul Rudd,392030,tt0478311,6.9,90.0,85.0,30690990.0
1,22958,The Shape of Things,0,735992,96,2003-07-24,"[Comedy, Drama, Romance]",0,Paul Rudd,11962,tt0308878,6.6,64.0,59.0,173246.0
2,6575,Walk Hard: The Dewey Cox Story,35000000,18317151,96,2007-12-21,"[Comedy, Music]",68,Paul Rudd,78396,tt0841046,6.8,74.0,63.0,4174383.0
3,6957,The 40 Year Old Virgin,26000000,177400000,116,2005-08-11,"[Comedy, Romance]",2,Paul Rudd,482299,tt0405422,7.1,85.0,73.0,21422815.0
4,8699,Anchorman: The Legend of Ron Burgundy,26000000,90574188,95,2004-06-28,[Comedy],2,Paul Rudd,388982,tt0357413,7.1,66.0,63.0,28416365.0


In [None]:
avenger_df_processed1 = avenger_df_processed.copy()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the per-movie genre lists: fit_transform returns one indicator
# column per genre label found in this frame (mlb.classes_), and the result is
# aligned to the frame's index so it can be concatenated column-wise.
# NOTE(review): the control group is encoded with its own binarizer further down,
# so the two exports only share a genre column when that genre occurs in both
# groups - confirm downstream code aligns the columns.
mlb = MultiLabelBinarizer()
one_hot = pd.DataFrame(mlb.fit_transform(avenger_df_processed1['genres']), columns=mlb.classes_, index=avenger_df_processed1.index)

# Merge with original DataFrame
avenger_df_processed1 = pd.concat([avenger_df_processed1.drop(columns=['genres']), one_hot], axis=1)

In [None]:
avenger_df_sorted = avenger_df_processed1.sort_values(['release_date'],ascending=True)

In [None]:
# Flag rows whose TMDB id is one of the MCU releases kept in movie_df (1 = MCU).
# isin() does one vectorised membership test instead of rebuilding
# movie_df['id'].unique() for every row.
avenger_df_sorted['MCU'] = avenger_df_sorted['movie_id'].isin(movie_df['id']).astype(int)

In [None]:
def keep_first_one_only(df, group_col='actor_name', flag_col='MCU'):
    """Add an "MCU Entry" column marking the first flagged row of each group.

    "First" means first in the frame's current row order, so sort the frame
    (e.g. by release date) before calling.

    Args:
        df: Frame to annotate. It is modified in place: the "MCU Entry" column
            is added or overwritten.
        group_col: Column identifying the group.
        flag_col: Column holding the 0/1 flag.

    Returns:
        The same `df` object, with "MCU Entry" set to 1 on the first row of each
        group whose flag is 1 and to 0 everywhere else.
    """
    # Running count of flagged rows per group; it equals 1 on the first flagged
    # row and on any unflagged rows that follow it before the next flag.
    running_count = df.groupby(group_col)[flag_col].cumsum()

    # Requiring the flag itself to be 1 rules those unflagged rows out. The mask
    # is computed without a temporary column, so an existing column of df can
    # neither be clobbered nor dropped.
    df["MCU Entry"] = (df[flag_col].eq(1) & running_count.eq(1)).astype(int)
    return df

avenger_df_sorted = keep_first_one_only(avenger_df_sorted)

In [None]:
avenger_df_sorted.to_json("treatment_group_data_raw.json")

In [None]:
#We need to export data for the control group as well

# Control-group actors mapped to their TMDB person ids.
non_avengers = {
    "Ben Foster": 11107,
    "Channing Tatum": 38673,
    "Charlize Theron": 6885,
    "Edward Norton": 819,
    "Jim Carrey": 206,
    "John David Washington": 1117313,
    "Matthew Goode": 1247,
    "Ryan Reynolds": 10859,
    "Sam Worthington": 65731,
    "Steve Carell": 4495,
    "Taron Egerton": 1303037,
    "Tye Sheridan": 1034681,
}

credit_columns = ['movie_id', 'title', 'budget', 'revenue', 'runtime', 'release_date', 'genres', 'cast_order']

# For each control-group actor: fetch their filmography, tidy it, and write the
# movies with positive revenue to non_avenger_<actor_id>.json.
for actor_name, actor_id in non_avengers.items():
    acting_list = get_actor_movies_with_money_info(actor_name, actor_id)
    acted_df = pd.DataFrame(acting_list, columns=credit_columns)

    # Keep only the names of non-excluded genres; a movie left with no genre
    # becomes NaN so that the dropna below discards it.
    acted_df['genres'] = acted_df['genres'].apply(
        lambda entries: [entry['name'] for entry in entries if entry['id'] not in excluded_genres] or np.nan
    )

    # Drop every row that has any missing field, then tag the rows with the actor.
    acted_df = acted_df.dropna(how='any', axis=0).assign(actor_name=actor_name)

    acted_df.loc[acted_df['revenue'] > 0].to_json(f"non_avenger_{actor_id}.json")

For Ben Foster


100%|██████████| 58/58 [00:13<00:00,  4.34it/s]


For Channing Tatum


100%|██████████| 65/65 [00:16<00:00,  4.03it/s]


For Charlize Theron


100%|██████████| 72/72 [00:13<00:00,  5.38it/s]


For Edward Norton


100%|██████████| 72/72 [00:12<00:00,  5.61it/s]


For Jim Carrey


100%|██████████| 87/87 [00:12<00:00,  6.73it/s]


For John David Washington


100%|██████████| 17/17 [00:04<00:00,  3.90it/s]


For Matthew Goode


100%|██████████| 37/37 [00:10<00:00,  3.54it/s]


For Ryan Reynolds


100%|██████████| 95/95 [00:21<00:00,  4.52it/s]


For Sam Worthington


100%|██████████| 60/60 [00:13<00:00,  4.40it/s]


For Steve Carell


100%|██████████| 72/72 [00:16<00:00,  4.36it/s]


For Taron Egerton


100%|██████████| 23/23 [00:04<00:00,  4.62it/s]


For Tye Sheridan


100%|██████████| 30/30 [00:08<00:00,  3.44it/s]


In [None]:
all_data = []  # not read anywhere in the visible cells; kept in case later code expects the name

# Load every control-group export into one frame. Frames are collected in a list
# and concatenated once: cheaper than concatenating inside the loop, and it
# avoids seeding the result with an empty DataFrame.
non_avenger_frames = []
# sorted() makes the row order independent of the filesystem's listing order.
for file_path in sorted(glob.glob('non_avenger_*.json')):
    try:
        with open(file_path, 'r') as f:
            non_avenger_frames.append(pd.DataFrame(json.load(f)))
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in file: {file_path}")

# With no readable files the result is an empty DataFrame, as before.
non_avenger_df = (
    pd.concat(non_avenger_frames, ignore_index=True) if non_avenger_frames else pd.DataFrame()
)

In [None]:
non_avenger_df['imdb_votes'],non_avenger_df['imdb_id'],non_avenger_df['movie_ratings'] = zip(*list(non_avenger_df['title'].progress_apply(get_rotten_tomatoes_rating)))

100%|██████████| 314/314 [00:45<00:00,  6.91it/s]


In [None]:
# Flatten each movie's rating list into a dict, then spread the dicts into one
# column per rating source.
ratings_expanded = non_avenger_df['movie_ratings'].apply(extract_ratings).apply(pd.Series)

# Parse each source's text ratings into numbers with its own converter.
for source_name, converter in (
    ('Internet Movie Database', convert_imdb),
    ('Rotten Tomatoes', convert_rt),
    ('Metacritic', convert_mc),
):
    ratings_expanded[source_name] = ratings_expanded[source_name].apply(converter)

# Swap the raw ratings column for the numeric per-source columns.
non_avenger_df_processed = non_avenger_df.drop(columns=['movie_ratings']).join(ratings_expanded)

In [None]:
non_avenger_df_processed.groupby('actor_name').agg({"budget":"mean","revenue":"mean","runtime":"mean","Internet Movie Database":"mean","Rotten Tomatoes":"mean","Metacritic":"mean"})

Unnamed: 0_level_0,budget,revenue,runtime,Internet Movie Database,Rotten Tomatoes,Metacritic
actor_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ben Foster,36132370.0,73654740.0,109.444444,6.651852,57.458333,55.36
Channing Tatum,54600000.0,168102000.0,111.6875,6.441667,59.8125,58.446809
Charlize Theron,76103450.0,230943500.0,111.724138,6.558621,58.068966,57.655172
Edward Norton,50046810.0,107025500.0,113.2,6.916667,65.04,63.916667
Jim Carrey,66762130.0,197049800.0,103.0,6.372222,50.5,52.352941
John David Washington,79160000.0,121102100.0,129.6,7.02,66.25,66.0
Matthew Goode,24662500.0,49256280.0,112.833333,6.781818,60.65,57.789474
Ryan Reynolds,63353270.0,201727800.0,107.681818,6.609091,50.604651,48.238095
Sam Worthington,61846150.0,272781800.0,116.5,6.242308,49.846154,51.636364
Steve Carell,48964100.0,239429000.0,103.846154,6.610256,62.641026,59.512821


In [None]:
non_avenger_df_processed['opening_weekend'] = non_avenger_df_processed.progress_apply(lambda x: get_opening_weekend_bom(x['imdb_id'],x['title']),axis=1)

  2%|▏         | 5/314 [00:02<02:59,  1.72it/s]

❌ IMDb ID not found for Lemony Snicket's A Series of Unfortunate Events


 82%|████████▏ | 258/314 [02:46<00:29,  1.88it/s]

❌ IMDb ID not found for South from Granada


100%|██████████| 314/314 [03:15<00:00,  1.60it/s]


In [None]:
non_avenger_df_processed['opening_weekend'] = non_avenger_df_processed['opening_weekend'].apply(parse_money)

In [None]:
non_avenger_df_processed1 = non_avenger_df_processed.copy()

In [None]:
# One-hot encode the control group's genre lists: fit_transform returns one
# indicator column per genre label found in this frame (mlb1.classes_), aligned
# to the frame's index so it can be concatenated column-wise.
# NOTE(review): this binarizer is fitted separately from the treatment group's
# `mlb`, so the two exports only share a genre column when that genre occurs in
# both groups - confirm downstream code aligns the columns.
mlb1 = MultiLabelBinarizer()
one_hot1 = pd.DataFrame(mlb1.fit_transform(non_avenger_df_processed1['genres']), columns=mlb1.classes_, index=non_avenger_df_processed1.index)

# Merge with original DataFrame
non_avenger_df_processed1 = pd.concat([non_avenger_df_processed1.drop(columns=['genres']), one_hot1], axis=1)

In [None]:
non_avenger_df_sorted = non_avenger_df_processed1.sort_values(['release_date'],ascending=True)

In [None]:
# Every control-group row gets MCU = 0. The previous row-wise conditional
# returned 0 on both branches, so a constant assignment is equivalent.
# NOTE(review): if a control actor has a film whose id is in movie_df['id'], it
# is still flagged 0 here - confirm that forcing the flag off is intended.
non_avenger_df_sorted['MCU'] = 0

In [None]:
non_avenger_df_sorted = keep_first_one_only(non_avenger_df_sorted)

In [None]:
non_avenger_df_sorted.to_json("control_group_data_raw.json")