In [1]:
import pandas as pd
from sqlalchemy import create_engine
import requests
import numpy as np

In [2]:
# This is the website where we pull the information for 2021 new releases
url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2021'

# These headers will allow us to avoid a 403 error by mimicing a web browser
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

In [3]:
# format the request to mimic a web browser
r = requests.get(url, headers=header)

# import tables using pandas
tables = pd.read_html(r.text)

In [4]:
# read each of the quarterly tables into dataframes

q1 = tables[1]
q2 = tables[2]
q3 = tables[3]
q4 = tables[4]

In [5]:
# concatenate each of the quarterly movie tables into one dataframe
all_quarters = [q1, q2, q3, q4]

future_movies = pd.concat(all_quarters)

In [6]:
future_movies.shape

(95, 7)

In [7]:
future_movies.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref,Ref.
0,JANUARY,1,We Can Be Heroes,Netflix / Troublemaker Studios,Robert Rodriguez (director/screenplay); Priyan...,[1],
1,JANUARY,8,The Devil's Light,Lionsgate,Daniel Stamm (director); Robert Zappia (screen...,[2],
2,JANUARY,15,Wrath of Man,Metro-Goldwyn-Mayer / Miramax,Guy Ritchie (director/screenplay); Jason Stath...,[3],
3,JANUARY,22,Chaos Walking,Lionsgate,"Doug Liman (director); Patrick Ness, Christoph...",[4],
4,JANUARY,22,Nine Days,Sony Pictures Classics,"Edson Oda (director/screenplay); Winston Duke,...",[5],


In [8]:
future_movies = future_movies[future_movies['Cast and crew'].notnull()]
future_movies.shape

(94, 7)

In [9]:
# remove parentheses and all text between them
future_movies['Cast and crew'] = future_movies['Cast and crew'].str.replace(r"\(.*\)","")

# convert semicolon to comma for easier splitting
future_movies['Cast and crew'] = future_movies['Cast and crew'].str.replace('; ', ', ')

# view the results
future_movies.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref,Ref.
0,JANUARY,1,We Can Be Heroes,Netflix / Troublemaker Studios,"Robert Rodriguez , Priyanka Chopra, Pedro Pasc...",[1],
1,JANUARY,8,The Devil's Light,Lionsgate,"Daniel Stamm , Virginia Madsen, Ben Cross, Col...",[2],
2,JANUARY,15,Wrath of Man,Metro-Goldwyn-Mayer / Miramax,"Guy Ritchie , Jason Statham, Holt McCallany, S...",[3],
3,JANUARY,22,Chaos Walking,Lionsgate,"Doug Liman , Daisy Ridley, Tom Holland, Mads M...",[4],
4,JANUARY,22,Nine Days,Sony Pictures Classics,"Edson Oda , Winston Duke, Zazie Beetz, Benedic...",[5],


In [10]:
# define empty dictionary for the cast and crew data
cast_crew_dict = {}

In [11]:
# Walk down the dataframe, movie by movie 
for index, row in future_movies.iterrows():
    
    # read the list of actors, splitting them at the comma 
    crew_names = row['Cast and crew'].split(',')
    
    # for each actor in the list (up to the maximum desired)
    for name in crew_names:
        
        # remove any whitespace from the name
        name = name.strip()
        
        # if the actor is already in the dictionary then
        # simply increase the count. Otherwise, add the actor
        # and set the count to 1. 
        if name in cast_crew_dict:
            cast_crew_dict[name] += 1
        else:
            cast_crew_dict[name] = 1

In [12]:
# Count the total number of actors found
total_cast_crew = len(cast_crew_dict)
print(f"Found a total of {total_cast_crew} cast and crew")

Found a total of 618 cast and crew


In [13]:
# Add one new column to the dataframe for each actor found,
# and initialize that new column with 0s.
for cast in cast_crew_dict:
    future_movies[cast] = np.zeros(future_movies.shape[0])

In [14]:
# Walk down the dataframe, movie by movie ...
for index, row in future_movies.iterrows():
    
    # read the list of actors 
    crew_names = row['Cast and crew'].split(',')
    
    # for each actor in the list (up to the maximum desired)
    for crew in crew_names: 
                
        # remove any whitespace from the name
        crew = crew.strip()
                
        # then indicate that the actor starred in this movie
        future_movies.loc[index, crew] = 1

In [15]:
future_movies['Tom Cruise'].value_counts()

0.0    86
1.0     8
Name: Tom Cruise, dtype: int64

In [19]:
future_movies = future_movies.drop(columns=['Opening', 'Opening.1', 'Production company', 'Cast and crew', 'Ref', 'Ref.'])
future_movies.head()

Unnamed: 0,Title,Robert Rodriguez,Priyanka Chopra,Pedro Pascal,YaYa Gosselin,Sung Kang,Boyd Holbrook,Taylor Dooley,J.J. Dashnaw,Christian Slater,...,Matthew McConaughey,Reese Witherspoon,Seth MacFarlane,Taron Egerton,Tori Kelly,Mélanie Laurent,Dakota Fanning,Elle Fanning,Damien Chazelle,Brad Pitt
0,We Can Be Heroes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Devil's Light,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Wrath of Man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chaos Walking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Nine Days,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
future_movies.to_csv('testing2021.csv', index=False)

In [None]:
# This is the website where we pull the information for 2021 new releases
new_url = 'https://www.imdb.com/list/ls093883812/?sort=list_order,asc&st_dt=&mode=detail&page=1&ref_=ttls_vm_dtl'

# These headers will allow us to avoid a 403 error by mimicing a web browser
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

In [None]:
# format the request to mimic a web browser
r2 = requests.get(new_url, headers=header)

# import tables using pandas
newtables = pd.read_html(r2.text)

In [None]:
newtables[1]