# Scraping 2021 movie releases

Marianne's code starts here

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import requests
import numpy as np

In [None]:
# Wikipedia page listing the 2021 American film releases we scrape.
url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2021'

# Browser-like request headers; mimicking a web browser avoids a 403 response.
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

In [None]:
# Download the page with the browser-like headers, then let pandas parse
# every HTML table on it.
from io import StringIO

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, headers=header, timeout=30)

# Fail loudly on an HTTP error instead of trying to parse an error page.
r.raise_for_status()

# Wrap the markup in StringIO: passing literal HTML text to read_html is
# deprecated in recent pandas releases.
tables = pd.read_html(StringIO(r.text))

In [None]:
# Pick out the four quarterly tables (positions 1 through 4 of the parsed list).
q1, q2, q3, q4 = tables[1], tables[2], tables[3], tables[4]

In [None]:
# Stack the four quarterly tables into one dataframe.
all_quarters = [q1, q2, q3, q4]

# ignore_index=True renumbers the rows 0..n-1; without it every quarter keeps
# its own 0-based labels and the combined frame has repeated row labels.
future_movies = pd.concat(all_quarters, ignore_index=True)

In [None]:
# Show the (rows, columns) size of the combined table.
future_movies.shape

In [None]:
# Discard the columns that are not used further down.
# errors='ignore' skips any label that is absent, so the cell does not fail
# when a header variant listed here (e.g. 'Ref' vs 'Ref.') is missing.
future_movies = future_movies.drop(
    columns=['Opening', 'Opening.1', 'Production company', 'Ref', 'Ref.'],
    errors='ignore',
)

In [None]:
# Keep only the rows that actually list cast and crew, then show the new size.
future_movies = future_movies.loc[future_movies['Cast and crew'].notna()]
future_movies.shape

In [None]:
# Strip parenthesised notes such as "(director)". The pattern is a regular
# expression, so regex=True is passed explicitly: pandas 2.0 changed the
# default of Series.str.replace to literal matching, which would leave the
# text untouched.
# NOTE: ".*" is greedy, so everything from the first "(" to the last ")" in a
# cell is removed, including any names sitting between two parenthesised notes.
future_movies['Cast and crew'] = future_movies['Cast and crew'].str.replace(
    r"\(.*\)", "", regex=True
)

# Turn "; " separators into ", " so one comma split later yields every name.
future_movies['Cast and crew'] = future_movies['Cast and crew'].str.replace(
    '; ', ', ', regex=False
)

# Preview the cleaned table.
future_movies.head()

Marianne's code ends here

# Scraping genre for 2021 movie releases

Nicole's code starts here

In [None]:
import time

In [None]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [None]:
# Import Splinter and set the chromedriver path
from splinter import Browser
def init_browser():
    """Return a Splinter Chrome browser (not headless) using the local chromedriver."""
    return Browser(
        'chrome',
        executable_path='c:/bin/chromedriver',
        headless=False,
    )

In [None]:
# Visit the URL & scrape
def scrape_imdb():
    """Load the IMDb list page in a browser and return its list-item blocks.

    Returns:
        The BeautifulSoup result set of every
        ``<div class="lister-item-content">`` element found on the page.
    """
    imdb_url = 'https://www.imdb.com/list/ls070080072/'
    browser = init_browser()
    try:
        browser.visit(imdb_url)
        # Pause briefly before reading the page source.
        time.sleep(2)
        page_html = browser.html
    finally:
        # Always close the browser, even if the visit fails, so no Chrome
        # window or driver process is left behind.
        browser.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    return soup.find_all('div', class_='lister-item-content')

In [None]:
# Collect the title of each scraped entry: the link text inside its <h3>.
title_list = [entry.find('h3').a.text for entry in scrape_imdb()]
title_list

In [None]:
# Collect the genre text of each scraped entry. Some entries have no genre
# <span>; those get the placeholder string 'Null'.
genre_list = []
for result in scrape_imdb():
    genre_tag = result.find('span', class_='genre')
    if genre_tag is None:
        # Explicit check instead of a bare `except:`, so unrelated errors
        # (and KeyboardInterrupt) are no longer silently swallowed.
        genre_list.append('Null')
    else:
        # strip() removes the surrounding whitespace and newlines; the former
        # replace('', '') call was a no-op and has been dropped.
        genre_list.append(genre_tag.text.strip())
genre_list

In [None]:
# Visualize the scraped data: pair every title with its genre in a dataframe.
scraped_columns = {'Title': title_list, 'genre': genre_list}
movie2021 = pd.DataFrame(scraped_columns)
print(movie2021.shape)
movie2021.head()

Nicole's code ends here; Marianne's code resumes here

In [None]:
# Combine the two dataframes on the movie title: every Wikipedia row is kept
# and the IMDb genre is attached wherever the title matches.
wiki_by_title = future_movies.set_index('Title')
imdb_by_title = movie2021.set_index('Title')
all_data = wiki_by_title.join(imdb_by_title)

In [None]:
# Keep only the movies for which a genre was found, then show the new size.
all_data = all_data.loc[all_data['genre'].notna()]
all_data.shape

In [None]:
# Preview the first rows of the merged data.
all_data.head()

# Preparing the data for the neural network

To work with our neural network, the new data must have exactly the same columns as the data that was used to train the model.

In [None]:
# Load the dataset that was used to train/test the model.
movies = pd.read_csv(filepath_or_buffer='moviesClean.csv')

In [None]:
# Build an empty frame that keeps the training data's columns but no rows.
# .copy() detaches it from `movies`, so the column drops and assignments made
# later operate on an independent frame rather than on a slice of `movies`
# (which can trigger SettingWithCopyWarning).
empty_movies = movies.iloc[0:0].copy()
empty_movies.head()

In [None]:
# Remove the columns that the model isn't expecting.
# The result is reassigned rather than dropped with inplace=True, which avoids
# mutating a frame that was created as a slice of `movies`.
empty_movies = empty_movies.drop(
    columns=['original_title', 'genre', 'revenue_percent', 'budget', 'metascore',
             'worlwide_gross_income', 'director', 'actors', 'success']
)
empty_movies.head()

In [None]:
# create our new dataframe for predictions
# NOTE: this binds a second name to the same DataFrame object as
# `empty_movies`; no copy is made here.
predictions = empty_movies

In [None]:
# Copy the genre and cast/crew columns from the merged data into the frame.
# `predictions` has no rows at this point; the code relies on pandas adopting
# the index of the assigned Series (the movie titles) as the row index.
for column in ('genre', 'Cast and crew'):
    predictions[column] = all_data[column]

In [None]:
# Preview the frame after adding the genre and cast/crew columns.
predictions.head()

In [None]:
# Every missing value becomes 0.
predictions = predictions.fillna(value=0)

In [None]:
# Every movie in this frame gets the same release year.
RELEASE_YEAR = 2021
predictions['year'] = RELEASE_YEAR
predictions.head()

Set an actor's or director's column to "1" for each movie that person worked on.

In [None]:
# Start with an empty name -> count mapping for the cast and crew.
cast_crew_dict = dict()

In [None]:
# Tally how many movies each person appears in.
for cast_and_crew in predictions['Cast and crew']:
    # names are comma-separated; trim the whitespace around each one
    for person in (part.strip() for part in cast_and_crew.split(',')):
        # first sighting starts from 0, then every sighting adds one
        cast_crew_dict[person] = cast_crew_dict.get(person, 0) + 1
          

In [None]:
# Report how many distinct names were collected.
total_cast_crew = len(cast_crew_dict)
cast_crew_summary = f"Found a total of {total_cast_crew} cast and crew"
print(cast_crew_summary)

In [None]:
# Set a person's column to 1 for every movie that person worked on.
#
# Only columns that already exist in the frame are touched. Writing to a label
# that is not a column yet would create a new, mostly-NaN column for every
# unknown person, which is slow and adds columns the frame did not have.
known_columns = set(predictions.columns)

# Walk down the dataframe, movie by movie ...
for title, cast_and_crew in predictions['Cast and crew'].items():

    # for each comma-separated name listed for this movie
    for name in cast_and_crew.split(','):

        # remove any whitespace from the name
        name = name.strip()

        # mark the person only when a column for them already exists
        if name in known_columns:
            predictions.loc[title, name] = 1

In [None]:
# Preview the frame after flagging cast and crew.
predictions.head()

In [None]:
# Discard every column that still contains a missing value; this removes the
# actors/directors that the model isn't trained for.
predictions = predictions.dropna(axis='columns')

In [None]:
# Preview the frame after dropping columns with missing values.
predictions.head()

In [None]:
# check for a specific actor: count how often each value occurs in that column
# NOTE: raises KeyError if the frame has no 'Tom Cruise' column.
predictions['Tom Cruise'].value_counts()

In [None]:
# Same check for another actor; raises KeyError if the column does not exist.
predictions['Scarlett Johansson'].value_counts()

Set a genre's column to "1" for each movie that belongs to that genre.

In [None]:
# Start with an empty genre -> count mapping.
genre_dictionary = dict()

In [None]:
# Tally how many movies fall under each genre.
for genre_text in predictions['genre']:
    # genres are comma-separated; trim the whitespace around each one
    for genre_name in (part.strip() for part in genre_text.split(',')):
        # first sighting starts from 0, then every sighting adds one
        genre_dictionary[genre_name] = genre_dictionary.get(genre_name, 0) + 1

In [None]:
# Report how many distinct genres were collected.
total_genres = len(genre_dictionary)
genre_summary = f"Found a total of {total_genres} genres"
print(genre_summary)

In [None]:
# Set a genre's column to 1 for every movie that belongs to that genre.
#
# Only columns that already exist in the frame are touched. Writing to a label
# that is not a column yet (for example the 'Null' placeholder used when no
# genre was scraped) would add a new column holding NaN for all other movies,
# and nothing after this cell removes such columns before the CSV is written.
known_columns = set(predictions.columns)

for title, genre_text in predictions['genre'].items():

    # for each comma-separated genre listed for this movie
    for genre in genre_text.split(','):
        genre = genre.strip()

        if genre in known_columns:
            predictions.loc[title, genre] = 1

In [None]:
# The two free-text helper columns are no longer needed.
predictions = predictions.drop(['genre', 'Cast and crew'], axis=1)

In [None]:
# Preview the final frame before it is written to disk.
predictions.head()

In [None]:
# Write the frame (row index included) to a CSV for the machine learning model.
output_path = '2021_predictions.csv'
predictions.to_csv(output_path, index=True)