# Part 1 - Web Scraping

This notebook holds the function that scrapes details from the 10,000 movie links listed as "Musical" on IMDb.

## For the first part of my project, I built a function to scrape a web page featuring 10,000 movie links for the data contained within the page (see README.md for more info) 

In [None]:
# when we make this into a function, got to account for duplicates - if the URL has already been added, PASS.
# started at 10/1/20, 11:55pm. ended at 10/2/20, 2:21am (roughly 2.5 hrs) - 9950 movies (first 50 already done)


import requests as requests
import re
from bs4 import BeautifulSoup
import numpy as np

# Note: only including first 10k results, the last ~200 results were not adapted to or from stage 

# per_50_url_list holds one search-results URL per listing page of "Musical"
# movies. Each page displays 50 movies, and the `start` query parameter is
# the 1-based rank of the first movie on that page (1, 51, 101, ..., 9951),
# giving 200 pages in total.
per_50_url_list = [
    'https://www.imdb.com/search/title/?title_type=movie&genres=musical&start='
    + str(first_rank)
    + '&explore=title_type,genres&ref_=adv_nxt'
    for first_rank in range(1, 200 * 50 + 1, 50)
]
    
# master_url_list is a list of lists: for every listing page in
# per_50_url_list it stores the list of movie URLs that
# scrape_urls_from_50_per_page returned for that page.
master_url_list = [
    scrape_urls_from_50_per_page(listing_url)
    for listing_url in per_50_url_list
]

# One accumulator list per scraped feature. The scraping loop appends exactly
# one entry per movie URL to each of them.

# Title and runtime (in minutes) of each movie/URL.
page_title_list, page_runtime_list = [], []
# "Star(s):" and "Director(s):" entries of each movie/URL.
page_stars_list, page_directors_list = [], []
# Release date, filming location and "Also Known As" entry.
page_release_list, page_location_list, page_aka_list = [], [], []
# Oscar nominations and genres.
page_awards_list, page_genres_list = [], []
# Budget, domestic (USA) gross and cumulative worldwide gross.
page_budget_list, page_grossUSA_list, page_worldwide_list = [], [], []
# MPAA rating.
page_rating_list = []


# ind_master_url_list is the flat list of all unique movie URLs in this genre,
# in listing order. The sublists of master_url_list are flattened and a URL
# that has already been added is skipped, so no movie page is requested (and
# no row recorded) twice.
seen_urls = set()
ind_master_url_list = []

for fifty_list in master_url_list:
    for movie_url in fifty_list:
        if movie_url in seen_urls:
            continue
        seen_urls.add(movie_url)
        ind_master_url_list.append(movie_url)


# ---------------------------------------------------------------------------
# Helpers that pull one feature out of a parsed movie page. Each extractor
# raises when the page does not contain the feature; _extract_or converts that
# into a placeholder so every page_*_list receives exactly one entry per page.
# ---------------------------------------------------------------------------

# Patterns are compiled once instead of once per page.
_STAR_LABEL = re.compile('Star')
_DIRECTOR_LABEL = re.compile('Director')
_HOURS_PATTERN = re.compile(r'(\d+)\s*h')
_MINUTES_PATTERN = re.compile(r'(\d+)\s*min')


def _extract_or(missing, extractor, *args):
    """Return extractor(*args), or `missing` if the extractor raises.

    Exception is caught instead of using a bare ``except`` so that
    KeyboardInterrupt and SystemExit can still stop a long scrape.
    """
    try:
        return extractor(*args)
    except Exception:
        return missing


def _section_html(soup, label):
    """Return the raw HTML after the <h4> matching `label`, up to the first '<span'.

    `label` is the heading text or a compiled pattern. Raises AttributeError
    when no such <h4> exists and IndexError when the enclosing <div> markup
    contains no '</h4>'.
    """
    heading = soup.find('h4', text=label)
    section = str(heading.find_parent('div'))
    return section.split('</h4>')[1].split('<span')[0]


def _section_text(soup, label):
    """Return _section_html cut at the first '</div>' and stripped of whitespace."""
    return _section_html(soup, label).split('</div>')[0].strip()


def _linked_names(soup, label):
    """Return the link texts (e.g. star or director names) of a section as a list."""
    pieces = _section_text(soup, label).split('">')
    # pieces[0] is the markup before the first link text, so it is dropped.
    return [piece.split('</a>')[0] for piece in pieces[1:]]


def _title(soup):
    """Return the <title> text up to (not including) the first ' ('."""
    return str(soup.find_all('title')[0]).split('<title>')[1].split(' (')[0]


def _runtime_minutes(soup):
    """Return the runtime as a whole number of minutes.

    Accepts strings such as '2h', '9min', '45min' and '2h 45min' (the formats
    the original length-based parsing targeted) and also longer values such
    as '100min' that the length checks mis-handled.

    NOTE(review): the original cell parsed a `runtime` string but never
    assigned it, so the lookup raised NameError (silently swallowed) unless a
    stale global happened to exist. Reading it from the first <time> tag is
    an assumption about the page markup - confirm against a live page. If the
    tag is absent this raises and 'Not found' is recorded, exactly as before.
    """
    runtime = soup.find('time').text.strip()
    hours = _HOURS_PATTERN.search(runtime)
    minutes = _MINUTES_PATTERN.search(runtime)
    if hours is None and minutes is None:
        raise ValueError('unrecognised runtime: ' + repr(runtime))
    total_minutes = 0
    if hours is not None:
        total_minutes += int(hours.group(1)) * 60
    if minutes is not None:
        total_minutes += int(minutes.group(1))
    return total_minutes


def _filming_location(soup):
    """Return the text of the first link in the 'Filming Locations:' section."""
    return _section_text(soup, 'Filming Locations:').split('">')[1].split('</a>')[0]


def _oscar_nominations(soup):
    """Return the text on the line after 'Nominated for' in the awards blurb."""
    blurb = str(soup.find('span', class_='awards-blurb'))
    return blurb.split('Nominated for')[1].split('\n')[1].strip()


def _genres(soup):
    """Return the list of genres found in the page's ld+json <script> text."""
    metadata = str(soup.find_all('script', type="application/ld+json"))
    lines = metadata.split("genre")[1].split("  ]")[0].split("\n")
    # The first and last fragments surround the genre entries and are dropped.
    lines.remove(lines[0])
    lines.remove(lines[len(lines) - 1])
    return [line.strip().replace('"', '').replace(',', '') for line in lines]


def _mpaa_rating(soup):
    """Return the second line of the 'subtext' block, stripped."""
    return soup.find('div', class_='subtext').text.split('\n')[1].strip()


# Requests each individual page in ind_master_url_list and appends exactly one
# value to every feature list, so index i of each page_*_list describes
# ind_master_url_list[i]. A feature that cannot be extracted is recorded as
# 'Not found' ('None' for Oscar nominations).
for movie_page in ind_master_url_list:
    movie = requests.get(movie_page)
    movie_soup = BeautifulSoup(movie.text, 'html5lib')

    # TITLE
    page_title_list.append(_extract_or('Not found', _title, movie_soup))

    # RUNTIME (total minutes)
    page_runtime_list.append(_extract_or('Not found', _runtime_minutes, movie_soup))

    # STAR(S) and DIRECTOR(S): lists of names
    page_stars_list.append(
        _extract_or('Not found', _linked_names, movie_soup, _STAR_LABEL))
    page_directors_list.append(
        _extract_or('Not found', _linked_names, movie_soup, _DIRECTOR_LABEL))

    # RELEASE DATE
    page_release_list.append(
        _extract_or('Not found', _section_text, movie_soup, 'Release Date:'))

    # FILMING LOCATION
    page_location_list.append(_extract_or('Not found', _filming_location, movie_soup))

    # ALSO KNOWN AS
    # helpful feature, based on title changes between adaptations
    page_aka_list.append(
        _extract_or('Not found', _section_text, movie_soup, 'Also Known As:'))

    # OSCAR NOMINATION
    # this number represents number of Oscar nominations
    page_awards_list.append(_extract_or('None', _oscar_nominations, movie_soup))

    # GENRE
    # A list of genres (films can have several). The original left this
    # section unguarded, so one page without the expected metadata aborted the
    # whole run; it now records 'Not found' like the other features.
    page_genres_list.append(_extract_or('Not found', _genres, movie_soup))

    # BUDGET
    # Kept un-stripped (raw section HTML), matching the values produced before.
    page_budget_list.append(
        _extract_or('Not found', _section_html, movie_soup, 'Budget:'))

    # GROSS USA - aka domestic gross
    page_grossUSA_list.append(
        _extract_or('Not found', _section_text, movie_soup, 'Gross USA:'))

    # WORLDWIDE GROSS
    page_worldwide_list.append(
        _extract_or('Not found', _section_text, movie_soup, 'Cumulative Worldwide Gross:'))

    # MPAA rating
    page_rating_list.append(_extract_or("Not found", _mpaa_rating, movie_soup))
        


## The loop above relies on a helper function that scrapes the movie URLs from each master page of 50 (or more) URLs

In [125]:
def scrape_urls_from_50_per_page(url, max_results=50):
    """Return the full movie-page URLs linked from one listing page.

    Parameters
    ----------
    url : str
        Address of the listing page (a page displaying many movies at once).
    max_results : int or None, optional
        Upper bound on the number of movie links collected. Defaults to 50,
        the number of entries the original implementation read; pass None to
        collect every entry on the page.

    Returns
    -------
    list of str
        One URL per 'lister-item-header' entry, in page order. A page with
        fewer entries than `max_results` yields a shorter list instead of
        raising IndexError.
    """
    # retrieves the larger page, listing 50 movies at a time
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html5lib')

    # Query the headers once; previously find_all was re-run for every index.
    headers = page_soup.find_all('h3', class_="lister-item-header")
    if max_results is not None:
        headers = headers[:max_results]

    # Plain concatenation is kept on purpose: an href such as
    # '/title/tt8503618/' already starts with '/', so the result contains
    # 'imdb.com//title/...' exactly like the URLs this function produced before.
    base_url = 'https://www.imdb.com/'
    page_url_list = []

    for header in headers:
        link = header.find('a', href=True)
        if link is None:
            # Header without a link: nothing to scrape for this entry.
            continue
        page_url_list.append(base_url + link['href'])

    return page_url_list

In [47]:
# INDIVIDUAL PAGE - SCRAPE URL EXTENSION
# Takes the second 'lister-item-header' entry of page_soup and slices the link
# target (url) and the link text (title) out of its raw HTML.

header_html = str(page_soup.find_all('h3', class_="lister-item-header")[1])
values = header_html.split('a href=')
info = values[1].split('>')
url = info[0].strip('"')
title = info[1].split('<')[0]

url

'/title/tt8503618/'

## This data led to two scraped data sets, which led to the second notebook, 2_Table_Merge_and_Feature_Engineering

In [5]:
import pandas as pd

# Load the originally scraped table and flag, in a 'new_scrape' column, every
# row whose FILM_mpaa_rating is missing; keep the FILM_tconst values of the
# flagged rows.
musical_movies = pd.read_csv('ORIGINAL_musical_movies_10520.csv')
musical_movies['new_scrape'] = musical_movies['FILM_mpaa_rating'].isna()
new_scrape = musical_movies.loc[musical_movies['new_scrape']]
tconst = new_scrape['FILM_tconst']


In [7]:
# tconst_list aliases the FILM_tconst values selected in the previous cell;
# each one is turned into a title-page URL below.
tconst_list = tconst

import requests as requests
import re
from bs4 import BeautifulSoup
import numpy as np
from scrape_urls_from_50_movie_list_page import scrape_urls_from_50_movie_list_page
import pandas as pd

# url_list holds the title-page URL built from every identifier in tconst_list.
url_list = ['https://www.imdb.com/title/' + tconst_id for tconst_id in tconst_list]
# Only the release date is collected in this pass: one entry per URL in
# url_list, or 'Not found' when the page has no usable 'Release Date:' section.
page_release_list = []

# requests soup for each individual page in url_list, and extracts the release date
for movie_page in url_list:
    movie = requests.get(movie_page)
    movie_soup = BeautifulSoup(movie.text, 'html5lib')

    # RELEASE DATE
    try:
        release = movie_soup.find('h4', text='Release Date:')
        release = release.find_parent('div')
        release = str(release).split('</h4>')
        release = release[1].split('<span')[0]
        release = release.split('</div>')[0].strip()
    except (AttributeError, IndexError):
        # AttributeError: no 'Release Date:' heading on the page;
        # IndexError: the surrounding markup has no '</h4>' to split on.
        # Narrower than a bare except, so Ctrl-C still interrupts the scrape.
        release = 'Not found'
    page_release_list.append(release)

page_release_list

['18 October 1961 (USA)',
 '10 December 2021 (USA)',
 '25 November 1992 (USA)',
 'Not found',
 '25 December 2000 (UK)',
 '25 December 1952 (Denmark)',
 '1967 (USA)',
 '24 December 1974 (UK)',
 '6 February 2020 (Egypt)',
 '1963 (Australia)',
 '26 December 1951 (UK)',
 '26 December 2001 (UK)',
 '26 December 1957 (UK)',
 '31 December 1960 (UK)',
 '24 May 2019 (USA)',
 '25 December 1971 (UK)',
 '2017 (UK)',
 '4 July 2020 (USA)',
 '23 December 1927 (USA)',
 '24 January 2003 (USA)',
 '29 April 2011 (Germany)',
 'Not found',
 'Not found',
 '2 May 2019 (USA)',
 '18 June 1982 (USA)',
 '19 December 2014 (USA)',
 'Not found',
 '15 November 2015 (USA)',
 'Not found',
 '28 December 1914 (USA)',
 '4 March 1950 (USA)',
 '1963 (UK)',
 '22 February 1965 (USA)',
 'May 1977 (USA)',
 '2 November 1997 (USA)',
 '31 March 1957 (USA)',
 '1 July 1937 (France)',
 '17 December 2001 (USA)',
 '2 January 2000 (UK)',
 '25 December 1970 (UK)',
 '1974 (Philippines)',
 '2018 (USA)',
 '5 February 2021 (USA)',
 '23 Decem

### Save df as csv

In [10]:
# Writes the DataFrame `df` to 'ORIGINAL.new_scrape_release_dates.csv'.
# NOTE(review): `df` is not defined in any cell shown here; presumably it was
# built from page_release_list (and the tconst values) - confirm before re-running.
df.to_csv('ORIGINAL.new_scrape_release_dates.csv')