 # Movies Data Scraper

 This notebook scrapes data about movies from the site https://www.the-numbers.com/ using Python 3.
 
 It then loads only the relevant data into a pandas DataFrame.
 
 Lastly, the DataFrame is saved locally as a pickle file.

In [None]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import datetime
import csv
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

 First, scrape the data from every movie page (more than 16k pages) and save each page as a local HTML file.

In [None]:
def scrape_all_movies_by_year(year, DIR):
    """Download every movie page linked from one year's top-grossing chart.

    Fetches the yearly chart page, collects its table rows and hands each
    remaining row to ``scrape_movie_info``, which stores the linked movie
    page inside a per-year sub-folder of ``DIR``.

    Args:
        year: Market year used to build the chart URL and the folder name.
        DIR: Root output folder; pages are saved under ``DIR\\<year>``.

    Raises:
        requests.HTTPError: If the chart page answers with an error status.
    """
    url = f"https://www.the-numbers.com/market/{year}/top-grossing-movies"
    r = requests.get(url)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the chart.
    r.raise_for_status()
    rows = BeautifulSoup(r.content, 'html.parser', from_encoding='utf-8').find_all('tr')
    # Drop the first row and the last two rows (presumably the header and
    # summary rows -- confirm against the live page). Slicing, unlike
    # repeated pop(), simply yields nothing when the page has too few rows.
    rows = rows[1:-2]

    # Windows-style separator kept on purpose: the rest of the notebook
    # builds and reads these paths the same way.
    year_dir = f"{DIR}\\{year}"
    # exist_ok lets the notebook be re-run without crashing on folders
    # created by an earlier run.
    os.makedirs(year_dir, exist_ok=True)

    for item in rows:
        scrape_movie_info(item, year_dir)
        
        
def scrape_movie_info(item, year_dir):
    """Download one movie page and store it as a text file in ``year_dir``.

    Args:
        item: Chart table row. The ``href`` of its first link is appended to
            the site root to form the movie URL, and the text of its first
            ``<td>`` cell becomes the file name.
        year_dir: Existing folder that receives the ``.txt`` file.

    Raises:
        requests.HTTPError: If the movie page answers with an error status.
    """
    movie_url = f"https://www.the-numbers.com{item.a['href']}"
    movie_r = requests.get(movie_url)
    # Do not save an error page to disk as if it were a real movie page;
    # it would only fail later, during parsing, with a less helpful error.
    movie_r.raise_for_status()
    movie_soup = BeautifulSoup(movie_r.content, 'html.parser', from_encoding='utf-8')

    with open(f'{year_dir}\\{item.td.text}.txt', 'w', encoding="utf-8") as f:
        f.write(str(movie_soup))
        
        

# Root folder for the downloaded pages, under the current working directory.
DIR = os.path.abspath(os.path.curdir) + "\\movies_data"

# Download the charts for 1995 through 2021 (the range end is exclusive).
years = range(1995, 2022)
for year in years:
    scrape_all_movies_by_year(year, DIR)


Parsing the downloaded HTML files and inserting the results into a Data Frame.

In [None]:
# Root folder holding the saved pages, under the current working directory.
DIR = os.path.abspath(os.path.curdir) + "\\movies_data"

def get_soup(year, file):
    """Load one saved movie page from disk and parse it.

    Args:
        year: Name of the year sub-folder under ``DIR``.
        file: Name of the file inside that folder.

    Returns:
        BeautifulSoup: The parsed document, with non-breaking spaces
        (U+00A0) replaced by regular spaces.
    """
    with open(f"{DIR}\\{year}\\{file}", 'r', encoding='utf-8') as g:
        file_txt = g.read()
    file_txt = file_txt.replace("\xa0", " ")
    # The text is already decoded, so no from_encoding is passed: Beautiful
    # Soup ignores that argument for str input and emits a warning per file.
    return BeautifulSoup(file_txt, 'html.parser')

def get_title(soup):
    """Return the movie title taken from the page's main heading.

    Reads the text of the first ``<h1>`` inside the first ``<div>`` and
    removes its last parenthesised suffix (assumed to be the release year,
    e.g. ``"Avatar (2009)"`` -> ``"Avatar"``).

    Args:
        soup: Parsed movie page.

    Returns:
        str: The heading text up to, but not including, the last ``" ("``;
        the whole heading when it contains no ``" ("``.
    """
    heading = soup.div.h1.text
    # rsplit with maxsplit=1 cuts only at the LAST " (", so a title that
    # itself contains parentheses is kept whole instead of being truncated
    # at its first one.
    return heading.rsplit(' (', 1)[0]

def table_to_dict(table):
    """Turn a label/value HTML table into a dictionary.

    Every ``<tr>`` that has at least two ``<td>`` cells contributes one
    entry: the text of its first cell is the key and the text of its second
    cell is the value. Rows with fewer cells are skipped, and when a key
    occurs more than once the last row wins.

    Args:
        table: Table element exposing ``find_all``.

    Returns:
        dict: Mapping of first-cell text to second-cell text.
    """
    cells_per_row = (tr.find_all('td') for tr in table.find_all('tr'))
    return {
        cells[0].text: cells[1].text
        for cells in cells_per_row
        if len(cells) >= 2
    }

def get_table_by_title(soup, title):
    """Find the first non-empty table that follows an ``<h2>`` heading.

    Args:
        soup: Parsed movie page.
        title: Exact text of the ``<h2>`` heading that precedes the table.

    Returns:
        The first table, among the heading's following siblings or nested
        inside them, that contains at least one ``<td>``.

    Raises:
        IndexError: If no ``<h2>`` with that text exists.
        Exception: If no such table follows the heading.
    """
    heading = soup.find_all("h2", string=title)[0]

    for node in heading.next_siblings:
        # Plain text nodes (NavigableString) cannot be or contain a table.
        if not isinstance(node, Tag):
            continue

        # The table may be this sibling itself or sit somewhere inside it;
        # the sibling itself is checked first.
        nested_tables = list(node.find_all("table"))
        own_table = [node] if node.name == "table" else []

        # Skip content-less tables (e.g. the empty "movie_ratings" table
        # that follows "Metrics") by requiring at least one <td>.
        match = next((t for t in own_table + nested_tables if t.find("td")), None)
        if match is not None:
            return match

    raise Exception(f"Could not find {title} table")
        

def get_financial_details_table(soup):
    """Return the first ``<table>`` whose id is ``movie_finances``.

    Raises:
        IndexError: If the page has no such table.
    """
    finance_tables = soup.find_all('table', id="movie_finances")
    return finance_tables[0]

def get_metrics_table(soup):
    """Return the table that follows the page's "Metrics" heading."""
    return get_table_by_title(soup, title="Metrics")

def get_more_details_table(soup):
    """Return the table that follows the page's "Movie Details" heading."""
    return get_table_by_title(soup, title="Movie Details")

In [None]:
# Collect one plain dict per movie and build the DataFrame once at the end.
# DataFrame.append copied the whole frame on every call (quadratic over
# thousands of files) and was removed in pandas 2.0.
movie_records = []

for year in range(1995, 2022):
    print(f"Year: {year}")
    for file in os.listdir(f"{DIR}\\{year}\\"):
        soup = get_soup(year, file)

        # Each movie's record starts out with only its title.
        movie_dict = {'title': get_title(soup)}

        # Add the rows of the financial table.
        financial_data_dict = table_to_dict(get_financial_details_table(soup))
        movie_dict.update(financial_data_dict)

        # Add the rows of the metrics table.
        metrics_dict = table_to_dict(get_metrics_table(soup))
        movie_dict.update(metrics_dict)

        # Add the rows of the "Movie Details" table.
        more_details_dict = table_to_dict(get_more_details_table(soup))
        movie_dict.update(more_details_dict)

        movie_records.append(movie_dict)

# One row per movie; columns are the union of all keys seen, and movies
# that lack a key get NaN in that column.
df_movies = pd.DataFrame(movie_records)

df_movies.head()

Lastly, performing initial cleaning of the data and saving the Movies Data Frame locally as a Pickle file.

In [None]:
# Clean the column names: remove any colons at either end of each label.
df_movies.columns = df_movies.columns.str.strip(':')
# Save the frame as a pickle file in the current working directory.
# The backslash is written as "\\" because "\m" is not a valid escape
# sequence; the resulting path is the same.
df_movies.to_pickle(f'{os.path.abspath(os.path.curdir)}\\movies.pkl')