In [3]:
# Import libraries:
from bs4 import BeautifulSoup
import requests
import re

In [4]:
# Load the movie dataset into `df`, using the CSV's 'index' column as the row index.

import pandas as pd

# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
movie_csv_path = r'C:\Users\ADMIN\Desktop\python\project\data\movie_dataset.csv'

df = pd.read_csv(
    movie_csv_path,
    index_col='index',
    encoding='UTF-8',
    engine='python',
)

In [5]:
# Fold every string value of the dataset down to plain ASCII.
import unicodedata


def _strip_to_ascii(value):
    """Return `value` reduced to ASCII when it is a str; any other value is passed through."""
    if type(value) != str:
        return value
    # NFKD decomposes characters (e.g. accented letters) before the ASCII
    # encode with 'ignore' drops everything that is not ASCII.
    decomposed = unicodedata.normalize('NFKD', value)
    return decomposed.encode('ascii', 'ignore').decode()


# Only object-dtype columns are rewritten; all other columns are left untouched.
for column in df.columns:
    if df[column].dtypes == 'object':
        df[column] = df[column].apply(_strip_to_ascii)

In [6]:
# Extract missing budget-revenue information

# A budget or revenue of 0 is treated as "missing" here.
missing_mask = (df['budget'] == 0) | (df['revenue'] == 0)

# .loc[...] followed by .copy() yields an independent frame, so adding the 'year'
# column below does not write into a slice of `df` (no SettingWithCopyWarning).
df_missing = df.loc[missing_mask, ['id', 'budget', 'title', 'revenue', 'release_date']].copy()

# Year = first four characters of release_date (presumably 'YYYY-MM-DD' -- confirm
# against the data). Float values (NaN for a missing date) are passed through so
# that dropna() below removes those rows.
df_missing['year'] = df_missing['release_date'].apply(lambda x: x[:4] if type(x) != float else x)

# Drop every row that has a missing value in any of the selected columns.
df_missing = df_missing.dropna()

In [7]:
# Define crawling functions:
# Punctuation stripped from a title before it is turned into a URL slug.
_TITLE_PUNCTUATION = re.compile("[.',:]")

# Leading articles that are moved to the end of the slug.
_LEADING_ARTICLES = ("THE", "A")


def _slug_words(title):
    """Return the ordered list of words that make up the URL slug for `title`.

    Punctuation (. ' , :) is removed, '&' becomes 'and', and a leading
    "The"/"A" (case-insensitive) is moved to the end of the list.
    """
    cleaned = _TITLE_PUNCTUATION.sub('', title).replace('&', 'and')
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so the slug never contains empty segments ("--").
    words = cleaned.split()
    if words and words[0].upper() in _LEADING_ARTICLES:
        words = words[1:] + words[:1]
    return words


def sub_link(title):
    """Build the hyphen-joined URL slug for a movie title.

    Args:
        title: Movie title as a string.

    Returns:
        The slug, e.g. "The Dark Knight" -> "Dark-Knight-The". An empty or
        whitespace-only title yields an empty string.
    """
    return '-'.join(_slug_words(title))


def sub_link_with_year(title, year):
    """Build the URL slug for a movie title with "(year)" appended as the last segment.

    Args:
        title: Movie title as a string.
        year: Release year; it is inserted via str formatting, so a str or an
            int are both accepted.

    Returns:
        The slug, e.g. ("The Mummy", "1999") -> "Mummy-The-(1999)".
    """
    return '-'.join(_slug_words(title) + ['({})'.format(year)])


def _parse_dollar_amount(text):
    """Parse the leading dollar figure of `text` into an int, or return None if there is none."""
    tokens = text.replace(',', '').replace('$', '').split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None


def the_numbers_crawl(id, title, year, sub_link, timeout=30):
    """Fetch a movie page from the-numbers.com and extract its budget and revenue.

    The page's `tr > td` cells are scanned in document order; the cell that
    follows a known label is parsed as a dollar amount. Scanning stops as soon
    as a production budget has been parsed.

    Args:
        id: Identifier copied unchanged into the result.
        title: Movie title, copied unchanged into the result.
        year: Release year, copied unchanged into the result.
        sub_link: URL slug of the movie page (see `sub_link` /
            `sub_link_with_year`).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict with keys 'id', 'title', 'year', 'budget', 'revenue'. 'budget'
        and 'revenue' are 0 when the figure could not be found or parsed, or
        when the request failed.
    """
    import warnings

    url = "https://www.the-numbers.com/movie/{}#tab=summary".format(sub_link)
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    budget = 0
    revenue = 0

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page must not abort a long crawl; report it and
        # fall through with "nothing found".
        warnings.warn("Request for {} failed: {}".format(url, exc))
        cells = []
    else:
        cells = BeautifulSoup(res.text, 'html.parser').select('tr > td')

    # Pair every cell with its successor: (label cell, value cell). Pairing via
    # zip also means a label in the very last cell cannot index past the end.
    for label_cell, value_cell in zip(cells, cells[1:]):
        label = label_cell.text
        if label in ('Worldwide Box Office', 'Domestic Box Office'):
            amount = _parse_dollar_amount(value_cell.text)
            if amount is not None:
                # Whichever of the two labels is parsed last wins.
                revenue = amount
        elif label == 'Production\xa0Budget:':
            amount = _parse_dollar_amount(value_cell.text)
            if amount is not None:
                budget = amount
                break

    return {'id': id,
            'title': title,
            'year': year,
            'budget': budget,
            'revenue': revenue}

In [8]:
# Empty frame with the columns the crawl results are collected under.
result_columns = ['id', 'title', 'year', 'budget', 'revenue']
result = pd.DataFrame(columns=result_columns)

In [11]:
# Crawl the-numbers.com for every movie in df_missing.
# NOTE(review): the value stored under 'id' is the df_missing index label, not the
# 'id' column -- confirm this is what downstream code expects to join on.
records = []
for i, title, year in zip(df_missing.index, df_missing['title'], df_missing['year']):
    crawl_data = the_numbers_crawl(i, title, year, sub_link(title))
    if crawl_data['budget'] == 0 and crawl_data['revenue'] == 0:
        # Nothing found under the plain slug: retry once with the year-qualified slug.
        crawl_data = the_numbers_crawl(i, title, year, sub_link_with_year(title, year))
    # Otherwise the first response is reused as-is; no second request for the same page.
    records.append(crawl_data)

# Build the frame once from the collected records (DataFrame.append no longer
# exists in pandas >= 2.0). The frame is rebuilt from scratch, so re-running
# this cell does not duplicate rows.
result = pd.DataFrame(records, columns=['id', 'title', 'year', 'budget', 'revenue'])

In [12]:
# Export to a flat file:

# Written relative to the current working directory; the row index is not included.
output_csv = 'crawl.csv'
result.to_csv(output_csv, index=False)