In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [3]:
# IMDB's homepage
imdb_url = 'https://www.imdb.com'

# Use requests to retrieve data from a given URL
imdb_response = requests.get(imdb_url)

# Parse the whole HTML page using BeautifulSoup
imdb_soup = BeautifulSoup(imdb_response.text, 'html.parser')

# Title of the parsed page
imdb_soup.title.text

'IMDb: Ratings, Reviews, and Where to Watch the Best Movies & TV Shows'

In [19]:
# Find all links
links = [link.get('href') for link in imdb_soup.find_all('a')]

# Add homepage and keep the unique links
fixed_links = set([''.join([imdb_url, link]) for link in links if link])


unique_links = []
for link in links:
    if not link in unique_links:
        unique_links.append(imdb_url + link)

In [29]:
# Box Office Mojo - UK Weekend box office
boxofficemojo_url = 'https://www.boxofficemojo.com/intl/uk/?yr=2019&wk=33&currency=local'

# Use requests to retrieve data from a given URL
bom_response = requests.get(boxofficemojo_url)

# Parse the whole HTML page using BeautifulSoup
bom_soup = BeautifulSoup(bom_response.text, 'html.parser')

In [31]:
print(f"NUMBER OF TABLES IN THE PAGE: {len(bom_soup.find_all('table'))}")

table = bom_soup.find_all('table')[0]

table

NUMBER OF TABLES IN THE PAGE: 1


<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-left mojo-field-type-date_interval mojo-sort-column mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?area=GB&amp;sort=startDate&amp;sortDir=asc&amp;ref_=bo_wey__resort#table" title="Dates"><span class="a-color-state">Dates</span><span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-expand" role="presentation"></i></span></a></th><th class="a-text-right mojo-field-type-money mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?area=GB&amp;sort=top10Gross&amp;ref_=bo_wey__resort#table" title="Top 10 Gross">Top 10 Gross<span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-expand table-sort-desc-placeholder" role="presentation"></i><i class="a-icon a-icon-collapse table-sort-asc-placeholder" role="presentation"></i></span></a></th><th class="a-text-

In [32]:
table.find_all('tr')[0].contents

[<th class="a-text-left mojo-field-type-date_interval mojo-sort-column mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?area=GB&amp;sort=startDate&amp;sortDir=asc&amp;ref_=bo_wey__resort#table" title="Dates"><span class="a-color-state">Dates</span><span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-expand" role="presentation"></i></span></a></th>,
 <th class="a-text-right mojo-field-type-money mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" href="?area=GB&amp;sort=top10Gross&amp;ref_=bo_wey__resort#table" title="Top 10 Gross">Top 10 Gross<span class="a-letter-space"></span><span class="icon aok-relative"><i class="a-icon a-icon-expand table-sort-desc-placeholder" role="presentation"></i><i class="a-icon a-icon-collapse table-sort-asc-placeholder" role="presentation"></i></span></a></th>,
 <th class="a-text-right mojo-field-type-percent_delta mojo-sortable-column a-nowrap"><a class="a-link-normal a-nowrap" h

In [34]:
table.find_all('tr')[0].text.split('\n')

['DatesTop 10 Gross%± LWOverall Gross%± LWReleases#1 Release',
 'Genre',
 'Budget',
 'Running Time',
 'WeekLong Weekend',
 '']

In [36]:
lst = []
for row in table.find_all('tr')[1:]:
    s = pd.Series([data.text for data in row.find_all('td')])
    lst.append(s)
    
data = pd.concat(lst, axis=1).T

data.head(2)

print(f'(MOVIES, COLUMNS) -> {data.shape}')

print(f'% OF MISSING VALUES PER COLUMN\n{(data.isnull().sum() / data.shape[0]) * 100}')

(MOVIES, COLUMNS) -> (52, 12)
% OF MISSING VALUES PER COLUMN
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64
