In [1]:
#import the required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Request headers sent with every call: a desktop-browser User-Agent plus an
# Accept-Language list that prefers English, so pages come back in English.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/74.0.3729.157 Safari/537.36"
)
headers = dict([
    ("User-Agent", _USER_AGENT),
    ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8,hu;q=0.7"),
    ("Referer", "https://google.com"),
    ("DNT", "1"),
])

In [3]:
# Load the target URL: IMDb's "top_100" title search, sorted by user rating (descending).
url = 'https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page in later cells.
response.raise_for_status()

In [4]:
# Parse the downloaded HTML and collect one content container per listed movie.
soup = BeautifulSoup(response.text, 'html.parser')
movie_containers = soup.find_all('div', class_='lister-item-content')

# Every later cell iterates over these containers; an empty result means the
# page markup did not match the selector, so fail here with a clear message
# rather than further down with an unrelated error.
if not movie_containers:
    raise ValueError(
        "No 'lister-item-content' containers found in the response; "
        "the page layout may have changed or the request was blocked."
    )

In [5]:
# Collect the title and the absolute IMDb URL of the first 20 listed movies.
top_20_urls, top_20_names = [], []
for container in movie_containers[:20]:
    # The title link lives inside the item's <h3 class="lister-item-header">.
    link = container.find('h3', class_='lister-item-header').a
    name, href = link.text, link['href']
    # hrefs are site-relative, so prefix the IMDb host.
    top_20_urls.append('https://www.imdb.com' + href)
    top_20_names.append(name)

In [6]:
# Read the rating of each of the first 20 movies as a float.
top_20_rating = [
    float(entry.find('div', class_='inline-block ratings-imdb-rating').text.strip())
    for entry in movie_containers[:20]
]

In [7]:
# Read the number of raters for each of the first 20 movies.
# The count is taken from the first <span name="nv"> of each container and
# arrives with thousands separators (commas), which are stripped before int().
vote_labels = (
    entry.find('span', attrs={'name': 'nv'}).text
    for entry in movie_containers[:20]
)
top_20_rater = [int(label.replace(',', '')) for label in vote_labels]

In [8]:
# Locate the awards page of a movie and read how many Oscars it won from the
# row span of the "Oscar / Winner" cell in the awards table.
# Later this may be extended with nominations, which could carry value too.
def get_num_oscars(url):
    """Return the number of Oscars a movie won, read from its IMDb awards page.

    Parameters
    ----------
    url : str
        URL of the movie's IMDb title page. Any query string or fragment
        (for example ``?ref_=adv_li_tt``) is discarded before ``awards`` is
        appended, so the awards page is reached regardless of which tracking
        parameter the link carries.

    Returns
    -------
    int
        The ``rowspan`` of the first ``td.title_award_outcome`` cell whose
        category is ``Oscar`` and whose outcome is ``Winner`` (1 when the cell
        has no ``rowspan`` attribute), or 0 when no such cell exists.

    Raises
    ------
    requests.HTTPError
        If the awards page answers with a 4xx/5xx status, so that a failed
        download is not silently recorded as "0 Oscars".

    Notes
    -----
    Uses the notebook-level ``headers`` dict for the request.
    """
    # Drop fragment and query, then point at the "awards" sub-page.
    base_url = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    if base_url.endswith('/awards'):
        # Already an awards URL: do not append the segment a second time.
        award_url = base_url
    else:
        award_url = base_url + '/awards'

    # Timeout so one stalled request cannot hang the whole scraping loop.
    response = requests.get(award_url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    for cell in soup.find_all('td', class_='title_award_outcome'):
        category = cell.find('span', class_='award_category')
        outcome = cell.find('b')
        # Skip cells lacking either tag instead of failing on None.text.
        if category is None or outcome is None:
            continue
        if category.text == 'Oscar' and outcome.text == 'Winner':
            # The cell spans one table row per award won.
            return int(cell.get('rowspan', 1))

    return 0

In [9]:
# Use the Oscar finder function and store the number of wins for every movie.
# A comprehension keeps the loop variable local, so the notebook-level `url`
# (the search-page address) is not overwritten by the last movie URL.
top_20_oscar = [get_num_oscars(movie_url) for movie_url in top_20_urls]

In [10]:
# Calculate the bonus points for the Oscars won.
def _oscar_bonus(wins):
    """Map a number of Oscar wins to the bonus added to the movie's rating.

    1-2 wins -> 0.3, 3-5 wins -> 0.5, 6-10 wins -> 1, more than 10 -> 1.5,
    anything else -> 0.
    """
    if wins in (1, 2):
        return 0.3
    if wins in (3, 4, 5):
        return 0.5
    if 6 <= wins <= 10:
        return 1
    if wins > 10:
        return 1.5
    return 0


oscar_points = [_oscar_bonus(wins) for wins in top_20_oscar[:20]]

In [11]:
# Calculate the negative points for a smaller number of raters: 0.1 point per
# 100,000 raters below the most-rated movie, rounded to one decimal.
max_raters = max(top_20_rater)
point_deduction = [
    # Rounding a small negative value yields -0.0; adding 0.0 turns that into
    # a plain 0.0 so the table and the export do not show a "negative zero".
    round(0.1 * (raters - max_raters) / 100000, 1) + 0.0
    for raters in top_20_rater
]

In [12]:
# Calculate the new rating of each movie: original rating + Oscar bonus +
# (negative) rater deduction. All three inputs have one decimal, so the sum is
# rounded to one decimal to strip binary floating-point artifacts before export.
new_points = [
    round(rating + bonus + deduction, 1)
    for rating, bonus, deduction in zip(top_20_rating, oscar_points, point_deduction)
]

In [18]:
# Combine the collected columns into one table and export it to xlsx.
data = {'url': top_20_urls, 'name': top_20_names, 'rating': top_20_rating, 'Number of raters': top_20_rater, 'oscars': top_20_oscar, 'oscar extra point': oscar_points, 'point deduction for raters': point_deduction, 'new rating': new_points}
df = pd.DataFrame(data)
# ExcelWriter.save() no longer exists in pandas 2.x; using the writer as a
# context manager saves and closes the workbook on exit, even on errors.
with pd.ExcelWriter('Datapao_IMDB_scraper.xlsx') as writer:
    df.to_excel(writer, index=False, sheet_name='Top Movies')
df

Unnamed: 0,url,name,rating,Number of raters,oscars,oscar extra point,point deduction for raters,new rating
0,https://www.imdb.com/title/tt0111161/?ref_=adv...,The Shawshank Redemption,9.3,2704345,0,0.0,0.0,9.3
1,https://www.imdb.com/title/tt0068646/?ref_=adv...,The Godfather,9.2,1877746,3,0.5,-0.8,8.9
2,https://www.imdb.com/title/tt0468569/?ref_=adv...,The Dark Knight,9.0,2678106,2,0.3,-0.0,9.3
3,https://www.imdb.com/title/tt0071562/?ref_=adv...,The Godfather Part II,9.0,1282538,6,1.0,-1.4,8.6
4,https://www.imdb.com/title/tt0108052/?ref_=adv...,Schindler's List,9.0,1366837,7,1.0,-1.3,8.7
5,https://www.imdb.com/title/tt0050083/?ref_=adv...,12 Angry Men,9.0,798911,0,0.0,-1.9,7.1
6,https://www.imdb.com/title/tt0167260/?ref_=adv...,The Lord of the Rings: The Return of the King,9.0,1861945,11,1.5,-0.8,9.7
7,https://www.imdb.com/title/tt0110912/?ref_=adv...,Pulp Fiction,8.9,2076221,1,0.3,-0.6,8.6
8,https://www.imdb.com/title/tt0120737/?ref_=adv...,The Lord of the Rings: The Fellowship of the Ring,8.8,1891368,4,0.5,-0.8,8.5
9,https://www.imdb.com/title/tt1375666/?ref_=adv...,Inception,8.8,2376081,4,0.5,-0.3,9.0


In [19]:
# Tell the user which file the exported table was written to.
export_notice = 'Data is exported, please search for Datapao_IMDB_scraper.xlsx'
print(export_notice)

Data is exported, please search for Datapao_IMDB_scraper.xlsx
