In [1]:
import random
import re
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm.auto import tqdm

In [2]:
url = 'https://www.liffe.si/urnik_2024/filmi.aspx?ln=sl&groupby=abc'
response = requests.get(url)

In [3]:
soup = BeautifulSoup(response.text, "html.parser")

def extract_title_year(row_tag):
    title = row_tag.find('div', attrs={'class': re.compile(r'naslov_filma_orig')}).text.strip('\n| ')
    year = row_tag.find('span', attrs={'id': re.compile(r'Label4')}).text.strip('\n| ')
    return title, year

movies = []
for row_tag in soup.find_all('div', attrs={'class': 'row'}):
    title = row_tag.find('div', attrs={'class': re.compile(r'naslov_filma_orig')}).text.strip('\n| ')
    year = row_tag.find('span', attrs={'id': re.compile(r'Label4')}).text.strip('\n| ')

    if title and year and int(year):
        movies.append(f'{title} ({year})')

movies = [m.replace('Vem (', 'Vem? (') if 'Vem' in m else m for m in movies]
movies[:5]

['Alpha. (2024)',
 'Ali je bilo kaj avantgardnega? (2024)',
 'Amen. (2002)',
 'Anora (2024)',
 'April (2024)']

In [4]:

def extract_movie_ratings(title_and_year: str):

    sleep(random.randint(1, 5))

    # set a headless driver
    ff_options = webdriver.FirefoxOptions()
    ff_options.add_argument("-headless")

    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36"
    )
    ff_options.add_argument(f"user-agent={user_agent}")

    driver = webdriver.Firefox(options=ff_options)
    driver.set_window_size(1080, 800)  # set the size of the window

    # get page content
    url = 'https://www.imdb.com/'
    driver.get(url)
    sleep(random.randint(1, 5))

    # search for the movie
    driver.find_element(By.ID, 'suggestion-search').send_keys(title_and_year)
    driver.find_element(By.ID, 'suggestion-search').send_keys(Keys.ENTER)
    sleep(random.randint(1, 5))

    driver.find_element(By.XPATH, '//*[contains(text(), "Exact matches")]').click()
    sleep(random.randint(1, 5))

    soup = BeautifulSoup(driver.page_source, "html.parser")
    tag = soup.find('li', attrs={'class': re.compile(r'find-title-result')})
    movie_url = url + tag.find('a').get('href').split('?')[0]
    driver.get(movie_url + 'ratings')

    sleep(random.randint(1, 5))
    soup = BeautifulSoup(driver.page_source, "html.parser")

    tag = soup.find('div', attrs={'data-testid':"rating-button__aggregate-rating"})
    score = tag.find('span').text
    votes = tag.find_all('div')[-1].text

    driver.quit()

    return {'title': title_and_year, 'score': score, 'votes': votes, 'url': movie_url}



In [5]:
n_tries = 10
results = []

while n_tries:
    for movie in tqdm(movies, leave=False):
        try:
            results.append(extract_movie_ratings(movie))
            movies.remove(movie)
        except Exception:
            # print(movie, e)
            continue
    n_tries -= 1

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [28]:
import pandas as pd

df = pd.DataFrame(results)
df.to_csv('movies.csv', index=False)

In [29]:
def float_vote(x: str):
    if 'K' in x:
        return float(x.replace('K', '')) * 1e3
    if 'M' in x:
        return float(x.replace('M', '')) * 1e6
    return float(x)

df.score = df.score.apply(float)
df.votes = df.votes.apply(float_vote)


In [30]:
print(df[df.votes > 100].sort_values('score', ascending=False).head(15).reset_index(drop=True).to_markdown())

|    | title                             |   score |   votes | url                                     |
|---:|:----------------------------------|--------:|--------:|:----------------------------------------|
|  0 | Anul Nou care n-a fost (2024)     |     8.7 |     915 | https://www.imdb.com//title/tt33030375/ |
|  1 | Anora (2024)                      |     8.3 |    4300 | https://www.imdb.com//title/tt28607951/ |
|  2 | Ainda estou aqui (2024)           |     8.1 |     610 | https://www.imdb.com//title/tt14961016/ |
|  3 | Young Hearts (2024)               |     8.1 |     565 | https://www.imdb.com//title/tt15245268/ |
|  4 | The Brutalist (2024)              |     8.1 |     798 | https://www.imdb.com//title/tt8999762/  |
|  5 | Z (1969)                          |     8.1 |   32000 | https://www.imdb.com//title/tt0065234/  |
|  6 | Flow (2024)                       |     8   |    2100 | https://www.imdb.com//title/tt4772188/  |
|  7 | Fekete pont (2024)                |     8   |   

|    | title                             |   score |   votes | url                                     |
|---:|:----------------------------------|--------:|--------:|:----------------------------------------|
|  0 | Anul Nou care n-a fost (2024)     |     8.7 |     915 | https://www.imdb.com//title/tt33030375/ |
|  1 | Anora (2024)                      |     8.3 |    4300 | https://www.imdb.com//title/tt28607951/ |
|  2 | Ainda estou aqui (2024)           |     8.1 |     610 | https://www.imdb.com//title/tt14961016/ |
|  3 | Young Hearts (2024)               |     8.1 |     565 | https://www.imdb.com//title/tt15245268/ |
|  4 | The Brutalist (2024)              |     8.1 |     798 | https://www.imdb.com//title/tt8999762/  |
|  5 | Z (1969)                          |     8.1 |   32000 | https://www.imdb.com//title/tt0065234/  |
|  6 | Flow (2024)                       |     8   |    2100 | https://www.imdb.com//title/tt4772188/  |
|  7 | Fekete pont (2024)                |     8   |     721 | https://www.imdb.com//title/tt30807200/ |
|  8 | Akai tenshi (1966)                |     7.8 |    2000 | https://www.imdb.com//title/tt0139820/  |
|  9 | L'aveu (1970)                     |     7.8 |    4900 | https://www.imdb.com//title/tt0065439/  |
| 10 | Conclave (2024)                   |     7.7 |    3100 | https://www.imdb.com//title/tt20215234/ |
| 11 | État de siege (1972)              |     7.7 |    6200 | https://www.imdb.com//title/tt0070959/  |
| 12 | Missing (1982)                    |     7.7 |   24000 | https://www.imdb.com//title/tt0084335/  |
| 13 | The Substance (2024)              |     7.6 |   84000 | https://www.imdb.com//title/tt17526714/ |
| 14 | The Seed of the Sacred Fig (2024) |     7.6 |    2000 | https://www.imdb.com//title/tt32178949/ |