In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import timeit

In [9]:
def top(time_period='',page=1):
    url = 'https://letterboxd.com/films/ajax/popular/'+time_period+'/size/small/'
    if page != 1:
        url = url+f'page/{page}/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content,'html.parser')
    films_divs = soup.find_all(title=True)
    films = pd.DataFrame()
    films['Title'] = [films_divs[i]['title'][:-7] for i in range(len(films_divs))]
    films['Year'] = [films_divs[i]['title'][-5:-1] for i in range(len(films_divs))]
    films['href'] = [films_divs[i]['href'] for i in range(len(films_divs))]
    return films

In [10]:
def search_film(query):
    query = str(query).replace(' ','+')
    search_url = main+f'/search/films/{query}/'
    search_page = requests.get(search_url)
    search_soup = BeautifulSoup(search_page.content,'html.parser')
    try:
        first_result_url = main+search_soup.find(class_='film-detail-content').h2.a['href']
    except:
        raise Exception('The film you searched for was not found. Try searching for a different title.')
    return first_result_url

In [11]:
def get_genres(film_url):
    film_page = requests.get(film_url)
    if not film_page.status_code:
        raise Exception('The url does not lead to a valid Letterboxd page. Make sure you entered the correct url for the film.')
    film_page = requests.get(film_url)
    film_soup = BeautifulSoup(film_page.content,'html.parser')
    genre_links = film_soup.find(id='tab-genres').find_all('a')
    genres = [genre_links[i].string for i in range(len(genre_links))]
    return genres

In [12]:
def check_popularity(query):
    main = 'https://letterboxd.com'
    url = search_film(query)
    href = url.replace(main,'')
    page = 1
    films = top()
    while href not in films.href.values:
        page += 1
        films = top(page=page).set_index(films.index+72)
    film_info = films.loc[films.href==href]
    rank = film_info.index[0]+1
    print(f'{film_info.Title.iloc[0]} is ranked #{rank} in popularity')
    total_films = 540501
    obscurity = rank/total_films
    print(f'{film_info.Title.iloc[0]} is more obscure than {obscurity:.5f}% of films on Letterboxd.')
    return [film_info.Title.iloc[0],rank,rank/total_films]

In [13]:
def top_pages(time_period='',page_num=10):
    titles = []
    hrefs = []
    rank = []
    for top_page in range(1,page_num+1):
        url = 'https://letterboxd.com/films/ajax/popular/'+time_period+'/size/small/'
        if page_num != 1:
            url = url+f'page/{page_num}/'
        page = requests.get(url)
        soup = BeautifulSoup(page.content,'html.parser')
        films_divs = soup.find_all(title=True)
        for i in range(len(films_divs)):
            titles.append(films_divs[i]['title'])
            hrefs.append(films_divs[i]['href'])
            rank.append(i+1+(page_num-1)*72)
    return titles, hrefs, rank

In [19]:
main = 'https://letterboxd.com'
films = top()

In [20]:
main = 'https://letterboxd.com'
for ind in range(len(films)):
    film = films.iloc[ind]
    film_url = main+film.href
    genres = get_genres(film_url)
    films.loc[ind,'Genres'] = ','.join(genres)
films

Unnamed: 0,Title,Year,href,Genres
0,Parasite,2019,/film/parasite-2019/,"comedy,drama,thriller"
1,Joker,2019,/film/joker-2019/,"crime,drama,thriller"
2,Knives Out,2019,/film/knives-out-2019/,"mystery,thriller,comedy,drama,crime"
3,Get Out,2017,/film/get-out-2017/,"thriller,horror,mystery"
4,Pulp Fiction,1994,/film/pulp-fiction/,"crime,thriller"
...,...,...,...,...
67,The Favourite,2018,/film/the-favourite/,"comedy,history,drama"
68,Scott Pilgrim vs. the World,2010,/film/scott-pilgrim-vs-the-world/,"romance,comedy,action,fantasy,music"
69,It,2017,/film/it-2017/,horror
70,The Empire Strikes Back,1980,/film/the-empire-strikes-back/,"science fiction,adventure,action"


In [9]:
url = search_film('blade runner')
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
soup.find(class_='film-stats')

<ul class="film-stats"> <li class="stat"><a class="has-icon icon-placeholder"> </a></li> </ul>

In [39]:
url = search_film('star wars episode iii')
url

'https://letterboxd.com/film/star-wars-episode-iii-revenge-of-the-sith/'

In [105]:
check_popularity('scott pilgrim')

Scott Pilgrim vs. the World is ranked #69 in popularity
Scott Pilgrim vs. the World is more obscure than 0.00013% of films on Letterboxd.


['Scott Pilgrim vs. the World', 69, 0.00012765933828059523]

In [68]:
films

Unnamed: 0,Title,Year,href
144,Room,2015,/film/room-2015/
145,Toy Story 4,2019,/film/toy-story-4/
146,Harry Potter and the Prisoner of Azkaban,2004,/film/harry-potter-and-the-prisoner-of-azkaban/
147,Zootopia,2016,/film/zootopia/
148,Snowpiercer,2013,/film/snowpiercer/
...,...,...,...
211,The Hunger Games,2012,/film/the-hunger-games/
212,The Disaster Artist,2017,/film/the-disaster-artist/
213,Sicario,2015,/film/sicario-2015/
214,Fantastic Beasts and Where to Find Them,2016,/film/fantastic-beasts-and-where-to-find-them/


In [47]:
top(page=3)

Unnamed: 0,Title,Year,href
0,Room,2015,/film/room-2015/
1,Toy Story 4,2019,/film/toy-story-4/
2,Harry Potter and the Prisoner of Azkaban,2004,/film/harry-potter-and-the-prisoner-of-azkaban/
3,Zootopia,2016,/film/zootopia/
4,Snowpiercer,2013,/film/snowpiercer/
...,...,...,...
67,The Hunger Games,2012,/film/the-hunger-games/
68,The Disaster Artist,2017,/film/the-disaster-artist/
69,Sicario,2015,/film/sicario-2015/
70,Fantastic Beasts and Where to Find Them,2016,/film/fantastic-beasts-and-where-to-find-them/


In [55]:
url.replace(main,'') not in top(page=3).href.values

False

In [50]:
url.replace(main,'')

'/film/star-wars-episode-iii-revenge-of-the-sith/'

In [12]:
a = [0,1,2,3,4,5]
print(len(a))
print(range(len(a)))
print(range(1,len(a)+1))
b = [i for i in range(1,len(a)+1)]
print(b)

6
range(0, 6)
range(1, 7)
[1, 2, 3, 4, 5, 6]


In [8]:
titles, hrefs, ranks = top_pages(page_num=2)
titles

['Guardians of the Galaxy Vol. 2 (2017)',
 'The Lord of the Rings: The Fellowship of the Ring (2001)',
 '2001: A Space Odyssey (1968)',
 'Blade Runner (1982)',
 'Logan (2017)',
 'The Irishman (2019)',
 'Black Swan (2010)',
 'The Shawshank Redemption (1994)',
 'Star Wars: The Rise of Skywalker (2019)',
 'Back to the Future (1985)',
 'Nightcrawler (2014)',
 'Three Billboards Outside Ebbing, Missouri (2017)',
 'Booksmart (2019)',
 'Captain America: Civil War (2016)',
 'The Martian (2015)',
 'Rogue One: A Star Wars Story (2016)',
 'Spider-Man: Far from Home (2019)',
 'Bohemian Rhapsody (2018)',
 'The Revenant (2015)',
 'No Country for Old Men (2007)',
 'Captain Marvel (2019)',
 'Doctor Strange (2016)',
 'Reservoir Dogs (1992)',
 'Jurassic Park (1993)',
 'Alien (1979)',
 'GoodFellas (1990)',
 'Wonder Woman (2017)',
 'Shutter Island (2010)',
 'The Dark Knight Rises (2012)',
 'Captain America: The Winter Soldier (2014)',
 'The Lobster (2015)',
 'Annihilation (2018)',
 'The Hateful Eight (2015

In [1]:
a = ['Parasite (2019)', 'Joker (2019)', 'Other stuff (year)']
tuple(a)

('Parasite (2019)', 'Joker (2019)', 'Other stuff (year)')

In [2]:
('Parasite (2019)')

'Parasite (2019)'

In [3]:
tuple('Parasite (2019)')

('P', 'a', 'r', 'a', 's', 'i', 't', 'e', ' ', '(', '2', '0', '1', '9', ')')

NameError: name 'Parasite' is not defined

In [16]:
page_num=3
time_period=''
url = 'https://letterboxd.com/films/ajax/popular/'+time_period+'/size/small/'
if page_num != 1:
    url = url+f'page/{page_num}/'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
films_divs = soup.find_all(title=True)
# for i in range(len(films_divs)):
#     titles.append(films_divs[i]['title'])
#     hrefs.append(films_divs[i]['href'])
#     rank.append(i+1+(page_num-1)*72)
films_divs

[<a class="frame" href="/film/room-2015/" title="Room (2015)"> <span class="frame-title">Room</span> </a>,
 <a class="frame" href="/film/gravity-2013/" title="Gravity (2013)"> <span class="frame-title">Gravity</span> </a>,
 <a class="frame" href="/film/toy-story-4/" title="Toy Story 4 (2019)"> <span class="frame-title">Toy Story 4</span> </a>,
 <a class="frame" href="/film/the-prestige/" title="The Prestige (2006)"> <span class="frame-title">The Prestige</span> </a>,
 <a class="frame" href="/film/snowpiercer/" title="Snowpiercer (2013)"> <span class="frame-title">Snowpiercer</span> </a>,
 <a class="frame" href="/film/zootopia/" title="Zootopia (2016)"> <span class="frame-title">Zootopia</span> </a>,
 <a class="frame" href="/film/good-will-hunting/" title="Good Will Hunting (1997)"> <span class="frame-title">Good Will Hunting</span> </a>,
 <a class="frame" href="/film/the-perks-of-being-a-wallflower/" title="The Perks of Being a Wallflower (2012)"> <span class="frame-title">The Perks of

In [42]:
page_num=10
for top_page in range(1,page_num+1):
    url = 'https://letterboxd.com/films/ajax/popular/'+time_period+'/size/small/'
    if page_num != 1:
        url = url+f'page/{page_num}/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content,'html.parser')
    films_divs = soup.find_all(title=True)
    comp_start = timeit.default_timer()
    a = []
    a = [films_divs[i]['title'] for i in range(len(films_divs))]
    c = [i+1+(page_num-1)*72 for i in range(len(films_divs))]
    comp_end = timeit.default_timer()
    comp_time = comp_end-comp_start
    loop_start = timeit.default_timer()
    b = []
    d = []
    for i in range(len(films_divs)):
        b.append(films_divs[i]['title'])
        d.append(i+1+(page_num-1)*72)
    loop_end = timeit.default_timer()
    loop_time = loop_end-loop_start
    print(f"comp: {comp_time} s\nloop: {loop_time} s\nloop-comp: {loop_time-comp_time}")

comp: 4.55999997939216e-05 s
loop: 3.769999966607429e-05 s
loop-comp: -7.900000127847306e-06
comp: 3.839999953925144e-05 s
loop: 3.699999979289714e-05 s
loop-comp: -1.3999997463542968e-06
comp: 3.4300000152143184e-05 s
loop: 3.51999997292296e-05 s
loop-comp: 8.999995770864189e-07
comp: 4.6099999963189475e-05 s
loop: 3.749999996216502e-05 s
loop-comp: -8.600000001024455e-06
comp: 3.910000032192329e-05 s
loop: 3.590000051190145e-05 s
loop-comp: -3.1999998100218363e-06
comp: 8.539999998902204e-05 s
loop: 4.2100000428035855e-05 s
loop-comp: -4.329999956098618e-05
comp: 3.66000003850786e-05 s
loop: 3.5600000046542846e-05 s
loop-comp: -1.0000003385357559e-06
comp: 3.389999983482994e-05 s
loop: 3.370000013092067e-05 s
loop-comp: -1.9999970390927047e-07
comp: 3.730000025825575e-05 s
loop: 3.569999989849748e-05 s
loop-comp: -1.600000359758269e-06
comp: 5.8799999351322185e-05 s
loop: 6.880000000819564e-05 s
loop-comp: 1.0000000656873453e-05
