In [1]:
import requests
from bs4 import BeautifulSoup
import sqlite3

In [2]:
url = 'https://letterboxd.com/jack/list/official-top-250-films-with-the-most-fans/page/{i}/'

pages = [1, 2, 3]

In [3]:
def get_soup(url):

	response = requests.get(url)
	return BeautifulSoup(response.text, 'html.parser')

In [4]:
def get_movie_list(soup):
	movie_list = []

	for movie in soup.find_all('div', class_='film-poster'):
		movie_list.append(f'https://letterboxd.com{movie.attrs['data-target-link']}')
	return movie_list

In [5]:
movie_list = [page for i in pages for page in get_movie_list(get_soup(url.format(i=i))) ]


In [6]:
movie_list

['https://letterboxd.com/film/la-la-land/',
 'https://letterboxd.com/film/interstellar/',
 'https://letterboxd.com/film/everything-everywhere-all-at-once/',
 'https://letterboxd.com/film/fight-club/',
 'https://letterboxd.com/film/dead-poets-society/',
 'https://letterboxd.com/film/eternal-sunshine-of-the-spotless-mind/',
 'https://letterboxd.com/film/whiplash-2014/',
 'https://letterboxd.com/film/little-women-2019/',
 'https://letterboxd.com/film/10-things-i-hate-about-you/',
 'https://letterboxd.com/film/parasite-2019/',
 'https://letterboxd.com/film/the-dark-knight/',
 'https://letterboxd.com/film/the-perks-of-being-a-wallflower/',
 'https://letterboxd.com/film/coraline/',
 'https://letterboxd.com/film/pride-prejudice/',
 'https://letterboxd.com/film/spirited-away/',
 'https://letterboxd.com/film/fantastic-mr-fox/',
 'https://letterboxd.com/film/howls-moving-castle/',
 'https://letterboxd.com/film/pulp-fiction/',
 'https://letterboxd.com/film/spider-man-into-the-spider-verse/',
 'ht

In [7]:
def get_movie_details(movie_url):
	response = requests.get(movie_url)
	soup = BeautifulSoup(response.text, 'html.parser')

	movie_details = {
		"title": soup.find('h1', class_='filmtitle').find('span').text,
		"cast": [{"name": actor.text} for actor in soup.find_all('a', href=lambda x: x and x.startswith('/actor/') )],
		"genres": [{"name": genre.text} for genre in soup.find_all('a', href=lambda x: x and x.startswith('/films/genre/') )],
		"authors": [{"name": author.text} for author in soup.find_all('a', href=lambda x: x and (x.startswith('/director/') or x.startswith('/writer/') or x.startswith('/producer/')))]
	}

	print(f'Finished {movie_details["title"]}')

	return movie_details

In [8]:
expanded_movie_list = []

for movie in movie_list:
	movie_details = get_movie_details(movie)
	if movie_details:
		expanded_movie_list.append(movie_details)

for movie in expanded_movie_list:
	print(movie)

Finished La La Land
Finished Interstellar
Finished Everything Everywhere All at Once
Finished Fight Club
Finished Dead Poets Society
Finished Eternal Sunshine of the Spotless Mind
Finished Whiplash
Finished Little Women
Finished 10 Things I Hate About You
Finished Parasite
Finished The Dark Knight
Finished The Perks of Being a Wallflower
Finished Coraline
Finished Pride & Prejudice
Finished Spirited Away
Finished Fantastic Mr. Fox
Finished Howl's Moving Castle
Finished Pulp Fiction
Finished Spider-Man: Into the Spider-Verse
Finished Good Will Hunting
Finished Call Me by Your Name
Finished The Shawshank Redemption
Finished Scream
Finished The Godfather
Finished Black Swan
Finished Lady Bird
Finished Donnie Darko
Finished Inception
Finished Scott Pilgrim vs. the World
Finished Aftersun
Finished GoodFellas
Finished The Grand Budapest Hotel
Finished Before Sunrise
Finished The Lord of the Rings: The Return of the King
Finished Back to the Future
Finished Little Miss Sunshine
Finished Inglo

In [9]:
import json

file_path = 'expanded_movie_list.json'

with open(file_path, 'w') as json_file:
	json.dump(expanded_movie_list, json_file, indent=4)

print(f'Data has been written to {file_path}')

Data has been written to expanded_movie_list.json


In [10]:

conn = sqlite3.connect('movies.db')
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS cast (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	name TEXT UNIQUE
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS genres (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	name TEXT UNIQUE
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS authors (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	name TEXT UNIQUE
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS movies (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	title TEXT UNIQUE
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS movie_cast (
	movie_id INTEGER,
	cast_id INTEGER,
	FOREIGN KEY (movie_id) REFERENCES movies(id),
	FOREIGN KEY (cast_id) REFERENCES cast(id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS movie_genres (
	movie_id INTEGER,
	genre_id INTEGER,
	FOREIGN KEY (movie_id) REFERENCES movies(id),
	FOREIGN KEY (genre_id) REFERENCES genres(id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS movie_authors (
	movie_id INTEGER,
	author_id INTEGER,
	FOREIGN KEY (movie_id) REFERENCES movies(id),
	FOREIGN KEY (author_id) REFERENCES authors(id)
)
''')

for movie in expanded_movie_list:
	cursor.execute('''
	INSERT INTO movies (title)
	VALUES (?)
	''', (movie['title'],))
	movie_id = cursor.lastrowid

	for cast_member in movie['cast']:
		cursor.execute('''
		INSERT OR IGNORE INTO cast (name)
		VALUES (?)
		''', (cast_member['name'],))
		cursor.execute('SELECT id FROM cast WHERE name = ?', (cast_member['name'],))
		cast_id = cursor.fetchone()[0]
		cursor.execute('''
		INSERT INTO movie_cast (movie_id, cast_id)
		VALUES (?, ?)
		''', (movie_id, cast_id))

	for genre in movie['genres']:
		cursor.execute('''
		INSERT OR IGNORE INTO genres (name)
		VALUES (?)
		''', (genre['name'],))
		cursor.execute('SELECT id FROM genres WHERE name = ?', (genre['name'],))
		genre_id = cursor.fetchone()[0]
		cursor.execute('''
		INSERT INTO movie_genres (movie_id, genre_id)
		VALUES (?, ?)
		''', (movie_id, genre_id))

	for author in movie['authors']:
		cursor.execute('''
		INSERT OR IGNORE INTO authors (name)
		VALUES (?)
		''', (author['name'],))
		cursor.execute('SELECT id FROM authors WHERE name = ?', (author['name'],))
		author_id = cursor.fetchone()[0]
		cursor.execute('''
		INSERT INTO movie_authors (movie_id, author_id)
		VALUES (?, ?)
		''', (movie_id, author_id))

conn.commit()
conn.close()

print('Data has been written to movies.db')

Data has been written to movies.db


In [11]:

watched_movies = ["Fight Club", "Dune: Part Two", "Gone Girl", "Her", "Joker"]

In [12]:
import sqlite3
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
conn = sqlite3.connect('movies.db')
movies_df = pd.read_sql_query("SELECT id AS movie_id, title FROM movies", conn)
authors_df = pd.read_sql_query("SELECT id AS author_id, name AS author_name FROM authors", conn)
genres_df = pd.read_sql_query("SELECT id AS genre_id, name AS genre_name FROM genres", conn)
cast_df = pd.read_sql_query("SELECT id AS cast_id, name AS cast_name FROM cast", conn)
movie_authors_df = pd.read_sql_query("SELECT movie_id, author_id FROM movie_authors", conn)
movie_genres_df = pd.read_sql_query("SELECT movie_id, genre_id FROM movie_genres", conn)
movie_cast_df = pd.read_sql_query("SELECT movie_id, cast_id FROM movie_cast", conn)
conn.close()

movie_authors_merged_df = movie_authors_df.merge(movies_df, on='movie_id').merge(authors_df, on='author_id')
movie_genres_merged_df = movie_genres_df.merge(movies_df, on='movie_id').merge(genres_df, on='genre_id')
movie_cast_merged_df = movie_cast_df.merge(movies_df, on='movie_id').merge(cast_df, on='cast_id')
authors_aggregated_df = movie_authors_merged_df.groupby(['movie_id', 'title'])['author_name'].apply(list).reset_index()
genres_aggregated_df = movie_genres_merged_df.groupby(['movie_id', 'title'])['genre_name'].apply(list).reset_index()
cast_aggregated_df = movie_cast_merged_df.groupby(['movie_id', 'title'])['cast_name'].apply(list).reset_index()

final_df = authors_aggregated_df.merge(genres_aggregated_df, on=['movie_id', 'title'], how='outer')
final_df = final_df.merge(cast_aggregated_df, on=['movie_id', 'title'], how='outer')
final_df['genres'] = final_df['genre_name'].apply(lambda x: x if isinstance(x, list) else [])
final_df['cast'] = final_df['cast_name'].apply(lambda x: x if isinstance(x, list) else [])
final_df['authors'] = final_df['author_name'].apply(lambda x: x if isinstance(x, list) else [])

final_df[['title', 'genres', 'cast', 'authors']].head(5)

Unnamed: 0,title,genres,cast,authors
0,La La Land,"[Drama, Comedy, Music, Romance]","[Ryan Gosling, Emma Stone, John Legend, Rosema...","[Damien Chazelle, Damien Chazelle, Jordan Horo..."
1,Interstellar,"[Science Fiction, Drama, Adventure]","[Matthew McConaughey, Anne Hathaway, Michael C...","[Christopher Nolan, Christopher Nolan, Lynda O..."
2,Everything Everywhere All at Once,"[Science Fiction, Adventure, Comedy, Action]","[Michelle Yeoh, Stephanie Hsu, Ke Huy Quan, Ja...","[Daniel Scheinert, Daniel Kwan, Daniel Scheine..."
3,Fight Club,[Drama],"[Edward Norton, Brad Pitt, Helena Bonham Carte...","[David Fincher, David Fincher, Ross Grayson Be..."
4,Dead Poets Society,[Drama],"[Robin Williams, Robert Sean Leonard, Ethan Ha...","[Peter Weir, Peter Weir, Steven Haft, Paul Jun..."


In [13]:
import numpy as np


mlb = MultiLabelBinarizer()

genres_encoded = mlb.fit_transform(final_df['genres'])
cast_encoded = mlb.fit_transform(final_df['cast'])
authors_encoded = mlb.fit_transform(final_df['authors'])

features = np.hstack([genres_encoded, cast_encoded, authors_encoded])


In [14]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors = 10 + len(watched_movies), algorithm='brute', metric='cosine')

knn.fit(features)


def recommend_movies_with_list(previously_watched, df, knn_model, feature_matrix):
    watched_indices = df[df['title'].isin(previously_watched)].index
    

    watched_features = feature_matrix[watched_indices]
    average_features = np.mean(watched_features, axis=0).reshape(1, -1)
    
    distances, indices = distances, indices = knn_model.kneighbors(average_features) 
    recommendations = df.iloc[indices[0]] 
    recommendations = recommendations[~recommendations['title'].isin(previously_watched)]
    
    movie_distances = distances[0][:len(recommendations)]
    movie_probabilities = 1 / (1 + movie_distances) 
    
    recommendations['probability'] = movie_probabilities
    
    return recommendations[['title', 'genres', 'cast', 'authors', 'probability']]

previously_watched_movies = ["La La Land"]

recommendations = recommend_movies_with_list(previously_watched_movies, final_df, knn, features)

recommendations[['title', 'probability']]


Unnamed: 0,title,probability
94,(500) Days of Summer,1.0
6,Whiplash,0.517264
97,Chungking Express,0.515158
122,Juno,0.512195
162,Babylon,0.511482
199,The Sound of Music,0.509393
156,Before Sunset,0.509133
41,Portrait of a Lady on Fire,0.508979
65,In the Mood for Love,0.508722
141,The Breakfast Club,0.508722
