In [1]:
import sqlite3
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Connect to your SQLite database
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query, conn)
ratings_df.dropna(subset=['rating'], inplace=True)
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df['username'] = ratings_df['username'].astype(str)
ratings_df['movie_name'] = ratings_df['movie_name'].astype(str)

query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movie_details_df = pd.read_sql(query_movie_details, conn)
#rename movie_name to  real_movie_name
movie_details_df.rename(columns={'movie_name': 'real_movie_name'}, inplace=True)
# rename letterboxd_slug to movie_name
movie_details_df.rename(columns={'letterboxd_slug': 'movie_name'}, inplace=True)

# only keep the movies that are in the movie_details_df
ratings_df = ratings_df[ratings_df['movie_name'].isin(movie_details_df['movie_name'])]

# Example of filtering out movies and users with fewer than a certain number of ratings
min_movie_ratings = 25 # Movies with fewer than 10 ratings
min_user_ratings = 50 # Users with fewer than 5 ratings
print(len(ratings_df))
filtered_ratings = ratings_df.groupby('movie_name').filter(lambda x: len(x) >= min_movie_ratings)
filtered_ratings = filtered_ratings.groupby('username').filter(lambda x: len(x) >= min_user_ratings)
# print('hello')
# Proceed with the filtered_ratings DataFrame
ratings_df = filtered_ratings
print(len(ratings_df))
conn.close()

# split the data into training and testing
from sklearn.model_selection import train_test_split

# train test split usernames
train_users, test_users = train_test_split(ratings_df['username'].unique(), test_size=0.2, random_state=42)

# split the data into training and testing
test_data = ratings_df[ratings_df['username'].isin(test_users)]
ratings_df = ratings_df[ratings_df['username'].isin(train_users)]

print('test data made')


# Create a user-movie ratings matrix
user_movie_ratings = ratings_df.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)

print('pivot table made')
# Convert to sparse matrix
ratings_matrix = csr_matrix(user_movie_ratings.values)

# Apply SVD
svd = TruncatedSVD(n_components=30) # You can adjust the number of components
matrix_reduced = svd.fit_transform(ratings_matrix)

print('svd made')
# Compute similarity scores
user_similarity = cosine_similarity(matrix_reduced)

def predict_top_movies(user_index, top_k=10):
    similarity_scores = list(enumerate(user_similarity[user_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_users_indices = [i[0] for i in similarity_scores[1:top_k+1]] # Skip self
    top_users_ratings = user_movie_ratings.iloc[top_users_indices].mean(axis=0)
    recommended_movies = top_users_ratings.sort_values(ascending=False).index.tolist()
    return recommended_movies[:top_k]

# Example usage
user_index = 0 # Assuming you want recommendations for the first user in the dataset
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")


# print top predicted movies for specific user
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")

# save the model and the user_movie_ratings
import pickle
with open('user_movie_ratings.pkl', 'wb') as f:
    pickle.dump(user_movie_ratings, f)
print('user_movie_ratings saved')
with open('user_similarity.pkl', 'wb') as f:
    pickle.dump(user_similarity, f)
print('user_similarity saved')
# save the svd model
with open('svd.pkl', 'wb') as f:
    pickle.dump(svd, f)
print('svd saved')
# load everything back

# predict top movies for a user
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")

  from pandas.core import (


12632323
12571299
test data made
pivot table made
svd made
Top recommended movies: ['barbie', 'fight-club', 'black-swan', 'girl-interrupted', 'saltburn', 'whiplash-2014', 'gone-girl', 'knives-out-2019', 'get-out-2017', 'the-devil-wears-prada']
Top recommended movies: ['parasite-2019', 'inglourious-basterds', 'the-dark-knight', 'the-social-network', 'everything-everywhere-all-at-once', 'whiplash-2014', 'la-la-land', 'spider-man-into-the-spider-verse', 'the-batman', 'pulp-fiction']
user_movie_ratings saved
user_similarity saved
svd saved
Top recommended movies: ['parasite-2019', 'inglourious-basterds', 'the-dark-knight', 'the-social-network', 'everything-everywhere-all-at-once', 'whiplash-2014', 'la-la-land', 'spider-man-into-the-spider-verse', 'the-batman', 'pulp-fiction']


In [2]:
import pickle
with open('user_movie_ratings.pkl', 'rb') as f:
    user_movie_ratings = pickle.load(f)
print('user_movie_ratings loaded')
with open('user_similarity.pkl', 'rb') as f:
    user_similarity = pickle.load(f)
print('user_similarity loaded')
with open('svd.pkl', 'rb') as f:
    svd = pickle.load(f)
print('svd loaded')

user_movie_ratings loaded
user_similarity loaded
svd loaded


In [3]:
import numpy as np
from tqdm import tqdm

def predict_top_movies(user_index, top_k=10):
    # Compute similarity scores with other users
    similarity_scores = list(enumerate(user_similarity[user_index]))
    # Sort users by similarity score in descending order (most similar first)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get indices of top_k similar users (excluding the user itself which is at index 0)
    top_users_indices = []
    for i in (range(1, 1000)):  # Considering top 999 similar users after excluding the user itself
        top_users_indices.append(similarity_scores[i][0])
    
    # Select the ratings of these top users
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]

# Example usage
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=100)
print(f"Top recommended movies: {top_movies}")




ayoo
ayoo2
Top recommended movies: movie_name
the-lord-of-the-rings-2003          9.666667
macario                             9.625000
the-best-of-youth                   9.500000
dune-part-two                       9.400000
berserk-1997                        9.400000
                                      ...   
a-separation                        8.738318
witness-for-the-prosecution-1957    8.731707
once-upon-a-time-in-the-west        8.729508
red-beard                           8.727273
rififi                              8.727273
Length: 100, dtype: float64


In [4]:
def predict_movies_for_new_user(new_user_ratings, top_k=10):
    # Integrate new user ratings into the existing user-movie matrix
    # Create a Series from the new user ratings, reindexing to match the columns of the existing matrix
    new_user_series = pd.Series(new_user_ratings).reindex(user_movie_ratings.columns).fillna(0)
    
    # Append this user to the existing matrix and transform using the existing SVD model
    new_user_vector = svd.transform(csr_matrix(new_user_series.values.reshape(1, -1)))

    # Compute cosine similarity between this new user and all other users
    new_user_similarity = cosine_similarity(new_user_vector, matrix_reduced).flatten()

    # Exclude the new user's self-comparison and get indices of top similar users
    top_users_indices = np.argsort(-new_user_similarity)[1:1000]
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]
    # remove TV Movie genre
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('TV Movie')]['movie_name'])]
    # remove Music
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Music')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    # recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]


print(test_data.shape)
test_user_movie_ratings = test_data.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)
# example usage from random user from test data
test_user = test_user_movie_ratings.index[0]
test_user_ratings = test_user_movie_ratings.loc[test_user]
top_movies = predict_movies_for_new_user(test_user_ratings, top_k=100)
print(f"Top recommended movies for new user: {top_movies}")


(2516863, 3)
ayoo
ayoo2
Top recommended movies for new user: movie_name
the-lord-of-the-rings-2003    10.000000
how-green-was-my-valley       10.000000
a-very-sunny-christmas         9.857143
gangs-of-wasseypur-part-2      9.571429
central-station                9.512195
                                ...    
funny-girl                     8.888889
vampire-hunter-d-bloodlust     8.888889
a-brighter-summer-day          8.888889
a-summers-tale                 8.888889
the-secret-in-their-eyes       8.885714
Length: 100, dtype: float64


In [5]:
# Splitting genres and actors into lists
movie_details_df['genres'] = movie_details_df['genres'].apply(lambda x: x.split(', ') if x else [])
movie_details_df['actors'] = movie_details_df['actors'].apply(lambda x: x.split(', ') if x else [])


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Assuming 'movies_details_df' has columns 'genres' and 'actors' properly formatted as lists of strings
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(movie_details_df['genres'])

print('genres encoded')
# Calculate actor frequency
actor_counts = movie_details_df['actors'].explode().value_counts()

# Select top N actors (e.g., top 100 actors)
top_actors = actor_counts.head(150).index

# Filter actors data to include only top actors
filtered_actors = movie_details_df['actors'].apply(lambda x: [actor for actor in x if actor in top_actors])


mlb_actors = MultiLabelBinarizer()
actors_encoded = mlb_actors.fit_transform(filtered_actors)
actor_columns = ['actor_' + col for col in mlb_actors.classes_]
df_actors_encoded = pd.DataFrame(actors_encoded, columns=actor_columns)
print('actors encoded')
# Adding prefixes to the new columns to avoid any overlap
genre_columns = ['genre_' + col for col in mlb_genres.classes_]
actor_columns = ['actor_' + col for col in mlb_actors.classes_]

print('genre columns')
#print out the genre columns
print(genre_columns)
# Creating DataFrames from the encoded arrays
df_genres_encoded = pd.DataFrame(genres_encoded, columns=genre_columns)
df_actors_encoded = pd.DataFrame(actors_encoded, columns=actor_columns)

print('df genres encoded')
# Joining the new DataFrames back to the original DataFrame
# Ensuring the index aligns if the DataFrame indexes have been altered
movie_details_df = movie_details_df.join(df_genres_encoded)
print('joined genres')
movie_details_df = movie_details_df.join(df_actors_encoded)
print('joined actors')
# Check the updated DataFrame
print(movie_details_df.head())
print(movie_details_df.shape)



genres encoded
actors encoded
genre columns
['genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music', 'genre_Mystery', 'genre_Romance', 'genre_Science Fiction', 'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western']
df genres encoded
joined genres
joined actors
                             movie_name  \
0                                  nope   
1                        captain-marvel   
2  dungeons-dragons-honor-among-thieves   
3                   john-wick-chapter-4   
4                          cocaine-bear   

                           real_movie_name  \
0                                     Nope   
1                           Captain Marvel   
2  Dungeons & Dragons: Honor Among Thieves   
3                     John Wick: Chapter 4   
4                             Cocaine Bear   

                                 director  \
0

In [7]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Assuming `user_movie_ratings` is already available and `svd` model is trained
# Generate the SVD vectors for each user
ratings_matrix = csr_matrix(user_movie_ratings.values)
user_svd_vectors = svd.transform(ratings_matrix)  # This produces a matrix of shape (n_users, n_components=20)
print('user_svd_vectors made')
print(user_svd_vectors.shape)
# Create a DataFrame for SVD vectors, naming the columns for clarity
df_user_svd = pd.DataFrame(user_svd_vectors, index=user_movie_ratings.index, columns=[f'svd_{i}' for i in range(user_svd_vectors.shape[1])])
print('df_user_svd made')
print(df_user_svd.shape)
# Flatten the user-movie matrix to create a long format DataFrame
user_movie_long = user_movie_ratings.stack().reset_index()
user_movie_long.columns = ['username', 'movie_name', 'rating']
print('user_movie_long made')
print(user_movie_long.shape)

# remove and rows with rating 0
user_movie_long = user_movie_long[user_movie_long['rating'] > 0]
print('removed rows with rating 0')
# Merge with SVD vectors
# Ensure only relevant SVD columns are merged
# split user_movie_long into train and test
train_data, user_movie_long = train_test_split(user_movie_long, test_size=0.7, random_state=42)
user_movie_long = user_movie_long.merge(df_user_svd, on='username', how='left')
print('merged with svd')

# Merge with the movie details DataFrame
# Here, make sure that movie_details_df is prepared and contains only the necessary columns
# get column list from movie_details_df which contains all genre_columns and actor_columns and movie_name
columns = ['movie_name'] + genre_columns + actor_columns
user_movie_details = user_movie_long.merge(movie_details_df[columns], on='movie_name', how='left')
print('merged with movie details')

# The DataFrame `user_movie_details` now has the user's SVD vector, movie's genre and actor encoding, and the rating
# Display the final DataFrame to verify
print(user_movie_details.head())


user_svd_vectors made
(17446, 30)
df_user_svd made
(17446, 30)
user_movie_long made
(211742102, 3)
removed rows with rating 0
merged with svd
merged with movie details
        username             movie_name  rating       svd_0       svd_1  \
0    blovesfilms  the-book-of-life-2014     7.0  205.885276  -36.400303   
1  markkaiserman              eagle-eye     3.0  307.517043  135.001787   
2    spencer1993         absolute-power     7.0  206.846128   80.947388   
3     parzivalwb     return-of-the-jedi    10.0   42.939888  -25.894714   
4       filipapo        kill-bill-vol-1     8.0   67.276803  -27.201641   

        svd_2      svd_3       svd_4      svd_5      svd_6  ...  \
0    8.632431  39.958126   23.295988 -38.835703   4.625912  ...   
1   13.148530 -39.385565  126.323739  46.683650  75.392187  ...   
2  107.367080  -4.304481  -74.862611 -15.760836 -41.067279  ...   
3   11.535955 -21.022394  -13.796478  -0.734280  16.035644  ...   
4   -9.009210 -25.637562  -23.802986   7.10218

In [8]:
print(user_movie_details.shape)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Assume the DataFrame `user_movie_details` includes necessary numeric features
# Encode categorical features if they're not yet encoded
# Example for encoding (uncomment if needed):
# encoder = OneHotEncoder()
# encoded_features = encoder.fit_transform(user_movie_details[['genres', 'actors']].apply(lambda x: ','.join(x), axis=1))
print(user_movie_details.columns)
# remove and row where rating is 0
print(len(user_movie_details))
user_movie_details = user_movie_details[user_movie_details['rating'] != 0]
print(len(user_movie_details))
# Here, we consider only the SVD features and numeric encoding of genres and actors
X = user_movie_details.drop(['username', 'movie_name', 'rating'], axis=1)
y = user_movie_details['rating']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# make nn model
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))
# set learning rate
model.compile(optimizer='adam', loss='mse')

# set the learning rate
model.optimizer.lr = 0.001

# Train the model
model.fit(X_train, y_train, verbose=1, epochs = 3)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using RMSE
mse = mean_squared_error(y_test, y_pred)
print(y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")


(7038106, 202)
Index(['username', 'movie_name', 'rating', 'svd_0', 'svd_1', 'svd_2', 'svd_3',
       'svd_4', 'svd_5', 'svd_6',
       ...
       'actor_Tom Cruise', 'actor_Tom Hanks', 'actor_Tommy Lee Jones',
       'actor_Tress MacNeille', 'actor_Udo Kier', 'actor_Wallace Shawn',
       'actor_Will Ferrell', 'actor_Willem Dafoe', 'actor_William H. Macy',
       'actor_Woody Harrelson'],
      dtype='object', length=202)
7038106
7038106


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/3
[1m175953/175953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 896us/step - loss: 3.7036
Epoch 2/3
[1m175953/175953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 903us/step - loss: 3.3685
Epoch 3/3
[1m175953/175953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 958us/step - loss: 3.2601
[1m43989/43989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 376us/step
[[7.0892377]
 [7.6253376]
 [6.310537 ]
 ...
 [6.4555826]
 [7.453272 ]
 [7.9161596]]
Root Mean Squared Error: 1.7924593836095242
Mean Absolute Error: 1.3861003101097058


In [9]:
# save the model
model.save('nn_model.keras')
print('model saved')

model saved


In [10]:
import pandas as pd
import numpy as np

def predict_top_movies_nn(user_index, top_k=10):
    # Get the SVD vector for the user
    user_svd_vector = df_user_svd.iloc[user_index].values.reshape(1, -1)

    # Get a list of movies that the user has not rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    unrated_movies = user_rated_movies[user_rated_movies == 0].index

    # Get the movie details for unrated movies
    unrated_movie_details = movie_details_df[movie_details_df['movie_name'].isin(unrated_movies)]

    # Assume the DataFrame `unrated_movie_details` already includes pre-processed and appropriately encoded features
    # Repeat the user SVD vector to match the number of unrated movies
    user_features = np.tile(user_svd_vector, (len(unrated_movies), 1))

    # Prepare the features for prediction
    X_pred = np.hstack([user_features, unrated_movie_details.drop(['movie_name', 'real_movie_name', 'director', 'genres', 'actors'], axis=1).values])
    # print out 1 row of X_pred
    # print(X_pred[0])
    # print('=====')
    # Predict ratings for the unrated movies
    y_pred = model.predict(X_pred)

    # Get the top_k movies with the highest predicted ratings
    top_indices = np.argsort(-y_pred.flatten())[:top_k]
    top_movies = unrated_movie_details.iloc[top_indices]['movie_name'].values
    top_ratings = y_pred.flatten()[top_indices]

    return top_movies, top_ratings

# Example usage
username = 'nconterno'
user_index = user_movie_ratings.index.get_loc(username)
top_movies, top_ratings = predict_top_movies_nn(user_index, top_k=1000)

# print out the top movies and ratings side by side
for i in range(len(top_movies)):
    print(f"{top_movies[i]}: {top_ratings[i]}")

# create movie_name to real_movie_name mapping


[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 364us/step
the-cider-house-rules: 9.870400428771973
apocalypse-now: 9.19291877746582
the-godfather-part-ii: 9.117862701416016
margaret: 9.1046724319458
for-love-of-the-game: 9.09072494506836
escape-to-victory: 8.903644561767578
spielberg: 8.879311561584473
maggies-plan: 8.856995582580566
a-bridge-too-far: 8.82575798034668
the-man-who-would-be-king: 8.80593204498291
the-hurricane: 8.802148818969727
zulu: 8.788313865661621
the-beaches-of-agnes: 8.749430656433105
whiplash-2013: 8.73904800415039
captain-corellis-mandolin: 8.704524993896484
the-kid-stays-in-the-picture: 8.698403358459473
stronger: 8.671281814575195
quills: 8.667285919189453
bird: 8.617585182189941
war-is-over-inspired-by-the-music-of-john-and-yoko: 8.588332176208496
joe-kidd: 8.564257621765137
things-to-do-in-denver-when-youre-dead: 8.559919357299805
one-flew-over-the-cuckoos-nest: 8.549381256103516
quincy: 8.549304962158203
unforgiven: 8.5132474899292
awaken

In [11]:
movie_name_to_real_movie_name = movie_details_df.set_index('movie_name')['real_movie_name'].to_dict()
# save the dictionary to a file
import pickle
with open('movie_name_to_real_movie_name.pkl', 'wb') as f:
    pickle.dump(movie_name_to_real_movie_name, f)

In [24]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import threading
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

semaphore = threading.Semaphore(10)

SCORE_DICT = {
    "½": 1,
    "★": 2,
    "★½": 3,
    "★★": 4,
    "★★½": 5,
    "★★★": 6,
    "★★★½": 7,
    "★★★★": 8,
    "★★★★½": 9,
    "★★★★★": 10,
}


def scrape_letterboxd_films(username, start_page=1, max_pages=None):
    """Scrapes film names from a Letterboxd user's 'films watched' pages.

    Args:
        username (str): The Letterboxd username.
        start_page (int): The page number to start scraping from. Defaults to 1.
        max_pages (int): The maximum number of pages to scrape. Defaults to None (scrape all).
    """

    films = []
    page_num = start_page

    while True:
        url = f"https://letterboxd.com/{username}/films/page/{page_num}/"
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        new_films=[]

        #find all the film elements, they are always after 'data-film-name'
        for film_element in soup.find_all('li', class_='poster-container'):
            title = film_element.find('div', class_='film-poster').get('data-film-slug')
            rating_element = film_element.find('span', class_='rating')

            if rating_element:
                rating = rating_element.text.strip()
                rating = SCORE_DICT.get(rating, None)
            else:
                rating = None
            new_films.append((title, rating))
            films.append((title, rating))


        page_num += 1
        if len(new_films) == 0 or (max_pages and page_num > max_pages):
            break

    return films

films = scrape_letterboxd_films('kgreed', start_page=1)
print('scraped films')
print(films)

# create user_vector for the user
user_vector = [0] * len(movie_details_df)
filmsSeen = []
for film in films:
    filmsSeen.append(film[0])
    if film[0] in movie_details_df['movie_name'].values:
        movie_index = movie_details_df[movie_details_df['movie_name'] == film[0]].index[0]
        if film[1] != None:
            user_vector[movie_index] = film[1]

# use svd to transform the user_vector
user_vector = svd.transform(np.array(user_vector).reshape(1, -1))

# merge the user_vector with the movie_details_df
df_user_vector = pd.DataFrame(user_vector, columns=[f'svd_{i}' for i in range(user_vector.shape[1])])
print('df_user_vector made')

# get recommended movies
user_features = np.tile(user_vector, (len(movie_details_df), 1))

X_pred = np.hstack([user_features, movie_details_df.drop(['movie_name', 'real_movie_name', 'director', 'genres', 'actors'], axis=1).values])
y_pred = model.predict(X_pred)

# Get the top_k movies with the highest predicted ratings
top_indices = np.argsort(-y_pred.flatten())[:1000]
top_movies = movie_details_df.iloc[top_indices]['movie_name'].values
top_ratings = y_pred.flatten()[top_indices]

# drop movies (and corresponding rating) the user has already seen 

top_ratings = [top_ratings[i] for i in range(len(top_movies)) if top_movies[i] not in filmsSeen]
top_movies = [top_movies[i] for i in range(len(top_movies)) if top_movies[i] not in filmsSeen]

# load the movie_name_to_real_movie_name dictionary
with open('movie_name_to_real_movie_name.pkl', 'rb') as f:
    movie_name_to_real_movie_name = pickle.load(f)

top_movies_real_names = [movie_name_to_real_movie_name[movie] for movie in top_movies]

# get highest rating
highest_rating = max(top_ratings)
if highest_rating>=10:
    top_ratings/=((highest_rating/10)+.01)

# print out the top movies and ratings side by side
for i in range(len(top_movies)):
    print(f"{top_movies_real_names[i]}: {top_ratings[i]}")





scraped films
[('barbie', 10), ('whiplash-2014', 10)]
df_user_vector made
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597us/step
Madonna: Truth or Dare: 9.904483795166016
Spielberg: 9.8209867477417
Saving Private Ryan: 9.812543869018555
The Cider House Rules: 9.763993263244629
Quincy: 9.75294017791748
Dead Man: 9.638667106628418
Pulp Fiction: 9.517415046691895
Ride with the Devil: 9.487940788269043
Homecoming: A Film by Beyoncé: 9.416316032409668
Waltz with Bashir: 9.383439064025879
Guillermo del Toro's Pinocchio: 9.377897262573242
Summer of Soul (...Or, When the Revolution Could Not Be Televised): 9.310393333435059
Whitney: 9.302732467651367
Corpse Bride: 9.299956321716309
WAR IS OVER! Inspired by the Music of John & Yoko: 9.294678688049316
In Search of Darkness: Part II: 9.274405479431152
Guillermo del Toro's Pinocchio: Handcarved Cinema: 9.263187408447266
I Am Chris Farley: 9.201761245727539
Five Came Back: 9.19600772857666
Love & Mercy: 9.182246208190918
The G