In [8]:
import sqlite3
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Connect to your SQLite database
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query, conn)
ratings_df.dropna(subset=['rating'], inplace=True)
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df['username'] = ratings_df['username'].astype(str)
ratings_df['movie_name'] = ratings_df['movie_name'].astype(str)

query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movie_details_df = pd.read_sql(query_movie_details, conn)


# Example of filtering out movies and users with fewer than a certain number of ratings
min_movie_ratings = 25 # Movies with fewer than 10 ratings
min_user_ratings = 50 # Users with fewer than 5 ratings
print(len(ratings_df))
filtered_ratings = ratings_df.groupby('movie_name').filter(lambda x: len(x) >= min_movie_ratings)
filtered_ratings = filtered_ratings.groupby('username').filter(lambda x: len(x) >= min_user_ratings)
# print('hello')
# Proceed with the filtered_ratings DataFrame
ratings_df = filtered_ratings
print(len(ratings_df))
conn.close()

# split the data into training and testing
from sklearn.model_selection import train_test_split

# train test split usernames
train_users, test_users = train_test_split(ratings_df['username'].unique(), test_size=0.2, random_state=42)

# split the data into training and testing
test_data = ratings_df[ratings_df['username'].isin(test_users)]
ratings_df = ratings_df[ratings_df['username'].isin(train_users)]



# Create a user-movie ratings matrix
user_movie_ratings = ratings_df.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)

# Convert to sparse matrix
ratings_matrix = csr_matrix(user_movie_ratings.values)

# Apply SVD
svd = TruncatedSVD(n_components=20) # You can adjust the number of components
matrix_reduced = svd.fit_transform(ratings_matrix)

# Compute similarity scores
user_similarity = cosine_similarity(matrix_reduced)

def predict_top_movies(user_index, top_k=10):
    similarity_scores = list(enumerate(user_similarity[user_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_users_indices = [i[0] for i in similarity_scores[1:top_k+1]] # Skip self
    top_users_ratings = user_movie_ratings.iloc[top_users_indices].mean(axis=0)
    recommended_movies = top_users_ratings.sort_values(ascending=False).index.tolist()
    return recommended_movies[:top_k]

# Example usage
user_index = 0 # Assuming you want recommendations for the first user in the dataset
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")


# print top predicted movies for specific user
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")


15150737
14161837
Top recommended movies: ['saltburn', 'gone-girl', 'the-menu-2022', 'whiplash-2014', 'barbie', 'the-hunger-games-the-ballad-of-songbirds-snakes', 'fight-club', 'interstellar', 'la-la-land', 'black-swan']
Top recommended movies: ['the-dark-knight', 'everything-everywhere-all-at-once', 'parasite-2019', 'the-social-network', 'whiplash-2014', 'arrival-2016', 'spider-man-into-the-spider-verse', 'top-gun-maverick', 'mission-impossible-fallout', 'no-country-for-old-men']


In [9]:
conn = sqlite3.connect('my_letterboxd_data.db')
query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movie_details_df = pd.read_sql(query_movie_details, conn)
conn.close()

#rename movie_name to  real_movie_name
movie_details_df.rename(columns={'movie_name': 'real_movie_name'}, inplace=True)
# rename letterboxd_slug to movie_name
movie_details_df.rename(columns={'letterboxd_slug': 'movie_name'}, inplace=True)

In [10]:
import numpy as np
from tqdm import tqdm

def predict_top_movies(user_index, top_k=10):
    # Compute similarity scores with other users
    similarity_scores = list(enumerate(user_similarity[user_index]))
    # Sort users by similarity score in descending order (most similar first)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get indices of top_k similar users (excluding the user itself which is at index 0)
    top_users_indices = []
    for i in (range(1, 1000)):  # Considering top 999 similar users after excluding the user itself
        top_users_indices.append(similarity_scores[i][0])
    
    # Select the ratings of these top users
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]

# Example usage
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=100)
print(f"Top recommended movies: {top_movies}")




ayoo
ayoo2
Top recommended movies: movie_name
dune-part-two                     9.492958
the-lord-of-the-rings-2003        9.454545
the-best-of-youth                 9.428571
the-big-city                      9.400000
a-brighter-summer-day             9.375000
                                    ...   
scenes-from-a-marriage-2021       8.673913
bo-burnham-inside                 8.671916
the-hunt-2012                     8.669091
marcel-the-shell-with-shoes-on    8.666667
monster-2004                      8.666667
Length: 100, dtype: float64


In [11]:
def predict_movies_for_new_user(new_user_ratings, top_k=10):
    # Integrate new user ratings into the existing user-movie matrix
    # Create a Series from the new user ratings, reindexing to match the columns of the existing matrix
    new_user_series = pd.Series(new_user_ratings).reindex(user_movie_ratings.columns).fillna(0)
    
    # Append this user to the existing matrix and transform using the existing SVD model
    new_user_vector = svd.transform(csr_matrix(new_user_series.values.reshape(1, -1)))

    # Compute cosine similarity between this new user and all other users
    new_user_similarity = cosine_similarity(new_user_vector, matrix_reduced).flatten()

    # Exclude the new user's self-comparison and get indices of top similar users
    top_users_indices = np.argsort(-new_user_similarity)[1:1000]
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    # recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]


print(test_data.shape)
test_user_movie_ratings = test_data.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)
# example usage from random user from test data
test_user = test_user_movie_ratings.index[0]
test_user_ratings = test_user_movie_ratings.loc[test_user]
top_movies = predict_movies_for_new_user(test_user_ratings, top_k=100)
print(f"Top recommended movies for new user: {top_movies}")


(2800736, 3)
ayoo
ayoo2
Top recommended movies for new user: movie_name
monster-2004                                     9.700000
the-lord-of-the-rings-2003                       9.695652
high-and-low                                     9.636364
the-red-shoes                                    9.600000
the-holy-mountain                                9.571429
                                                   ...   
amadeus                                          8.972973
the-silence-of-the-lambs                         8.965517
the-batman                                       8.960739
neon-genesis-evangelion-the-end-of-evangelion    8.960526
logan-2017                                       8.956357
Length: 100, dtype: float64


In [12]:
# using the test data to evaluate the model using MAE and MSE and predict_movies_for_new_user
from sklearn.metrics import mean_absolute_error, mean_squared_error

# for 100 users in the test data
mae = []
mse = []
# unique movies in the test data
unique_movies = test_data['movie_name'].unique()
for i in tqdm(range(10)):
    test_user = test_user_movie_ratings.index[i]
    test_user_ratings = test_user_movie_ratings.loc[test_user]
    top_movies = predict_movies_for_new_user(test_user_ratings, top_k=len(unique_movies))
    # get error for the user
    for movie in test_user_ratings.index:
        if movie in top_movies.index:
            mae.append(mean_absolute_error([test_user_ratings[movie]], [top_movies[movie]]))
            mse.append(mean_squared_error([test_user_ratings[movie]], [top_movies[movie]]))
        else:
            mae.append(mean_absolute_error([test_user_ratings[movie]], [0]))
            mse.append(mean_squared_error([test_user_ratings[movie]], [0]))

print(f"Mean Absolute Error: {np.mean(mae)}")
print(f"Mean Squared Error: {np.mean(mse)}")
# print rmse
print(f"Root Mean Squared Error: {np.sqrt(np.mean(mse))}")

  0%|          | 0/10 [00:00<?, ?it/s]

ayoo
ayoo2


 10%|█         | 1/10 [00:11<01:46, 11.83s/it]

ayoo
ayoo2


 20%|██        | 2/10 [00:23<01:33, 11.69s/it]

ayoo
ayoo2


 30%|███       | 3/10 [00:35<01:21, 11.70s/it]

ayoo
ayoo2


 40%|████      | 4/10 [00:46<01:09, 11.59s/it]

ayoo
ayoo2


 50%|█████     | 5/10 [00:58<00:59, 11.89s/it]

ayoo
ayoo2


 60%|██████    | 6/10 [01:10<00:47, 11.80s/it]

ayoo
ayoo2


 70%|███████   | 7/10 [01:22<00:35, 11.73s/it]

ayoo
ayoo2


 80%|████████  | 8/10 [01:34<00:23, 11.77s/it]

ayoo
ayoo2


 90%|█████████ | 9/10 [01:46<00:11, 11.83s/it]

ayoo
ayoo2


100%|██████████| 10/10 [01:57<00:00, 11.75s/it]

Mean Absolute Error: 1.0259039890937587
Mean Squared Error: 6.972756546019952
Root Mean Squared Error: 2.640597763011238



