In [1]:
import sqlite3
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Connect to your SQLite database
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query, conn)
ratings_df.dropna(subset=['rating'], inplace=True)
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df['username'] = ratings_df['username'].astype(str)
ratings_df['movie_name'] = ratings_df['movie_name'].astype(str)

query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movie_details_df = pd.read_sql(query_movie_details, conn)


# Example of filtering out movies and users with fewer than a certain number of ratings
min_movie_ratings = 25 # Movies with fewer than 10 ratings
min_user_ratings = 50 # Users with fewer than 5 ratings
print(len(ratings_df))
filtered_ratings = ratings_df.groupby('movie_name').filter(lambda x: len(x) >= min_movie_ratings)
filtered_ratings = filtered_ratings.groupby('username').filter(lambda x: len(x) >= min_user_ratings)
# print('hello')
# Proceed with the filtered_ratings DataFrame
ratings_df = filtered_ratings
print(len(ratings_df))
conn.close()

# split the data into training and testing
from sklearn.model_selection import train_test_split

# train test split usernames
train_users, test_users = train_test_split(ratings_df['username'].unique(), test_size=0.2, random_state=42)

# split the data into training and testing
test_data = ratings_df[ratings_df['username'].isin(test_users)]
ratings_df = ratings_df[ratings_df['username'].isin(train_users)]



# Create a user-movie ratings matrix
user_movie_ratings = ratings_df.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)

# Convert to sparse matrix
ratings_matrix = csr_matrix(user_movie_ratings.values)

# Apply SVD
svd = TruncatedSVD(n_components=20) # You can adjust the number of components
matrix_reduced = svd.fit_transform(ratings_matrix)

# Compute similarity scores
user_similarity = cosine_similarity(matrix_reduced)

def predict_top_movies(user_index, top_k=10):
    similarity_scores = list(enumerate(user_similarity[user_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_users_indices = [i[0] for i in similarity_scores[1:top_k+1]] # Skip self
    top_users_ratings = user_movie_ratings.iloc[top_users_indices].mean(axis=0)
    recommended_movies = top_users_ratings.sort_values(ascending=False).index.tolist()
    return recommended_movies[:top_k]

# Example usage
user_index = 0 # Assuming you want recommendations for the first user in the dataset
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")


# print top predicted movies for specific user
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=10)
print(f"Top recommended movies: {top_movies}")


15150737
14161837
Top recommended movies: ['whiplash-2014', 'saltburn', 'the-menu-2022', 'oppenheimer-2023', 'barbie', 'black-swan', 'interstellar', 'fight-club', 'la-la-land', 'scream']
Top recommended movies: ['the-dark-knight', 'everything-everywhere-all-at-once', 'parasite-2019', 'whiplash-2014', 'spider-man-into-the-spider-verse', 'mission-impossible-fallout', 'top-gun-maverick', 'arrival-2016', 'inglourious-basterds', 'the-batman']


In [2]:
conn = sqlite3.connect('my_letterboxd_data.db')
query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movie_details_df = pd.read_sql(query_movie_details, conn)
conn.close()

#rename movie_name to  real_movie_name
movie_details_df.rename(columns={'movie_name': 'real_movie_name'}, inplace=True)
# rename letterboxd_slug to movie_name
movie_details_df.rename(columns={'letterboxd_slug': 'movie_name'}, inplace=True)

In [3]:
import numpy as np
from tqdm import tqdm

def predict_top_movies(user_index, top_k=10):
    # Compute similarity scores with other users
    similarity_scores = list(enumerate(user_similarity[user_index]))
    # Sort users by similarity score in descending order (most similar first)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get indices of top_k similar users (excluding the user itself which is at index 0)
    top_users_indices = []
    for i in (range(1, 1000)):  # Considering top 999 similar users after excluding the user itself
        top_users_indices.append(similarity_scores[i][0])
    
    # Select the ratings of these top users
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]

# Example usage
user_index = user_movie_ratings.index.get_loc('nconterno')
top_movies = predict_top_movies(user_index, top_k=100)
print(f"Top recommended movies: {top_movies}")




ayoo
ayoo2
Top recommended movies: movie_name
the-lord-of-the-rings-2003         9.454545
a-brighter-summer-day              9.375000
the-godfather-part-ii              9.374046
chernobyl                          9.275229
mishima-a-life-in-four-chapters    9.263158
                                     ...   
short-cuts                         8.555556
a-man-escaped                      8.550000
three-colours-red                  8.545455
the-iron-giant                     8.542029
sunset-boulevard                   8.538922
Length: 100, dtype: float64


In [4]:
def predict_movies_for_new_user(new_user_ratings, top_k=10):
    # Integrate new user ratings into the existing user-movie matrix
    # Create a Series from the new user ratings, reindexing to match the columns of the existing matrix
    new_user_series = pd.Series(new_user_ratings).reindex(user_movie_ratings.columns).fillna(0)
    
    # Append this user to the existing matrix and transform using the existing SVD model
    new_user_vector = svd.transform(csr_matrix(new_user_series.values.reshape(1, -1)))

    # Compute cosine similarity between this new user and all other users
    new_user_similarity = cosine_similarity(new_user_vector, matrix_reduced).flatten()

    # Exclude the new user's self-comparison and get indices of top similar users
    top_users_indices = np.argsort(-new_user_similarity)[1:1000]
    top_users_ratings = user_movie_ratings.iloc[top_users_indices]

    # Filter movies where less than 5 users rated it (non-zero ratings)
    valid_movies = top_users_ratings.apply(lambda x: x > 0).sum(axis=0) >= 5
    top_users_ratings = top_users_ratings.loc[:, valid_movies]

    # Calculate the mean of ratings, ignoring zeros
    recommended_movies = top_users_ratings.apply(lambda x: np.mean(x[x > 0]), axis=0)

    # remove movies not in the movie_details_df
    recommended_movies = recommended_movies[recommended_movies.index.isin(movie_details_df['movie_name'])]
    print('ayoo')
    # remove movies that are documentaries
    recommended_movies = recommended_movies[~recommended_movies.index.isin(movie_details_df[movie_details_df['genres'].str.contains('Documentary')]['movie_name'])]

    print('ayoo2')

    # remove movies the user has already rated
    user_rated_movies = user_movie_ratings.iloc[user_index]
    # recommended_movies = recommended_movies[~recommended_movies.index.isin(user_rated_movies[user_rated_movies > 0].index)]

    # Sort the average ratings in descending order and select the top_k movies
    recommended_movies = recommended_movies.sort_values(ascending=False)
    return recommended_movies[:top_k]


print(test_data.shape)
test_user_movie_ratings = test_data.pivot_table(index='username', columns='movie_name', values='rating').fillna(0)
# example usage from random user from test data
test_user = test_user_movie_ratings.index[0]
test_user_ratings = test_user_movie_ratings.loc[test_user]
top_movies = predict_movies_for_new_user(test_user_ratings, top_k=100)
print(f"Top recommended movies for new user: {top_movies}")


(2800736, 3)
ayoo
ayoo2
Top recommended movies for new user: movie_name
the-lord-of-the-rings-2003             9.695652
the-red-shoes                          9.600000
high-and-low                           9.583333
the-holy-mountain                      9.500000
the-godfather-part-ii                  9.418605
                                         ...   
pink-floyd-the-wall                    8.733333
the-last-black-man-in-san-francisco    8.733333
sympathy-for-mr-vengeance              8.714286
midnight-cowboy                        8.714286
drive-my-car                           8.714286
Length: 100, dtype: float64


In [6]:
# using the test data to evaluate the model using MAE and MSE and predict_movies_for_new_user
from sklearn.metrics import mean_absolute_error, mean_squared_error

# for 100 users in the test data
mae = []
mse = []
# unique movies in the test data
unique_movies = test_data['movie_name'].unique()
for i in tqdm(range(10)):
    test_user = test_user_movie_ratings.index[i]
    test_user_ratings = test_user_movie_ratings.loc[test_user]
    top_movies = predict_movies_for_new_user(test_user_ratings, top_k=len(unique_movies))
    # get error for the user
    for movie in test_user_ratings.index:
        if movie in top_movies.index:
            mae.append(mean_absolute_error([test_user_ratings[movie]], [top_movies[movie]]))
            mse.append(mean_squared_error([test_user_ratings[movie]], [top_movies[movie]]))
        else:
            mae.append(mean_absolute_error([test_user_ratings[movie]], [0]))
            mse.append(mean_squared_error([test_user_ratings[movie]], [0]))

print(f"Mean Absolute Error: {np.mean(mae)}")
print(f"Mean Squared Error: {np.mean(mse)}")

  0%|          | 0/10 [00:00<?, ?it/s]

ayoo
ayoo2


 10%|█         | 1/10 [00:11<01:46, 11.82s/it]

ayoo
ayoo2


 20%|██        | 2/10 [00:23<01:34, 11.86s/it]

ayoo
ayoo2


 30%|███       | 3/10 [00:35<01:22, 11.78s/it]

ayoo
ayoo2


 40%|████      | 4/10 [00:46<01:09, 11.64s/it]

ayoo
ayoo2


 50%|█████     | 5/10 [00:59<00:59, 11.87s/it]

ayoo
ayoo2


 60%|██████    | 6/10 [01:10<00:47, 11.80s/it]

ayoo
ayoo2


 70%|███████   | 7/10 [01:22<00:35, 11.74s/it]

ayoo
ayoo2


 80%|████████  | 8/10 [01:34<00:23, 11.76s/it]

ayoo
ayoo2


 90%|█████████ | 9/10 [01:46<00:11, 11.83s/it]

ayoo
ayoo2


100%|██████████| 10/10 [01:57<00:00, 11.78s/it]

Mean Absolute Error: 0.5586850577266351
Mean Squared Error: 3.828424482103301





In [3]:
def predict_top_movies_with_ratings_by_username(username, top_k=10):
    if username not in user_movie_ratings.index:
        return "No data available for user: {}".format(username)
    
    # Find user index
    user_index = user_movie_ratings.index.get_loc(username)
    
    # Compute similarity scores for the specified user
    similarity_scores = list(enumerate(user_similarity[user_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get indices of top similar users excluding the user themselves
    top_users_indices = [i[0] for i in similarity_scores[1:top_k+1]]
    
    # Aggregate the ratings of these top similar users
    top_users_ratings = user_movie_ratings.iloc[top_users_indices].mean(axis=0)
    
    # Filter out movies the user has already seen
    movies_already_seen = user_movie_ratings.loc[username, user_movie_ratings.loc[username,:] > 0].index
    top_users_ratings_filtered = top_users_ratings.drop(movies_already_seen)
    
    # Sort the movies based on predicted ratings and return the top k
    recommended_movies_and_ratings = top_users_ratings_filtered.sort_values(ascending=False).head(top_k)
    return recommended_movies_and_ratings

# Example usage
username = '0000_q'
top_movies = predict_top_movies_with_ratings_by_username(username, top_k=500)
print(f"Top recommended movies for {username}: {top_movies}")

# print out first 10 usernames
print(user_movie_ratings.index[:10])

Top recommended movies for 0000_q: movie_name
la-la-land                                5.504
black-swan                                5.208
little-women-2019                         5.026
whiplash-2014                             4.970
the-perks-of-being-a-wallflower           4.872
                                          ...  
black-mirror-the-entire-history-of-you    0.608
green-book                                0.608
marley-me                                 0.608
okja                                      0.608
life-is-beautiful                         0.608
Length: 500, dtype: float64
Index(['0000_q', '004lio', '03_sats', '04danysolodany', '0511milou', '058218',
       '0714c', '098km', '09plutos', '0elle'],
      dtype='object', name='username')


NameError: name 'test_data' is not defined

In [4]:
def predict_top_movies_with_ratings_by_username(username, top_k=10):
    if username not in user_movie_ratings.index:
        return "No data available for user: {}".format(username)
    
    # Compute aggregate ratings across all users, weighted by similarity
    similarity_scores = user_similarity[user_movie_ratings.index.get_loc(username)]
    weighted_ratings = user_movie_ratings.multiply(similarity_scores, axis=0)
    aggregate_ratings = weighted_ratings.sum(axis=0) / similarity_scores.sum()
    
    # Filter out movies the user has already seen
    movies_already_seen = user_movie_ratings.loc[username, user_movie_ratings.loc[username,:] > 0].index
    aggregate_ratings_filtered = aggregate_ratings.drop(movies_already_seen)
    
    # Sort the movies based on predicted ratings and return the top k
    recommended_movies_and_ratings = aggregate_ratings_filtered.sort_values(ascending=False).head(top_k)
    return recommended_movies_and_ratings

# Example usage
username = 'nconterno'
top_movies = predict_top_movies_with_ratings_by_username(username, top_k=25)
print(f"Top recommended movies for {username}: {top_movies}")

Top recommended movies for nconterno: movie_name
parasite-2019                                       5.079031
spider-man-across-the-spider-verse                  4.253642
joker-2019                                          4.202354
the-wolf-of-wall-street                             3.702604
eternal-sunshine-of-the-spotless-mind               3.484730
kill-bill-vol-1                                     3.421067
guardians-of-the-galaxy                             3.385413
ratatouille                                         3.371113
the-matrix                                          3.312844
goodfellas                                          3.303334
shutter-island                                      3.277741
top-gun-maverick                                    3.239517
spirited-away                                       3.235921
black-swan                                          3.226378
the-lord-of-the-rings-the-fellowship-of-the-ring    3.217327
arrival-2016                        

In [12]:
# get mse for the test data
from tqdm import tqdm
import numpy as np
def predict_rating(username, movie_name):
    try:
        user_index = user_movie_ratings.index.get_loc(username)
        movie_index = user_movie_ratings.columns.get_loc(movie_name)
        similarity_scores = user_similarity[user_index]
        ratings = user_movie_ratings[movie_name]
        
        if np.sum(similarity_scores) != 0:
            return user_movie_ratings[movie_name].mean()  # Fallback to movie average
        predicted_rating = np.dot(similarity_scores, ratings) / np.sum(similarity_scores)
        return predicted_rating
    except KeyError:
        return ratings_df['rating'].mean()  # Fallback to global average if movie or user not found

#train test split
from sklearn.model_selection import train_test_split
t1, test_data = train_test_split(ratings_df, test_size=0.05)

# use for loop with tqdm to show progress
test_data['predicted_rating'] = [predict_rating(row['username'], row['movie_name']) for index, row in tqdm(test_data.iterrows(), total=len(test_data))]
mse = ((test_data['rating'] - test_data['predicted_rating'])**2).mean()
mae = (test_data['rating'] - test_data['predicted_rating']).abs().mean()
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 


[A

In [2]:
import sqlite3
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs

# Assuming you've already loaded 'ratings_df' as shown in the previous code
# Connect to your SQLite database
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query, conn)
ratings_df.dropna(subset=['rating'], inplace=True)
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df['username'] = ratings_df['username'].astype(str)
ratings_df['movie_name'] = ratings_df['movie_name'].astype(str)

unique_user_ids = ratings_df['username'].unique()
unique_movie_titles = ratings_df['movie_name'].unique()

embedding_dimension = 128
# Load additional movie information
query_movie_info = """
SELECT movie_name, genre, director, release_year FROM movies
"""
movies_df = pd.read_sql(query_movie_info, conn)

# Merge the ratings with the movie information
full_df = pd.merge(ratings_df, movies_df, on="movie_name", how="left")

# Process categorical data and normalize numerical data as needed
# This step will depend on your specific dataset and features
class MovieRecommendationModel(tfrs.Model):
    def __init__(self, user_model, movie_model, task):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_title"])
        return self.task(user_embeddings, movie_embeddings)

# Define the model components
user_model = tf.keras.Sequential([
    layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
    # Add more layers as needed
    layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

movie_model = tf.keras.Sequential([
    layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
    layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension),
    # Add layers for processing movie metadata
])

task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


DatabaseError: Execution failed on sql '
SELECT movie_name, genre, director, release_year FROM movies
': no such table: movies

In [None]:
model = MovieRecommendationModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Assuming you have a dataset ready
# Convert 'full_df' to a TensorFlow dataset and split into train and test sets

tf_dataset = tf.data.Dataset.from_tensor_slices({
    "user_id": full_df["username"].values,
    "movie_title": full_df["movie_name"].values,
    # Include additional movie features here
})
tf_dataset = tf_dataset.batch(1024)

# Fit the model
model.fit(tf_dataset, epochs=5)
