In [3]:
import sqlite3
import pandas as pd
import numpy as np
import tensorflow as tf
import sys
print(tf.__version__)
print(sys.executable)
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers

import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split

# Connect to SQLite database and load movie details
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query_ratings = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query_ratings, conn)
# Check if the DataFrame is empty or if specific columns are empty
print(ratings_df.head())
print("Data types:", ratings_df.dtypes)
print("Count of non-NA values:\n", ratings_df.count())
# Load movie details
query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movies_details_df = pd.read_sql(query_movie_details, conn) # REAL MOVIE NAME
# rename columns from movie_details_df
movies_details_df.rename(columns={'movie_name': 'real_movie_name'}, inplace=True)
movies_details_df.rename(columns={'letterboxd_slug': 'movie_name'}, inplace=True)
conn.close()


# get list of unique movie names sorted by count of ratings from ratings_df
movie_names = ratings_df['movie_name'].value_counts().index.tolist()

# dump the top 5000 movie names to a file
with open('movie_names.txt', 'w') as f:
    for item in movie_names[:5000]:
        f.write("%s\n" % item)

# Data preprocessing
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df = ratings_df.fillna(-1)
movies_details_df.fillna('', inplace=True)  # Handle missing values
print(ratings_df.head(2))
# Merge ratings with movie details
df = pd.merge(ratings_df, movies_details_df, on='movie_name', how='left')

# get list of unique movie names
movie_names = movies_details_df['movie_name'].unique()
# # Encoding categorical features
# from sklearn.preprocessing import LabelEncoder
# import joblib  # Import joblib

# def encode_and_save_column(column, name):
#     encoder = LabelEncoder()
#     transformed = encoder.fit_transform(column)
#     joblib.dump(encoder, f'{name}_encoder.joblib')  # Save the encoder
#     return transformed, len(encoder.classes_)

# for feature in ['username', 'movie_name', 'director', 'actors', 'genres']:
#     df[feature], num_classes = encode_and_save_column(df[feature],feature)
#     df[feature] = df[feature].astype('int64')

# print('hi')
# # Convert to TensorFlow dataset
# def df_to_dataset(dataframe, shuffle=True, batch_size=32):
#     df = dataframe.copy()
#     labels = df.pop('rating')
#     ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
#     if shuffle:
#         ds = ds.shuffle(buffer_size=len(dataframe))
#     ds = ds.batch(batch_size)
#     return ds
print(movies_details_df.columns)

print('hi2')






2.16.1
/opt/homebrew/Caskroom/miniconda/base/bin/python
   username             movie_name  rating
0  mmoorthy        mean-girls-2024     6.0
1  mmoorthy         anyone-but-you     4.0
2  mmoorthy    society-of-the-snow    10.0
3  mmoorthy               saltburn     8.0
4  mmoorthy  no-hard-feelings-2023     4.0
Data types: username       object
movie_name     object
rating        float64
dtype: object
Count of non-NA values:
 username      19687685
movie_name    19687685
rating        15150737
dtype: int64
   username       movie_name  rating
0  mmoorthy  mean-girls-2024     6.0
1  mmoorthy   anyone-but-you     4.0
Index(['movie_name', 'real_movie_name', 'director', 'actors', 'genres'], dtype='object')
hi2


In [4]:
# create a function that returns the average rating of a movie
# filter df to only have movies that appear at least 10 times


print('ell')
print(len(df))


from tqdm import tqdm
def create_map_movie_to_average_rating():
    # Group by 'movie_name' and calculate the mean of 'rating' for each group
    movie_to_rating = df.groupby('movie_name')['rating'].mean().to_dict()
    return movie_to_rating

m_to_r = create_map_movie_to_average_rating()


ell
19687685


In [5]:

# create a function that returns the top n movies for a user that they have not rated yet using get_movie_recommendations 
def get_user_recommendations(username, n_recommendations=10):
    # get all the movies that the user has not rated
    user_rated_movies = df[df['username'] == username]['movie_name']
    all_movies = df['movie_name'].unique()
    movies_to_rate = np.setdiff1d(all_movies, user_rated_movies)
    recommendations = []
    for movie in tqdm(movies_to_rate):
        recommendations.append((movie, m_to_r[movie]))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations[0:n_recommendations]

# get the top 10 movie recommendations for a user
print(get_user_recommendations('nconterno', 10))


100%|██████████| 316184/316184 [00:00<00:00, 998064.95it/s] 

[('007', 10.0), ('08ms', 10.0), ('1-100-rice-planting', 10.0), ('1-3', 10.0), ('1-800-d-direct', 10.0), ('1-ri-botchi-no-ookami-to-7-hiki-no-ko-yagi', 10.0), ('10-45-in-a-city-like-any-other', 10.0), ('100-jahre-kino', 10.0), ('100-percent-electrical', 10.0), ('100-renewable-energy', 10.0)]





In [6]:
def compute_mse(df, m_to_r, n_users=10):
    mse = 0
    count = 0
    users = df['username'].unique()[:n_users]
    for user in tqdm(users):
        user_data = df[df['username'] == user]
        user_rated_movies = user_data['movie_name']
        for index, row in user_data.iterrows():
            predicted_rating = m_to_r[row['movie_name']]
            actual_rating = row['rating']
            mse += (predicted_rating - actual_rating) ** 2
            count += 1
    return mse / count if count else 0

# Example usage
print(compute_mse(df, m_to_r, 100))

def compute_mae(df, m_to_r, n_users=10):
    mae = 0
    count = 0
    users = df['username'].unique()[:n_users]
    for user in tqdm(users):
        user_data = df[df['username'] == user]
        user_rated_movies = user_data['movie_name']
        for index, row in user_data.iterrows():
            predicted_rating = m_to_r[row['movie_name']]
            actual_rating = row['rating']
            mae += abs(predicted_rating - actual_rating)
            count += 1
    return mae / count if count else 0

# Example usage
print(compute_mae(df, m_to_r, 100))


100%|██████████| 100/100 [00:54<00:00,  1.85it/s]


13.323628331235552


100%|██████████| 100/100 [02:02<00:00,  1.22s/it]

2.8947485126034658





In [8]:
df = df.groupby('movie_name').filter(lambda x: len(x) >= 15)

In [7]:
# implement svd model and compute mse and mae
from surprise import Dataset

from surprise import Reader
from surprise import SVD

from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df[['username', 'movie_name', 'rating']], reader)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)




KeyboardInterrupt: 

In [21]:
pip install surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Using cached scikit-surprise-1.1.3.tar.gz (771 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-macosx_11_0_arm64.whl size=1114515 sha256=090e7aab243e78b8d63e45bebb1714fa46bd0505184933cf159192140cec298c
  Stored in directory: /Users/nicholasconterno/Library/Caches/pip/wheels/df/e4/a6/7ad72453dd693f420b0c639bedeec34641738d11b55d8d9b84
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1
Note: you may need to restart the kernel to use updated packages.
