In [2]:
import sqlite3
import pandas as pd
import numpy as np
import tensorflow as tf
import sys
print(tf.__version__)
print(sys.executable)
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers

import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split

# Connect to SQLite database and load movie details
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query_ratings = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query_ratings, conn)
# Check if the DataFrame is empty or if specific columns are empty
print(ratings_df.head())
print("Data types:", ratings_df.dtypes)
print("Count of non-NA values:\n", ratings_df.count())
# Load movie details
query_movie_details = """
SELECT letterboxd_slug, movie_name, director, actors, genres
FROM film_details_small
"""
movies_details_df = pd.read_sql(query_movie_details, conn) # REAL MOVIE NAME
# rename columns from movie_details_df
movies_details_df.rename(columns={'movie_name': 'real_movie_name'}, inplace=True)
movies_details_df.rename(columns={'letterboxd_slug': 'movie_name'}, inplace=True)
conn.close()


# get list of unique movie names sorted by count of ratings from ratings_df
movie_names = ratings_df['movie_name'].value_counts().index.tolist()

# dump the top 5000 movie names to a file
with open('movie_names.txt', 'w') as f:
    for item in movie_names[:5000]:
        f.write("%s\n" % item)

# Data preprocessing
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df = ratings_df.fillna(-1)
movies_details_df.fillna('', inplace=True)  # Handle missing values
print(ratings_df.head(2))
# Merge ratings with movie details
df = pd.merge(ratings_df, movies_details_df, on='movie_name', how='left')

# get list of unique movie names
movie_names = movies_details_df['movie_name'].unique()
# # Encoding categorical features
# from sklearn.preprocessing import LabelEncoder
# import joblib  # Import joblib

# def encode_and_save_column(column, name):
#     encoder = LabelEncoder()
#     transformed = encoder.fit_transform(column)
#     joblib.dump(encoder, f'{name}_encoder.joblib')  # Save the encoder
#     return transformed, len(encoder.classes_)

# for feature in ['username', 'movie_name', 'director', 'actors', 'genres']:
#     df[feature], num_classes = encode_and_save_column(df[feature],feature)
#     df[feature] = df[feature].astype('int64')

# print('hi')
# # Convert to TensorFlow dataset
# def df_to_dataset(dataframe, shuffle=True, batch_size=32):
#     df = dataframe.copy()
#     labels = df.pop('rating')
#     ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
#     if shuffle:
#         ds = ds.shuffle(buffer_size=len(dataframe))
#     ds = ds.batch(batch_size)
#     return ds
print(movies_details_df.columns)

print('hi2')

2.16.1
/opt/homebrew/Caskroom/miniconda/base/bin/python
   username             movie_name  rating
0  mmoorthy        mean-girls-2024     6.0
1  mmoorthy         anyone-but-you     4.0
2  mmoorthy    society-of-the-snow    10.0
3  mmoorthy               saltburn     8.0
4  mmoorthy  no-hard-feelings-2023     4.0
Data types: username       object
movie_name     object
rating        float64
dtype: object
Count of non-NA values:
 username      19687685
movie_name    19687685
rating        15150737
dtype: int64
   username       movie_name  rating
0  mmoorthy  mean-girls-2024     6.0
1  mmoorthy   anyone-but-you     4.0
Index(['movie_name', 'real_movie_name', 'director', 'actors', 'genres'], dtype='object')
hi2


In [3]:
#dump movie names from movie_details_df to a file
movie_names = movies_details_df['movie_name'].unique()
print(len(movie_names))

4885


In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD

# Assuming 'df' contains your ratings data with columns 'username', 'movie_name', and 'rating'

# Display initial count
print("Initial number of ratings:", len(df))

# Remove movies with less than 5000 ratings
df = df.groupby('movie_name').filter(lambda x: len(x) >= 1000)
print("Number of ratings after filtering movies:", len(df))

# filter out movie_name that are not in movies_details_df
df = df[df['movie_name'].isin(movie_names)]


# Remove users with less than 100 ratings
df = df.groupby('username').filter(lambda x: len(x) >= 50)
print("Number of ratings after filtering users:", len(df))

# Display number of unique movies remaining
print("Number of unique movies:", len(df['movie_name'].unique()))

# Create a mapping of movie names to a consistent index
movie_index = {movie: idx for idx, movie in enumerate(df['movie_name'].unique())}
num_movies = len(movie_index)  # Number of movies

# Initialize a dictionary to hold the user vectors
user_vectors = {}

# Iterate over grouped data
for username, group in tqdm(df.groupby('username'), desc="Building user vectors"):
    # Create an empty vector for each user with all elements initialized to a placeholder (e.g., -1)
    user_vector = np.full(num_movies, -1, dtype=np.float32)
    # Populate the vector with ratings
    for _, row in group.iterrows():
        movie_idx = movie_index[row['movie_name']]
        user_vector[movie_idx] = row['rating']  # Store the rating
    # Store the user vector
    user_vectors[username] = user_vector

# Choose the number of components, e.g., reduce dimensions to 50
svd = TruncatedSVD(n_components=50, random_state=42)

# List of all usernames for maintaining order
# usernames = list(user_vectors.keys())
# # Stack all user vectors to form a matrix
# user_matrix = np.stack(user_vectors.values())
# svd = TruncatedSVD(n_components=150, random_state=42)
# user_matrix_reduced = svd.fit_transform(user_matrix)
# print("Reduced user vectors shape:", user_matrix_reduced.shape)
# reduced_user_vectors = {username: vector for username, vector in zip(usernames, user_matrix_reduced)}

# user_vectors = reduced_user_vectors

# print all unique movie names


Initial number of ratings: 13422622
Number of ratings after filtering movies: 13422622
Number of ratings after filtering users: 13376552
Number of unique movies: 4885


Building user vectors: 100%|██████████| 22876/22876 [03:51<00:00, 98.85it/s] 


In [6]:
print(user_vectors['nconterno'])

[ 5.  7.  7. ... -1. -1. -1.]


In [7]:
#dump all unique movie names to a txt file
with open('movie_names.txt', 'w') as f:
    for item in movie_index.keys():
        f.write("%s\n" % item)

In [8]:
# filter ratings_df to only include movies and users that are in movie_index
ratings_df = ratings_df[ratings_df['movie_name'].isin(movie_index.keys())]
ratings_df = ratings_df[ratings_df['username'].isin(user_vectors.keys())]

movies_details_df = movies_details_df[movies_details_df['movie_name'].isin(movie_index.keys())]

print(len(ratings_df))
# printlength of unique movie names in ratings_df
print(len(ratings_df['movie_name'].unique()))
print(ratings_df['movie_name'].unique())
print(len(movies_details_df))

13376552
4885
['priscilla' 'barbie' 'guardians-of-the-galaxy-vol-3' ...
 'the-last-witch-hunter' 'luther-the-fallen-sun'
 'barbie-and-the-three-musketeers']
4885


In [9]:
# Splitting genres and actors into lists
movies_details_df['genres'] = movies_details_df['genres'].apply(lambda x: x.split(', ') if x else [])
movies_details_df['actors'] = movies_details_df['actors'].apply(lambda x: x.split(', ') if x else [])


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Assuming 'movies_details_df' has columns 'genres' and 'actors' properly formatted as lists of strings
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(movies_details_df['genres'])

print('genres encoded')
# Calculate actor frequency
actor_counts = movies_details_df['actors'].explode().value_counts()

# Select top N actors (e.g., top 100 actors)
top_actors = actor_counts.head(50).index

# Filter actors data to include only top actors
filtered_actors = movies_details_df['actors'].apply(lambda x: [actor for actor in x if actor in top_actors])


mlb_actors = MultiLabelBinarizer()
actors_encoded = mlb_actors.fit_transform(filtered_actors)
actor_columns = ['actor_' + col for col in mlb_actors.classes_]
df_actors_encoded = pd.DataFrame(actors_encoded, columns=actor_columns)
print('actors encoded')
# Adding prefixes to the new columns to avoid any overlap
genre_columns = ['genre_' + col for col in mlb_genres.classes_]
actor_columns = ['actor_' + col for col in mlb_actors.classes_]

print('genre columns')
# Creating DataFrames from the encoded arrays
df_genres_encoded = pd.DataFrame(genres_encoded, columns=genre_columns)
df_actors_encoded = pd.DataFrame(actors_encoded, columns=actor_columns)

print('df genres encoded')
# Joining the new DataFrames back to the original DataFrame
# Ensuring the index aligns if the DataFrame indexes have been altered
movies_details_df = movies_details_df.join(df_genres_encoded)
print('joined genres')
movies_details_df = movies_details_df.join(df_actors_encoded)
print('joined actors')
# Check the updated DataFrame
print(movies_details_df.head())
print(movies_details_df.shape)



genres encoded
actors encoded
genre columns
df genres encoded
joined genres
joined actors
                             movie_name  \
0                                  nope   
1                        captain-marvel   
2  dungeons-dragons-honor-among-thieves   
3                   john-wick-chapter-4   
4                          cocaine-bear   

                           real_movie_name  \
0                                     Nope   
1                           Captain Marvel   
2  Dungeons & Dragons: Honor Among Thieves   
3                     John Wick: Chapter 4   
4                             Cocaine Bear   

                                 director  \
0                            Jordan Peele   
1                  Ryan Fleck, Anna Boden   
2  Jonathan Goldstein, John Francis Daley   
3                          Chad Stahelski   
4                         Elizabeth Banks   

                                              actors  \
0  [Daniel Kaluuya, Keke Palmer, Brandon Perea,

In [11]:
# make new datafram from user_vectors and ratings_df

# # Load ratings data
# query_ratings = """
# SELECT username, movie_name, rating
# FROM users
# """
# ratings_df = pd.read_sql(query_ratings, conn)

# user_vectors_df = pd.DataFrame(user_vectors).T  # Transpose the DataFrame
# # print(len(user_vectors))
# # # Reset the index to ensure the 'username' column is available for merging
# user_vectors_df = user_vectors_df.reset_index().rename(columns={'index': 'username'})

# # Merge the user vectors with the ratings data
# ratings_df = pd.merge(ratings_df, user_vectors_df, on='username', how='left')
# print(ratings_df.shape)
# # Check the updated DataFrame
# print(ratings_df.head())
# print(ratings_df.shape)


# merge ratings_df with movies_details_df
ratings_df = pd.merge(ratings_df, movies_details_df, on='movie_name', how='left')
print(ratings_df.shape)




(13376552, 76)


In [12]:
# print ratings_df columns that are not numeric
# get unique user ids and movie titles
# unique_user_ids = ratings_df.index.unique()
# unique_movie_titles = ratings_df['movie_name'].unique()

non_numeric_columns = ratings_df.select_dtypes(exclude=[np.number]).columns
print(non_numeric_columns)

# drop username from non_numeric_columns
non_numeric_columns = non_numeric_columns.drop('username')

# Drop non-numeric columns
ratings_df = ratings_df.drop(columns=non_numeric_columns)

Index(['username', 'movie_name', 'real_movie_name', 'director', 'actors',
       'genres'],
      dtype='object')


In [None]:
# create unique username to numeric id mapping
====
unique_user_ids = ratings_df['username'].unique()

user_id_mapping = {name: i for i, name in enumerate(unique_user_ids)}

# replace username with user_id
ratings_df['username'] = ratings_df['username'].map(user_id_mapping)

SyntaxError: invalid syntax (1785846212.py, line 2)

In [None]:
print(user_id_mapping)

# create user_id to username mapping
user_id_to_username = {i: name for i, name in enumerate(unique_user_ids)}

print(user_id_to_username)

{'nsdamera': 0, 'm__gabby': 1, 'dmirza19': 2, 'miawoody': 3, 'imwilliamherff': 4, 'pripriyaya': 5, 'willconvertino': 6, 'wyattgf': 7, 'tackky': 8, 'davidt02': 9, 'mermun': 10, 'itsjustamovie': 11, 'petercaragol': 12, 'belafilm': 13, 'oatibix': 14, 'vaishakh2001': 15, 'joshf101': 16, 'sophierees': 17, 'walker_walker': 18, 'charlieveronee': 19, 'grant_hill_33': 20, 'jezzaf': 21, 'rettg2019': 22, 'magreen325': 23, 'andrewkatso': 24, 'bclay12': 25, 'flyingebitda': 26, 'thomas10': 27, 'saschaseinfeld': 28, 'david1stewart': 29, 'notnatalie': 30, 'coledwards101': 31, 'sarahmoorman': 32, 'munchsmith': 33, 'jrr63': 34, 'drewhask95': 35, 'yvngocupotis': 36, 'jyk14': 37, 'reesethepieces': 38, 'henrywya': 39, 'bfinby1': 40, 'danmanme': 41, 'christianolsn': 42, 'shockacohen': 43, 'reesemoorman': 44, 'esnides1': 45, 'elkoobador': 46, 'zhukeeper': 47, 'dores93': 48, 'dan_felix': 49, 'nconterno': 50, 'jacksonlkair': 51, 'rishikr1': 52, 'nicoroldan': 53, 'marylawr': 54, 'pico_dico': 55, 'saul1233456': 

In [None]:
=====

In [13]:
import pandas as pd
# Convert all column names to string
ratings_df.columns = ratings_df.columns.map(str)

train_df, test_df = train_test_split(ratings_df, test_size=0.2)

# Columns that start with 'genre_' or 'actor_' are movie features
movie_feature_cols = [col for col in ratings_df.columns if col.startswith('genre_') or col.startswith('actor_')]



# User features are all other columns except 'rating' and movie features
user_feature_cols = [col for col in ratings_df.columns if col not in movie_feature_cols + ['rating']]

# add username to movie features
user_feature_cols.append('username')
# Labels are the ratings
label_col = 'rating'

def df_to_dataset(dataframe, user_vectors, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('rating')
    movie_features = dataframe[movie_feature_cols].values  # Convert DF to numpy for TensorFlow compatibility

    # Prepare user features
    user_features_list = []
    for username in tqdm(dataframe['username']):
        
        user_features_list.append(user_vectors[username])
       
    user_features = pd.DataFrame(user_features_list)  # Convert list of lists to DataFrame

    features = {'user_features': user_features.values, 'movie_features': movie_features}

    # Create a TensorFlow dataset
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds

# Create TensorFlow datasets
train_ds = df_to_dataset(train_df, user_vectors, batch_size=256)
test_ds = df_to_dataset(test_df, user_vectors, shuffle=False, batch_size=256)


100%|██████████| 10701241/10701241 [00:05<00:00, 2100199.09it/s]


: 

: 

In [None]:
========

In [None]:
# Checking the first few user vectors to understand their structure
for key, value in list(user_vectors.items())[:5]:
    print(f"Username: {key}, Vector: {value}, Type: {type(value)}, Shape: {np.array(value).shape}")

# If user vectors are confirmed to be arrays and still the shape is wrong:
# Reformat the user_vectors if they are not in the correct format
user_vectors = {k: np.array(v) if not isinstance(v, np.ndarray) else v for k, v in user_vectors.items()}


Username: 0000_q, Vector: [-1. -1.  9. ... -1. -1. -1.], Type: <class 'numpy.ndarray'>, Shape: (4885,)
Username: 004lio, Vector: [ 8.  5. -1. ... -1. -1. -1.], Type: <class 'numpy.ndarray'>, Shape: (4885,)
Username: 007rxhmr, Vector: [-1. -1. -1. ... -1. -1. -1.], Type: <class 'numpy.ndarray'>, Shape: (4885,)
Username: 03_sats, Vector: [-1. -1. -1. ... -1. -1. -1.], Type: <class 'numpy.ndarray'>, Shape: (4885,)
Username: 03c_e, Vector: [-1. -1. -1. ... -1. -1. -1.], Type: <class 'numpy.ndarray'>, Shape: (4885,)


In [None]:
class RankingModel(tfrs.Model):
    def __init__(self, user_model, movie_model, task):
        super().__init__()
        self.user_model = user_model
        self.movie_model = movie_model
        self.task = task
        # Initialize output layer here to ensure it's only created once
        self.output_layer = layers.Dense(1)

    def call(self, features):
        print('e')
        # print shapes
        print(features[0]["user_features"].shape)
        print(features[0]["movie_features"].shape)
        
        user_embeddings = self.user_model(features[0]["user_features"])
        movie_embeddings = self.movie_model(features[0]["movie_features"])
        print('e')
        combined_embeddings = tf.concat([user_embeddings, movie_embeddings], axis=1)
        # Use the initialized output layer
        outputs = self.output_layer(combined_embeddings)
        return tf.squeeze(outputs, axis=-1)  # Ensure matching shapes with labels

    def compute_loss(self, features, training=False):
        labels = features[1]
        predictions = self(features)
        return self.task(labels, predictions)



# Define the components of the model
user_model = tf.keras.Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu')  # Ensure this matches movie_model's output
])

movie_model = tf.keras.Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu')  # Ensure this matches user_model's output
])

# Ensure the task setup is correct
task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

# Instantiate and compile the model
model = RankingModel(user_model, movie_model, task)
# set learning rate
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.01))

# Re-run training and evaluation
model.fit(train_ds, epochs=1, verbose=1, batch_size=1)
model.evaluate(test_ds)



e
(32,)
(32, 69)
e
(32,)
(32, 69)


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Input 0 of layer "dense_20" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (32,)''


ValueError: Exception encountered when calling RankingModel.call().

[1mInput 0 of layer "dense_20" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (32,)[0m

Arguments received by RankingModel.call():
  • features=({'user_features': 'tf.Tensor(shape=(32,), dtype=float32)', 'movie_features': 'tf.Tensor(shape=(32, 69), dtype=int64)'}, 'tf.Tensor(shape=(32,), dtype=float32)')

In [None]:
# generate recommendations for a username function by taking their user vector, and every single movie vector
# and getting the rating for each movie vector
def generate_recommendations(username, ds, top_k=10):
    
    recommendations = {}
    for movie_name, movie_vector in movie_vectors.items():
        # Ensure movie_vector is a tensor with the right shape
        movie_vector = tf.convert_to_tensor([movie_vector], dtype=tf.float32)

        # Prepare features dictionary correctly for model prediction
        features = ({'user_features': user_vector, 'movie_features': movie_vector})
        
        # Model prediction
        rating = model.predict(features)[0]  # Model.predict returns a batch of predictions
        
        recommendations[movie_name] = rating

    # Return top k recommendations sorted by predicted rating
    return dict(sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:top_k])


# make df for particular user
user = 'nconterno'
# get user vector for user
user_vector = user_vectors[user]


# Connect to SQLite database and load movie details
conn = sqlite3.connect('my_letterboxd_data.db')

query_ratings = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query_ratings, conn)
conn.close()
ratings_df = ratings_df[ratings_df['movie_name'].isin(movie_index.keys())]
ratings_df = ratings_df[ratings_df['username'].isin(user_vectors.keys())]
print('a')
nc_vectors_df = pd.DataFrame(user_vectors).T
nc_vectors_df = nc_vectors_df.reset_index().rename(columns={'index': 'username'})
# Merge the user vectors with the ratings data
nc_vectors_df = pd.merge(ratings_df, nc_vectors_df, on='username', how='left')
print(ratings_df.shape)
# Check the updated DataFrame
print(ratings_df.head())
print(ratings_df.shape)

# merge ratings_df with movies_details_df
nc_vectors_df = pd.merge(nc_vectors_df, movies_details_df, on='movie_name', how='left')

# remove non-numeric columns
non_numeric_columns = nc_vectors_df.select_dtypes(exclude=[np.number]).columns

nc = df_to_dataset(nc_vectors_df)
print('b')
# Generate recommendations
recommendations = generate_recommendations(user, user_vectors, movie_vectors, top_k=10)




(4177037, 3)
    username                                       movie_name  rating
12  nsdamera                                            wonka     NaN
14  nsdamera  the-hunger-games-the-ballad-of-songbirds-snakes     NaN
15  nsdamera                           five-nights-at-freddys     NaN
19  nsdamera                                        priscilla     NaN
20  nsdamera                                         saltburn     NaN
(4177037, 3)


In [None]:
for features, labels in train_ds.take(1):
    print("Features:", features.keys())
    print("Labels shape:", labels.shape)
    print("User features shape:", features['user_features'].shape)
    print("Movie features shape:", features['movie_features'].shape)


Features: dict_keys(['user_features', 'movie_features', 'rating'])
Labels shape: (32,)
User features shape: (32, 154)
Movie features shape: (32, 67)


2024-04-20 19:48:04.137382: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:

embedding_dimension = 32


# Define user and movie models
user_model = tf.keras.Sequential([
    layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

movie_model = tf.keras.Sequential([
    layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
    layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])

# Define the task with a loss function
task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


# Create a model instance
model = MovieRankingModel(user_model, movie_model, task)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Fit the model
model.fit(train_ds, epochs=3, verbose=1)

# Evaluate the model
model.evaluate(test_ds, return_dict=True)


KeyError: 'movie_name'

In [None]:
# make predictions for a user
user = 'nconterno'
user_data = user_vectors[user]
user_data = user_data.reshape(1, -1)

# Create a DataFrame from the user data
user_df = pd.DataFrame(user_data, columns=user_vectors_df.columns[1:])
print(user_df.shape)

# Merge the user data with the movie details
user_df = pd.merge(user_df, movies_details_df, left_index=True, right_index=True, how='left')
print(user_df.shape)

# Check for non-numeric columns in the user_df after merging
non_numeric_columns = user_df.select_dtypes(include=['object']).columns
print("Non-numeric columns found:", non_numeric_columns)

# Drop these non-numeric columns
user_df.drop(columns=non_numeric_columns, inplace=True)

# Make predictions
predictions = model.predict(user_df)
print(predictions)

(1, 154)
(1, 226)


ValueError: could not convert string to float: 'nope'

In [None]:
# DATA DESCRIPTION
# ratings_df: DataFrame containing user ratings
# movies_details_df: DataFrame containing movie details
# user_vector_df: DataFrame containing user vectors

# create dataframe from feature to rating for use in model
# Merge ratings with movie details

# ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def create_feature_rating_df(ratings_df, movies_details_df):
    # loop through each unique user vector 
    feature_rating_df = pd.DataFrame()
    # get columns of user_vector_df
    columns = user_vector_df.columns
    # get columns of movies_details_df
    columns.append(movies_details_df.columns)
   
    # create new dataframe
    df = pd.DataFrame(columns=columns)
    # add rating column to the empty dataframe
    df['rating'] = 0
    # print(df)

    for i in tqdm(range(user_vector_df.shape[0])):
        # get user vector
        user_vector = user_vector_df.iloc[i]
        # get username
        username = user_vector_df.index[i]
        # print(username)
        # get all movie ratings for user
        user_ratings = ratings_df[ratings_df['username'] == username]
        # print(user_ratings)
        # loop through each movie rating
        for j in range(user_ratings.shape[0]):
            user_rating = user_ratings.iloc[j]
            # print(user_rating)
            movie_name = user_rating['movie_name']
            currUserVector = user_vector
            # mask current user vector  movie name column to -1
            currUserVector[movie_index[movie_name]] = -1

            # get movie details
            movie_details = movies_details_df[movies_details_df['movie_name'] == movie_name]
            # create new row with user vector, movie details, and rating
            # print(type(user_vector))
            # print(type(movie_details))
            # print(type(user_rating['rating']))
            # convert movie_details to a series
            movie_details = movie_details.squeeze()
            new_row = user_vector.append(movie_details)
            new_row['rating'] = user_rating['rating']
            # give name to the new row
            new_row.name = username + '_' + movie_name
            feature_rating_df.append(new_row)


    
    return df

feature_rating_df = create_feature_rating_df(ratings_df, movies_details_df)


  0%|          | 1/16781 [00:08<41:07:51,  8.82s/it]


KeyboardInterrupt: 

In [None]:
# print all columns in train that are strings
print(train.select_dtypes(include=['object']).columns)

Index(['movie_name'], dtype='object')


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoding director
label_encoder_director = LabelEncoder()
movies_details_df['director_encoded'] = label_encoder_director.fit_transform(movies_details_df['director'])


In [None]:
# Example: Assuming you've added encoded features as new columns to the dataframe
feature_columns = [col for col in movies_details_df.columns if 'encoded' in col]  # Add other relevant columns if necessary
movie_features = movies_details_df[feature_columns].values


In [None]:
# Example of storing feature vectors for later use
movie_features_df = pd.DataFrame(movie_features, index=movies_details_df['movie_name'])
movie_features_df.to_csv('movie_feature_vectors.csv')

                          0
movie_name                 
Go Fishboy            85354
Her Morning Elegance  71959
Gemini Man             6291
Goosebumps            80067
Fuelled               65970
             movie_name                                           director  \
0            Go Fishboy  Sebastian Doringer, Denise Cirone, Andrey Kole...   
1  Her Morning Elegance             Oren Lavie, Merav Nathan, Yuval Nathan   
2            Gemini Man                                            Ang Lee   
3            Goosebumps                                      Rob Letterman   
4               Fuelled                            Michelle Hao, Fawn Chan   

                                              actors  \
0  [Kazuki Teramoto, Yu-Jan Hsiung, Yume Nanbu, Y...   
1                         [Oren Lavie, Shir Shomron]   
2  [Will Smith, Mary Elizabeth Winstead, Clive Ow...   
3  [Jack Black, Dylan Minnette, Odeya Rush, Amy R...   
4                                                 []   

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate

# Assuming you have the correct number of features from your movie_features and user_vectors
num_user_features = len(user_vector_df.columns)
num_movie_features = len(movie_features_df.columns)

# Inputs
user_input = Input(shape=(num_user_features,), name='user_input')
movie_input = Input(shape=(num_movie_features,), name='movie_input')

# User branch
user_dense = Dense(128, activation='relu')(user_input)
user_dropout = Dropout(0.5)(user_dense)

# Movie branch
movie_dense = Dense(128, activation='relu')(movie_input)
movie_dropout = Dropout(0.5)(movie_dense)

# Combine branches
concatenated = Concatenate()([user_dropout, movie_dropout])
combined_dense = Dense(256, activation='relu')(concatenated)
final_dropout = Dropout(0.3)(combined_dense)
output = Dense(1, activation='linear')(final_dropout)  # Predicting a rating

# Create and compile the model
model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary
model.summary()


In [None]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
train_ds = df_to_dataset(train, batch_size=256)
test_ds = df_to_dataset(test, shuffle=False, batch_size=256)

print(df.head(2))

# start model training 


     username  movie_name  rating  director  actors  genres
233     10086      109562     8.0       282     199      89
235     10086       22958     6.0       282     199      89
Epoch 1/5


ValueError: Missing data for input "user_input". You passed a data dictionary with keys ['username', 'movie_name', 'director', 'actors', 'genres']. Expected the following keys: ['user_input', 'movie_input']

In [None]:
# Ensure 'movie_name' is the index for movie_features_df
movie_features_df = movie_features_df.set_index(movies_details_df['movie_name'])

# Now check if the index is set correctly
print(movie_features_df.head())

# Check if the train and test movie names exist in the movie_features_df index
print(train['movie_name'].isin(movie_features_df.index).value_counts())
print(test['movie_name'].isin(movie_features_df.index).value_counts())


                          0
movie_name                 
Go Fishboy            85354
Her Morning Elegance  71959
Gemini Man             6291
Goosebumps            80067
Fuelled               65970
False    1943880
Name: movie_name, dtype: int64
False    485971
Name: movie_name, dtype: int64


In [None]:
embedding_dimension = 2

user_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['username'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

movie_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['movie_name'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

rating_model = tf.keras.Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model = RecommenderModel(user_model, movie_model, rating_model)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))


# Train the model
model.fit(train_ds, epochs=1, validation_data=test_ds)
# model.save('/kaggle/working/my_recommender_model.h5')


2024-04-17 11:31:48.153306: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:1: Filling up shuffle buffer (this may take a while): 4926690 of 12120589


[1m   24/47347[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:36[0m 7ms/step - loss: 50.3460 - regularization_loss: 0.0000e+00 - total_loss: 50.3460  

2024-04-17 11:32:02.613656: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m 8056/47347[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m7:55[0m 12ms/step - loss: 4.6790 - regularization_loss: 0.0000e+00 - total_loss: 4.6790

KeyboardInterrupt: 

In [None]:
def inspect_dataset(dataset):
    for features, label in dataset.take(1):
        print({k: v.numpy().shape for k, v in features.items()})
        print('Label shape:', label.numpy().shape)

inspect_dataset(train_ds)
inspect_dataset(test_ds)


2024-04-16 17:37:52.667216: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] ShuffleDatasetV3:26: Filling up shuffle buffer (this may take a while): 5527740 of 12120589
2024-04-16 17:38:02.666972: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] ShuffleDatasetV3:26: Filling up shuffle buffer (this may take a while): 11245570 of 12120589
2024-04-16 17:38:04.258159: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] Shuffle buffer filled.


{'username': (256,), 'movie_name': (256,), 'director': (256,), 'actors': (256,), 'genres': (256,)}
Label shape: (256,)
{'username': (256,), 'movie_name': (256,), 'director': (256,), 'actors': (256,), 'genres': (256,)}
Label shape: (256,)


In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import joblib  # Ensure joblib is imported
from tqdm import tqdm

def make_predictions_for_user(username, top_k, user_encoder, movie_encoder, model, movies_df):

    try:
        # Attempt to encode the user
        encoded_user = user_encoder.transform([username])[0]
    except ValueError:
        # Handle unseen user by returning an empty DataFrame or a meaningful message
        print(f"Username {username} not seen during training.")
        return pd.DataFrame()

    all_movies = movies_df['movie_name'].unique()

    # encode all movies that were seen during training (movie_names)
    encoded_movies = movie_encoder.transform(movie_names)

    # Prepare dataset for prediction
    user_movie_pairs = np.hstack((
        np.repeat(encoded_user, len(movie_names)).reshape(-1, 1),
        np.array(encoded_movies).reshape(-1, 1)
    ))

    # Convert to a TensorFlow dataset
    predict_ds = tf.data.Dataset.from_tensor_slices((dict(username=user_movie_pairs[:, 0], movie_name=user_movie_pairs[:, 1])))
    predict_ds = predict_ds.batch(256)  # Use the same batch size as during training

    # Make predictions
    predictions = model.predict(predict_ds)

    # Create a DataFrame with movies and their predicted ratings
    predictions_df = pd.DataFrame({
        'movie_name': movie_names,
        'predicted_rating': predictions.flatten()
    })

    # Sort movies based on predicted ratings
    top_k_predictions = predictions_df.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_k_predictions

# Load the saved encoders and model
user_encoder = joblib.load('username_encoder.joblib')
movie_encoder = joblib.load('movie_name_encoder.joblib')
# model = tf.keras.models.load_model('/kaggle/working/my_recommender_model')

# Example usage
username = 'nconterno'  # replace with an actual username
top_k = 5  # Number of top recommendations to retrieve

top_k_recommendations = make_predictions_for_user(username, top_k, user_encoder, movie_encoder, model, movies_details_df)
print(top_k_recommendations)



                                        movie_name  predicted_rating
2155                                  come-and-see          9.086839
125                                  parasite-2019          9.039491
334                                  the-godfather          8.990693
129               spider-man-into-the-spider-verse          8.945284
164   the-lord-of-the-rings-the-return-of-the-king          8.926230


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib  # Ensure joblib is imported
from tqdm import tqdm

def make_predictions_for_user(username, top_k, user_encoder, movie_encoder, model, movies_df, ratings_df):
    try:
        # Attempt to encode the user
        encoded_user = user_encoder.transform([username])[0]
    except ValueError:
        # Handle unseen user by returning an empty DataFrame or a meaningful message
        print(f"Username {username} not seen during training.")
        return pd.DataFrame()

    # Retrieve all movies seen by the user
    seen_movies = ratings_df[ratings_df['username'] == username]['movie_name'].unique()
    
    # Retrieve all movies and exclude those that have been seen
    # all_movies = movies_df['movie_name'].unique()
    unseen_movies = np.setdiff1d(movie_names, seen_movies)
    
    if len(unseen_movies) == 0:
        print("No unseen movies found to recommend.")
        return pd.DataFrame()

    # Encode all unseen movies that were seen during training
    try:
        encoded_movies = movie_encoder.transform(unseen_movies)
    except ValueError:
        print("Error in encoding movies.")
        return pd.DataFrame()

    # Prepare dataset for prediction
    user_movie_pairs = np.hstack((
        np.repeat(encoded_user, len(unseen_movies)).reshape(-1, 1),
        np.array(encoded_movies).reshape(-1, 1)
    ))

    # Convert to a TensorFlow dataset
    predict_ds = tf.data.Dataset.from_tensor_slices((dict(username=user_movie_pairs[:, 0], movie_name=user_movie_pairs[:, 1])))
    predict_ds = predict_ds.batch(256)  # Use the same batch size as during training

    # Make predictions
    predictions = model.predict(predict_ds)

    # Create a DataFrame with movies and their predicted ratings
    predictions_df = pd.DataFrame({
        'movie_name': unseen_movies,
        'predicted_rating': predictions.flatten()
    })

    # Sort movies based on predicted ratings
    top_k_predictions = predictions_df.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_k_predictions

# Load the saved encoders and model
user_encoder = joblib.load('username_encoder.joblib')
movie_encoder = joblib.load('movie_name_encoder.joblib')

# Example usage
username = 'nconterno'
top_k = 5000

top_k_recommendations = make_predictions_for_user(username, top_k, user_encoder, movie_encoder, model, movies_details_df, ratings_df)
print(top_k_recommendations)

                                 movie_name  predicted_rating
47145                          come-and-see          9.086839
160153                         perfect-blue          8.860308
216032                the-godfather-part-ii          8.821800
158292                          paris-texas          8.815205
221186                  the-last-dance-2020          8.806071
...                                     ...               ...
177306                           saint-judy          6.210822
70053              everybody-dies-sometimes          6.210759
246586  tomorrow-ill-be-someones-girlfriend          6.210515
19913          arsenie-an-amazing-afterlife          6.210468
213453        the-fast-and-the-furious-2001          6.210443

[5000 rows x 2 columns]


In [None]:
#print genre of 'nirvana-unplugged-in-new-york'
print(movies_details_df[movies_details_df['movie_name'] == 'come-and-see'])

Empty DataFrame
Columns: [movie_name, director, actors, genres]
Index: []


In [None]:
import sqlite3
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split

# Connect to SQLite database and load movie details
conn = sqlite3.connect('my_letterboxd_data.db')

# Load ratings data
query_ratings = """
SELECT username, movie_name, rating
FROM users
"""
ratings_df = pd.read_sql(query_ratings, conn)

# Load movie details
query_movie_details = """
SELECT movie_name, director, actors, genres
FROM film_details
"""
movies_details_df = pd.read_sql(query_movie_details, conn)

conn.close()

# Data preprocessing
ratings_df['rating'] = ratings_df['rating'].astype(float)
ratings_df = ratings_df.dropna(subset=['rating'])
movies_details_df.fillna('', inplace=True)  # Handle missing values
print(ratings_df.head(2))
# Merge ratings with movie details
df = pd.merge(ratings_df, movies_details_df, on='movie_name', how='left')
movie_names = df['movie_name'].unique()
# Encoding categorical features
from sklearn.preprocessing import LabelEncoder
import joblib  # Import joblib

def encode_and_save_column(column, name):
    encoder = LabelEncoder()
    transformed = encoder.fit_transform(column)
    joblib.dump(encoder, f'{name}_encoder.joblib')  # Save the encoder
    return transformed, len(encoder.classes_)

for feature in ['username', 'movie_name', 'director', 'actors', 'genres']:
    df[feature], num_classes = encode_and_save_column(df[feature],feature)
    df[feature] = df[feature].astype('int64')

print('hi')
# Convert to TensorFlow dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('rating')
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

train, test = train_test_split(df, test_size=0.2, random_state=42)
train_ds = df_to_dataset(train, batch_size=256)
test_ds = df_to_dataset(test, shuffle=False, batch_size=256)

print('hi2')
# Model
class RecommenderModel(tfrs.Model):
    def __init__(self, user_model, movie_model, director_model, genre_model, actor_model, rating_model):
        super().__init__()
        self.movie_model = movie_model
        self.user_model = user_model
        self.director_model = director_model
        self.genre_model = genre_model
        self.actor_model = actor_model
        self.rating_model = rating_model
        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features):
        # Retrieve embeddings for all features
        user_embeddings = self.user_model(features["username"])
        movie_embeddings = self.movie_model(features["movie_name"])
        director_embeddings = self.director_model(features["director"])
        genre_embeddings = self.genre_model(features["genres"])
        actor_embeddings = self.actor_model(features["actors"])

        # Combine all embeddings and pass to the rating prediction model
        combined_embeddings = tf.concat(
            [user_embeddings, movie_embeddings, director_embeddings, genre_embeddings, actor_embeddings],
            axis=1)
        return self.rating_model(combined_embeddings)

    def compute_loss(self, data, training=False):
        features, labels = data
        rating_predictions = self(features)
        return self.task(labels=labels, predictions=rating_predictions)

# Define the models for users and movies
embedding_dimension = 256

user_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['username'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

movie_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['movie_name'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])
# Define the embedding layers for each new feature
director_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['director'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

genre_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['genres'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

actor_model = tf.keras.Sequential([
    layers.Embedding(input_dim=df['actors'].max()+1, output_dim=embedding_dimension),
    layers.Flatten()
])

rating_model = tf.keras.Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model = RecommenderModel(user_model, movie_model, director_model, genre_model, actor_model, rating_model)
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(0.001))

# Train the model
model.fit(train_ds, epochs=1, validation_data=test_ds, steps_per_epoch=100)
# model.save('/kaggle/working/my_recommender_model.h5')


   username       movie_name  rating
0  mmoorthy  mean-girls-2024     6.0
1  mmoorthy   anyone-but-you     4.0
hi


2024-04-11 23:39:43.094854: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-04-11 23:39:43.094913: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-04-11 23:39:43.094921: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-04-11 23:39:43.095156: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-11 23:39:43.095174: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


hi2


2024-04-11 23:39:44.937282: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-04-11 23:39:45.000586: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp_10.
2024-04-11 23:39:55.042826: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] ShuffleDatasetV3:1: Filling up shuffle buffer (this may take a while): 5937728 of 12120589
2024-04-11 23:40:05.364316: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] Shuffle buffer filled.




<keras.src.callbacks.History at 0x28b348850>

In [None]:
director_names = df['director'].unique()
actor_names = df['actors'].unique()
genre_names = df['genres'].unique()

In [None]:
def make_predictions_for_user(username, top_k, user_encoder, movie_encoder, director_encoder, genre_encoder, actor_encoder, model, movies_df, ratings_df):
    try:
        # Attempt to encode the user
        encoded_user = user_encoder.transform([username])[0]
    except ValueError:
        print(f"Username {username} not seen during training.")
        return pd.DataFrame()

    # Retrieve all movies seen by the user
    seen_movies = ratings_df[ratings_df['username'] == username]['movie_name'].unique()
    
    # Filter out movies that have been seen
    unseen_movies_df = movies_df[~movies_df['movie_name'].isin(seen_movies)]
    # make sure unseen_movies_df only contains directors, actors, and genres that were seen during training
    unseen_movies_df = unseen_movies_df[unseen_movies_df['director'].isin(director_names)]
    unseen_movies_df = unseen_movies_df[unseen_movies_df['actors'].isin(actor_names)]
    unseen_movies_df = unseen_movies_df[unseen_movies_df['genres'].isin(genre_names)]
    if unseen_movies_df.empty:
        print("No unseen movies found to recommend.")
        return pd.DataFrame()

    # Encode all features of the unseen movies
    try:
        encoded_movie_names = movie_encoder.transform(unseen_movies_df['movie_name'])
        encoded_directors = director_encoder.transform(unseen_movies_df['director'])
        encoded_genres = genre_encoder.transform(unseen_movies_df['genres'])
        encoded_actors = actor_encoder.transform(unseen_movies_df['actors'])
    except ValueError:
        print("Error in encoding one of the features.")
        return pd.DataFrame()

    # Prepare dataset for prediction
    features_dict = {
        'username': np.repeat(encoded_user, len(unseen_movies_df)),
        'movie_name': encoded_movie_names,
        'director': encoded_directors,
        'genre': encoded_genres,
        'actor': encoded_actors
    }

    # Convert to a TensorFlow dataset
    predict_ds = tf.data.Dataset.from_tensor_slices(features_dict)
    predict_ds = predict_ds.batch(256)  # Use the same batch size as during training

    # Make predictions
    predictions = model.predict(predict_ds)

    # Create a DataFrame with movies and their predicted ratings
    predictions_df = pd.DataFrame({
        'movie_name': unseen_movies_df['movie_name'].values,  # use original names for clarity
        'predicted_rating': predictions.flatten()
    })

    # Sort movies based on predicted ratings
    top_k_predictions = predictions_df.sort_values(by='predicted_rating', ascending=False).head(top_k)

    return top_k_predictions

# Load the saved encoders and model
user_encoder = joblib.load('username_encoder.joblib')
movie_encoder = joblib.load('movie_name_encoder.joblib')
director_encoder = joblib.load('director_encoder.joblib')
genre_encoder = joblib.load('genres_encoder.joblib')
actor_encoder = joblib.load('actors_encoder.joblib')
# model = tf.keras.models.load_model('/kaggle/working/my_recommender_model.h5')

# Example usage
username = 'nconterno'
top_k = 5

top_k_recommendations = make_predictions_for_user(username, top_k, user_encoder, movie_encoder, director_encoder, genre_encoder, actor_encoder, model, movies_details_df, ratings_df)
print(top_k_recommendations)


No unseen movies found to recommend.
Empty DataFrame
Columns: []
Index: []
