# In [2]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_recommenders as tfrs
from sklearn.preprocessing import LabelEncoder
# ---------------------------------------------------------------------------
# Load the raw tables from the local SQLite database.
# ---------------------------------------------------------------------------
query_ratings = """
SELECT username, movie_name, rating
FROM users
"""
query_movie_details = """
SELECT movie_name, genres, actors
FROM film_details
"""

conn = sqlite3.connect('my_letterboxd_data.db')
try:
    ratings_df = pd.read_sql(query_ratings, conn)
    movies_details_df = pd.read_sql(query_movie_details, conn)
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()
print('loaded data')

# ---------------------------------------------------------------------------
# Turn the comma-separated genre/actor strings into Python lists.
# NULL or empty cells become an empty list.
# ---------------------------------------------------------------------------
movies_details_df['genres'] = movies_details_df['genres'].apply(
    lambda cell: cell.split(', ') if cell else []
)
movies_details_df['actors'] = movies_details_df['actors'].apply(
    lambda cell: cell.split(', ') if cell else []
)
print('processed data')

# ---------------------------------------------------------------------------
# Keep only the 50 most frequently credited actors so the one-hot actor
# encoding below stays small.
# ---------------------------------------------------------------------------
# explode() flattens the per-movie lists; empty lists become NaN, which
# value_counts() drops. Unlike np.concatenate this also works when the table
# has no rows at all.
actor_frequency = movies_details_df['actors'].explode().value_counts()
top_actors = actor_frequency.head(50).index.tolist()  # Names of the top 50 actors
print('calculated actor frequency')

# A set gives O(1) membership tests inside the per-row filter.
top_actor_set = set(top_actors)
movies_details_df['actors'] = movies_details_df['actors'].apply(
    lambda names: [name for name in names if name in top_actor_set]
)
print('filtered actors')

# ---------------------------------------------------------------------------
# One-hot encode genres and (top) actors with MultiLabelBinarizer.
# ---------------------------------------------------------------------------
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(movies_details_df['genres'])
genre_columns = ['genre_' + col for col in mlb_genres.classes_]
movies_details_df = movies_details_df.join(
    pd.DataFrame(genres_encoded, columns=genre_columns, index=movies_details_df.index)
)
print('encoded genres')

mlb_actors = MultiLabelBinarizer()
actors_encoded = mlb_actors.fit_transform(movies_details_df['actors'])
actor_columns = ['actor_' + col for col in mlb_actors.classes_]
movies_details_df = movies_details_df.join(
    pd.DataFrame(actors_encoded, columns=actor_columns, index=movies_details_df.index)
)
print('encoded actors')

# ---------------------------------------------------------------------------
# Attach the movie features to every rating row.
# ---------------------------------------------------------------------------
# The list-valued 'genres'/'actors' columns are left out of the merge: they
# are already represented by the indicator columns and cannot be converted to
# numeric tensors later on. movies_details_df itself keeps them.
# NOTE(review): if film_details can hold several rows per movie_name, this
# left merge duplicates the corresponding ratings - confirm the table is
# unique on movie_name.
movie_features_df = movies_details_df.drop(columns=['genres', 'actors'])
merged_df = pd.merge(ratings_df, movie_features_df, on='movie_name', how='left')

# Rated movies without a film_details row get NaN indicators from the left
# merge; treat them like a movie with no known genres/actors (all zeros).
indicator_columns = genre_columns + actor_columns
if indicator_columns:
    merged_df[indicator_columns] = merged_df[indicator_columns].fillna(0)
print('merged data')

# ---------------------------------------------------------------------------
# Hold out 20% of the rating rows for evaluation.
# ---------------------------------------------------------------------------
train_df, test_df = train_test_split(merged_df, test_size=0.2)
print('split data')
# Function to convert DataFrame to TensorFlow dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32,
                  user_encoder=None, movie_encoder=None):
    """Convert a ratings DataFrame into a batched ``tf.data.Dataset``.

    The frame must contain the columns ``rating``, ``username`` and
    ``movie_name``. Every other *numeric* column is passed through as a
    float32 feature; non-numeric columns (for example list-valued columns)
    are skipped because they cannot be converted to a dense tensor.

    Args:
        dataframe: Source data; it is copied, never modified in place.
        shuffle: Whether to shuffle the examples with a full-size buffer.
        batch_size: Number of examples per batch.
        user_encoder: Optional already-fitted ``LabelEncoder`` for
            ``username``. When omitted, a fresh encoder is fitted on this
            frame only, so the resulting IDs are not comparable with IDs
            produced by another call.
        movie_encoder: Same as ``user_encoder``, for ``movie_name``.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name (``username`` / ``movie_name`` as
        int32 codes, everything else float32) and ``labels`` holds the
        float32 ratings.

    Raises:
        ValueError: If a supplied encoder meets a label it was not fitted on.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop('rating').to_numpy(dtype=np.float32)

    features = {}

    # Encode the string identifiers as integer codes and keep them integral:
    # they are lookup indices, and a float32 round-trip would stop being
    # exact above 2**24.
    for id_column, encoder in (('username', user_encoder),
                               ('movie_name', movie_encoder)):
        raw_ids = dataframe.pop(id_column)
        if encoder is None:
            codes = LabelEncoder().fit_transform(raw_ids)
        else:
            codes = encoder.transform(raw_ids)
        features[id_column] = np.asarray(codes, dtype=np.int32)

    # Pass the remaining numeric columns through as float32. Object columns
    # (such as lists of genre/actor names) are skipped; a blanket
    # astype(float32) raises on them.
    for name, column in dataframe.items():
        if pd.api.types.is_numeric_dtype(column):
            features[name] = column.to_numpy(dtype=np.float32)

    ds = tf.data.Dataset.from_tensor_slices((features, labels))

    if shuffle:
        # The shuffle buffer must be at least 1, even for an empty frame.
        ds = ds.shuffle(buffer_size=max(len(labels), 1))

    return ds.batch(batch_size)

# Build the input pipelines. Only the training split is shuffled: evaluation
# metrics do not depend on example order, so shuffling the held-out split
# would just cost a full-size shuffle buffer.
train_ds = df_to_dataset(train_df)
test_ds = df_to_dataset(test_df, shuffle=False)
print('converted to dataset')
# TensorFlow Recommenders Model
class RecommenderModel(tfrs.Model):
    """Rating regressor built from user and movie embeddings.

    The user and movie embeddings are concatenated, passed through one ReLU
    hidden layer and projected to a single predicted rating. Training uses a
    TFRS ranking task with mean-squared-error loss.

    Args:
        num_users: Size of the user embedding table. Defaults to the number
            of distinct ``username`` values in the module-level ``train_df``.
        num_movies: Size of the movie embedding table. Defaults to the number
            of distinct ``movie_name`` values in ``train_df``.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users=None, num_movies=None, embedding_dim=32):
        super().__init__()
        if num_users is None:
            num_users = train_df['username'].nunique()
        if num_movies is None:
            num_movies = train_df['movie_name'].nunique()

        self.user_embedding = layers.Embedding(input_dim=num_users, output_dim=embedding_dim)
        self.movie_embedding = layers.Embedding(input_dim=num_movies, output_dim=embedding_dim)
        self.dense = layers.Dense(128, activation='relu')
        # Not named `outputs`: that attribute name is used by Keras `Model`
        # itself, so storing a layer under it risks being clobbered.
        self.rating_head = layers.Dense(1)

        # tfrs.Model drives training through compute_loss(); the task bundles
        # the loss with the metric reported during fit/evaluate.
        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

    def call(self, inputs):
        """Predict a rating for each example.

        Args:
            inputs: Feature dict containing ``username`` and ``movie_name``
                ID tensors; any other keys are ignored.

        Returns:
            The output of the final ``Dense(1)`` layer, one value per example.
        """
        # Embedding lookups need integer indices; cast so the model also
        # accepts IDs that arrive as floats.
        user_ids = tf.cast(inputs['username'], tf.int32)
        movie_ids = tf.cast(inputs['movie_name'], tf.int32)

        user_vec = self.user_embedding(user_ids)
        movie_vec = self.movie_embedding(movie_ids)
        x = tf.concat([user_vec, movie_vec], axis=1)
        x = self.dense(x)
        return self.rating_head(x)

    def compute_loss(self, inputs, training=False):
        """Return the task loss for one ``(features, labels)`` batch.

        Args:
            inputs: Tuple ``(features, labels)`` as yielded by the dataset.
            training: Whether the call happens during training; not used by
                this model, which has no training-specific layers.

        Returns:
            The scalar loss computed by the ranking task.
        """
        features, labels = inputs
        predictions = self(features)
        return self.task(labels=labels, predictions=predictions)

# Build and train the model. Ratings are continuous values, so the model is
# tracked with RMSE; 'accuracy' is a classification metric and says nothing
# useful about a regression output.
model = RecommenderModel()
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)
model.fit(train_ds, epochs=5, validation_data=test_ds)

# Evaluate the model on the held-out split
print("Evaluation:", model.evaluate(test_ds))


# Output:
# loaded data
# processed data
# calculated actor frequency
# filtered actors
# encoded genres
# encoded actors
# merged data
# split data
