In [None]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate

In [None]:
# MovieLens "latest" dataset: https://grouplens.org/datasets/movielens/latest/
# The three CSV exports are read from the notebook's working directory.
ratings_df, movies_df, tags_df = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in ('./ratings.csv', './movies.csv', './tags.csv')
)

In [None]:
# Combine genres + tags into one lowercase, pipe-delimited feature string per movie.
# Tags are free text, so drop missing values and coerce the rest to str before joining:
# a single NaN (a float) inside a group would make '|'.join raise TypeError.
tags_present = tags_df.dropna(subset=['tag'])
tags_grouped = (
    tags_present.assign(tag=tags_present['tag'].astype(str))
    .groupby('movieId')['tag']
    .agg('|'.join)
    .reset_index()
)

# Left-merge so movies without any tag are kept (genres are already a column of movies_df)
movies_with_tags = pd.merge(movies_df, tags_grouped, on='movieId', how='left')
# Untagged movies come out of the merge with NaN; use an empty string so the concatenation below works
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')
movies_with_tags['features'] = (
    movies_with_tags['genres'].str.lower() + '|' + movies_with_tags['tag'].str.lower()
)

In [None]:
# Lookup table: movieId -> positional row in movies_with_tags
_movie_ids_in_row_order = movies_with_tags['movieId'].values
movie_id_to_index = dict(zip(_movie_ids_in_row_order, range(len(_movie_ids_in_row_order))))

In [None]:
# Bag-of-words encoding of the pipe-delimited feature strings.
# The token pattern treats every run of non-'|' characters as one token,
# so a multi-word tag stays a single feature.
vectorizer = CountVectorizer(token_pattern=r'[^|]+')
vectorizer.fit(movies_with_tags['features'])
X_m = vectorizer.transform(movies_with_tags['features'])
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Dump the vocabulary (one feature per line) for manual inspection.
# np.savetxt encodes text as latin1 unless told otherwise, which raises on any tag
# containing a character outside that charset; write UTF-8 instead.
np.savetxt('features.txt', feature_names, fmt='%s', newline='\n', encoding='utf-8')

In [None]:
# Create user feature matrix X_u: one row per user, one column per movie feature.
# Each row is the rating-weighted average of the feature vectors of the movies that user rated.
#
# Rows are assigned by the position of the userId among the sorted unique ids instead of
# `userId - 1`, so ids that do not run 1..N without gaps can no longer overflow X_u or
# land on another user's row. For contiguous 1..N ids both schemes give identical rows.
user_id_to_index = {uid: row for row, uid in enumerate(sorted(ratings_df['userId'].unique()))}
num_users = len(user_id_to_index)
num_features = X_m.shape[1]
X_u = np.zeros((num_users, num_features))

for uid, group in ratings_df.groupby('userId'):
    rated_movies = group['movieId'].values
    ratings = group['rating'].values

    # keep only movies that have a feature row, preserving the movie/rating pairing
    movie_to_rating_pairs = [(mid, r) for mid, r in zip(rated_movies, ratings) if mid in movie_id_to_index]
    if not movie_to_rating_pairs:
        continue
    movie_ids, weights = zip(*movie_to_rating_pairs)
    weights = np.asarray(weights, dtype=float)
    # np.average raises ZeroDivisionError when the weights sum to zero; leave the zero profile
    if weights.sum() == 0:
        continue
    movie_indices = [movie_id_to_index[movie_id] for movie_id in movie_ids]

    movie_features = X_m[movie_indices].toarray()
    user_profile = np.average(movie_features, axis = 0, weights = weights)
    X_u[user_id_to_index[uid]] = user_profile

In [None]:
# For each rating row, look up the user's profile row in X_u and the movie's feature row in X_m.
# The user row is the rank of the userId among the sorted unique ids, which is exactly
# `userId - 1` when ids run 1..N without gaps.
user_id_to_index = {uid: row for row, uid in enumerate(sorted(ratings_df['userId'].unique()))}
user_rows = ratings_df['userId'].map(user_id_to_index)
movie_rows = ratings_df['movieId'].map(movie_id_to_index)

# A rating whose movieId is missing from movie_id_to_index maps to NaN, which turns the
# index array into floats and cannot be used to index X_m. Drop those rows from all
# three arrays so they stay aligned.
has_features = movie_rows.notna().to_numpy()
user_indices = user_rows.to_numpy()[has_features]
movie_indices = movie_rows[has_features].astype(int).to_numpy()

# Build aligned (user, item) input arrays — one sample per rating row
# user_inputs[i] corresponds to item_inputs[i] and y_all[i]
user_inputs = X_u[user_indices]
# NOTE: this densifies one feature row per rating, so memory grows with n_ratings * n_features
item_inputs = X_m[movie_indices].toarray()
y_all = ratings_df['rating'].values[has_features]

In [None]:
# Sanity check: all three arrays must have the same number of rows
print(
    'aligned shapes (user_inputs, item_inputs, y):',
    *(arr.shape for arr in (user_inputs, item_inputs, y_all)),
)

In [None]:
# Single train/test split applied to the aligned arrays, so row i of every split array
# still refers to the same rating.
# A fixed random_state makes the split repeatable across kernel restarts; without it
# every run trains and evaluates on a different set of rows.
SPLIT_SEED = 42
user_train, user_test, item_train, item_test, y_train, y_test = train_test_split(
    user_inputs, item_inputs, y_all,
    train_size=0.8, shuffle=True, random_state=SPLIT_SEED
)

# Convert dtypes to float32 (TF/Keras friendly)
user_train = user_train.astype(np.float32)
item_train = item_train.astype(np.float32)
y_train = y_train.astype(np.float32)
user_test = user_test.astype(np.float32)
item_test = item_test.astype(np.float32)
y_test = y_test.astype(np.float32)

# Scale features: every scaler is fitted on the training split only and then applied
# to both splits, so no test-set statistics leak into the transform.
# with_mean=False rescales by the standard deviation without centring the columns.
scalerItem = StandardScaler(with_mean=False)
scalerItem.fit(item_train)
item_train_scaled = scalerItem.transform(item_train)
item_test_scaled = scalerItem.transform(item_test)

scalerUser = StandardScaler(with_mean=False)
scalerUser.fit(user_train)
user_train_scaled = scalerUser.transform(user_train)
user_test_scaled = scalerUser.transform(user_test)

# Map the target into [-1, 1]; the scaler expects a 2-D column, hence the reshapes
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train_scaled = scalerTarget.transform(y_train.reshape(-1, 1)).reshape(-1)
y_test_scaled = scalerTarget.transform(y_test.reshape(-1, 1)).reshape(-1)

In [None]:
print('train shapes (user, item, y):', user_train.shape, item_train.shape, y_train.shape)
print('test shapes  (user, item, y):', user_test.shape, item_test.shape, y_test.shape)

In [None]:
# Build the two tower networks (user side and item side).
def _dense_tower(hidden_units, out_dim):
    """Stack ReLU Dense layers of the given widths, ending in a linear Dense of size out_dim."""
    stack = [tf.keras.layers.Dense(width, activation='relu') for width in hidden_units]
    stack.append(tf.keras.layers.Dense(out_dim))
    return tf.keras.models.Sequential(stack)


# Both towers end in the same width so their outputs can be combined with a dot product
num_outputs = 32
user_NN = _dense_tower((256, 128), num_outputs)
item_NN = _dense_tower((128, 64), num_outputs)

In [None]:
num_user_features = user_train.shape[1]
num_item_features = item_train.shape[1]


def _normalised_embedding(tower, n_features):
    """Create an input of width n_features, run it through tower, and L2-normalise each row."""
    tower_input = tf.keras.layers.Input(shape=(n_features,))
    embedding = tower(tower_input)
    embedding = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis = 1))(embedding)
    return tower_input, embedding


# user branch: input -> user_NN -> unit-length vector
input_user, vu = _normalised_embedding(user_NN, num_user_features)

# item branch: input -> item_NN -> unit-length vector
input_item, vm = _normalised_embedding(item_NN, num_item_features)

# the prediction is the dot product of the two unit vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

In [None]:
# Tie the two inputs and the dot-product output together into one trainable model
model = tf.keras.Model(inputs=[input_user, input_item], outputs=output)

In [None]:
# Adam with a 0.01 learning rate, minimising mean squared error
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss=tf.keras.losses.MeanSquaredError())

In [None]:
early_stopping_callback = EarlyStopping(
    monitor = 'loss',
    patience = 3,
    min_delta = 0.0001,
    restore_best_weights = True
)
model.fit(
    [user_train, item_train], 
    y_train, 
    epochs = 30,
    callbacks = [early_stopping_callback]
)

In [None]:
model.evaluate([user_test, item_test], y_test)

In [None]:
# Reverse lookup: feature name -> column position in X_m / X_u
feature_to_index = dict(zip(feature_names, range(len(feature_names))))

In [None]:
# Create a new user from hand-picked feature scores.
# NOTE(review): the profiles in X_u are rating-weighted averages of movie feature counts,
# whereas these are raw scores up to 5 - confirm the two scales are meant to be comparable.
new_user_preferences = {
    '1990s': 4,
    'animation': 5,
    'comedy': 5,
    'kids': 5,
    'adorable': 5,
    'funny': 5,
    'whimsical': 5,
    'romantic': 5,
    'romantic comedy': 5,
    'macaulay culkin': 4,
    'humour': 5,
    'great humor': 5,
    'cartoon': 5,
    'anime': 5,
    'disney': 5,
    'hayao miyazaki': 5,
    'food': 5,
    'quirky romantic': 5,
    'sentimental': 5,
    'adventure': 3,
}

new_user_profile = np.zeros(num_features)
missing_features = []
for feature, score in new_user_preferences.items():
    column = feature_to_index.get(feature)
    # A feature absent from the vocabulary must be skipped explicitly: indexing with None
    # is np.newaxis, so `new_user_profile[None] = score` would overwrite every entry.
    if column is None:
        missing_features.append(feature)
        continue
    new_user_profile[column] = score

if missing_features:
    print('Skipped features that are not in the vocabulary:', missing_features)

In [None]:
# Score every movie for the new user.
# The profile is a single sample, so give it a leading batch axis before scaling.
user_profile_scaled = scalerUser.transform(new_user_profile[np.newaxis, :])

# Dense, scaled feature rows for the whole catalogue
items_all = X_m.toarray().astype(np.float32)
items_all_scaled = scalerItem.transform(items_all)

# Pair each item row with its own copy of the user profile, then predict in batches
n_items = items_all_scaled.shape[0]
users_rep = np.repeat(user_profile_scaled, n_items, axis = 0)
preds_scaled = model.predict([users_rep, items_all_scaled], batch_size=1024, verbose=0).reshape(-1)

# Map the model output back onto the original rating scale
predictions = scalerTarget.inverse_transform(preds_scaled[:, np.newaxis]).reshape(-1)

In [None]:
# Get top recommendations
K = 20
# argsort is ascending, so reverse it to rank the highest predicted rating first
rank_ix = np.argsort(predictions)[::-1]
recommendation_indices = list(rank_ix[:K])
movie_titles = movies_df.set_index('movieId')['title'].to_dict()

# Map row positions -> titles and print.
# predictions[i] belongs to row i of X_m, which was built from movies_with_tags in row order,
# so look the movie up by position (.iloc) rather than by index label.
print("\nTop recommendations:")
for idx in recommendation_indices:
    movie_id = movies_with_tags['movieId'].iloc[idx]
    title = movie_titles.get(movie_id, f"MovieId {movie_id}")
    print(f"{title}: predicted {predictions[idx]:0.3f}")

In [None]:
# All the recommendations have a predicted rating of 5.0. The conclusion I draw from that is that the features I used have too little overlap,
# i.e. many movies with highly-rated features are missing poorly-rated features and vice versa.
# So for this particular user, this model might be less accurate than the collaborative filtering model - at least until the user rates more features.
# I might also get higher accuracy by reducing the number of features or applying the existing features to more movies.