In [1]:
#Mount Google Drive for accessing netflix data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Install the LibRecommender library and compatible versions of tf and keras
!pip install tensorflow==2.15.0 keras==2.15.0 LibRecommender

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting keras==2.15.0
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
#Imports
import numpy as np
import pandas as pd
import tensorflow as tf
from libreco.data import random_split, DatasetPure
from libreco.algorithms import NCF
#from libreco.evaluation import evaluate

Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
#Some variables
data_dir = "drive/MyDrive/netflix" #replace with the directory that contains your Netflix prize data
model_dir = "drive/MyDrive/models" #replace this with the directory where you would like to save your NCF model files
model_name = "ncf_model" #replace this with the name of your ncf model

In [5]:
#Function to load all of the netflix prize data into a pandas df
def load_netflix_data(file_paths):
    """Load Netflix data from multiple files into a single DataFrame."""
    data = {'user': [], 'item': [], 'label': []}
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            movie_id = None
            for line in file:
                line = line.strip()
                if line.endswith(':'):
                    movie_id = int(line[:-1])
                else:
                    customer_id, rating, _ = line.split(',')
                    data['item'].append(movie_id)
                    data['user'].append(int(customer_id))
                    data['label'].append(int(rating))
    return pd.DataFrame(data)

In [6]:
#Use the function above to load the data into a df
data_files = [f'{data_dir}/combined_data_{i}.txt' for i in range(1, 5)]
df = load_netflix_data(data_files)

In [7]:
#Examine the df
df.head()

Unnamed: 0,user,item,label
0,1488844,1,3
1,822109,1,5
2,885013,1,4
3,30878,1,4
4,823519,1,3


Split the probe data as 'test'

In [8]:
#Function to load the probe.txt file (which contains a list of user/movie pairs to be held off and used for testing), into a pandas df
def load_probe(file_path):
    """Load `probe.txt` into a DataFrame."""
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Ignore empty or non-data lines
            line = line.strip()
            if line.endswith(':'):  # Skip MovieID lines
                current_movie_id = int(line[:-1])
            else:
                # CustomerID, MovieID
                customer_id = int(line)
                data.append({'item': current_movie_id, 'user': customer_id})
    return pd.DataFrame(data)

In [9]:
#Load the list of test user/movie pairs
probe_path = f'{data_dir}/probe.txt'
probe_df = load_probe(probe_path)

In [10]:
#Join the list of test user/movie pairs to the dataframe to create a df that contains the test user/movie pairs with the ground truth rating info
df_test = probe_df.merge(df, on=['item', 'user'], how='inner')

In [11]:
# Perform an anti-join to keep rows in df that are not in df_test
df_train = pd.merge(
    df,
    df_test,
    on=['user', 'item'],
    how='left',
    indicator=True
)

# Keep only rows that are present in df but not in df_test
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns='_merge')

df_train = df_train.rename(columns={'label_x': 'label'}).drop(columns=['label_y'], errors='ignore')

In [12]:
#Examine the training Dataframe
df_train.head()

Unnamed: 0,user,item,label
0,1488844,1,3
1,822109,1,5
2,885013,1,4
4,823519,1,3
5,893988,1,3


In [13]:
#Examine the testing dataframe
df_test.head()

Unnamed: 0,item,user,label
0,1,30878,4
1,1,2647871,4
2,1,1283744,3
3,1,2488120,5
4,1,317050,5


Model training

In [14]:
#Separate the training data into training and validation using libreco
train_data, val_data = random_split(df_train, multi_ratios=[0.8, 0.2])

In [16]:
#Re-order the testing dataframe to make it compatible with 'DatasetPure.build_testset'
df_test = df_test[['user', 'item', 'label']]

In [17]:
#Create libreco training, eval, and test data objects from the pandas dataframes
train_data, data_info= DatasetPure.build_trainset(train_data)
val_data = DatasetPure.build_evalset(val_data)
test_data = DatasetPure.build_testset(df_test)

In [19]:
#Create a libreco NCF model object with the specified hyperparameters
#Try changing them to see if you can beat my RMSE!!
ncf = NCF(
    task="rating",
    data_info=data_info,
    loss_type="mse",
    embed_size=32,
    n_epochs=3,
    lr=1e-3,
    batch_size=1024,
    dropout_rate=0.5,
    hidden_units=(128, 64),
    num_neg=1,
)

In [20]:
#Fit the NCF model to the training data
ncf.fit(
    train_data,
    neg_sampling=False, #for rating, this param is false else True
    verbose=2,
    eval_data=val_data,
    metrics=["loss"],
)

  net = tf.layers.batch_normalization(net, training=is_training)
Instructions for updating:
Colocations handled automatically by placer.


Training start time: [35m2025-01-31 06:12:10[0m


  net = tf.layers.batch_normalization(net, training=is_training)
  net = tf.layers.dropout(net, dropout_rate, training=is_training)
train: 100%|██████████| 77401/77401 [29:36<00:00, 43.57it/s]


Epoch 1 elapsed: 1776.455s
	 [32mtrain_loss: 0.8183[0m


eval_pointwise: 100%|██████████| 2419/2419 [00:11<00:00, 216.63it/s]


	 eval rmse: 0.8515


train: 100%|██████████| 77401/77401 [29:28<00:00, 43.77it/s]


Epoch 2 elapsed: 1768.496s
	 [32mtrain_loss: 0.6878[0m


eval_pointwise: 100%|██████████| 2419/2419 [00:10<00:00, 221.53it/s]


	 eval rmse: 0.8311


train: 100%|██████████| 77401/77401 [29:31<00:00, 43.69it/s]


Epoch 3 elapsed: 1771.773s
	 [32mtrain_loss: 0.6416[0m


eval_pointwise: 100%|██████████| 2419/2419 [00:10<00:00, 221.57it/s]


	 eval rmse: 0.8259


Test the model

In [21]:
#Create a list of users and a list of items for testing
users = df_test['user'].tolist()
items = df_test['item'].tolist()

# Predict the rating for each user/item pair
predictions = ncf.predict(user=users, item=items)
df_test['prediction'] = predictions

[31mDetect 230 unknown interaction(s), position: [1329153, 517634, 528900, 535560, 472073, 713226, 170002, 1101844, 515606, 1075222, 70169, 291867, 601115, 72227, 526884, 130092, 423980, 630316, 1319469, 1282608, 10294, 4663, 1183289, 706106, 365116, 834622, 704576, 267842, 1269828, 243784, 1302606, 499792, 1351248, 257106, 313941, 407637, 695383, 20568, 526937, 596568, 1009751, 184930, 694371, 980069, 460390, 861288, 225897, 1048681, 414316, 1302637, 44657, 1401969, 175733, 451189, 394873, 821882, 804475, 160380, 598652, 1205887, 604300, 124560, 8337, 128657, 361104, 908946, 768150, 1257111, 1190040, 809116, 1391772, 210593, 7844, 897188, 219814, 1247399, 1321132, 509109, 217788, 1271998, 166592, 1157827, 1108676, 1242307, 499911, 428746, 1209547, 589005, 333009, 676562, 805074, 982739, 21717, 210133, 1099986, 1369815, 533215, 103648, 1350882, 262886, 923878, 447208, 1238768, 1182964, 494330, 727292, 1066750, 1102590, 147209, 1276682, 369933, 808717, 1159950, 1338125, 1011474, 515348

In [22]:
from sklearn.metrics import mean_squared_error

# Compute RMSE using sklearn
rmse = mean_squared_error(df_test['label'], df_test['prediction'])
print("RMSE:", rmse)

RMSE: 0.8567691445350647


In [23]:
#Save the model parameters
ncf.save(model_dir, model_name, data_info)

file folder drive/MyDrive/models doesn't exists, creating a new one...
