In [1]:
## Mount Google Drive
## The data and parameter directories configured later in this notebook live under
## MyDrive, so the Drive has to be mounted (at /content/drive) before any of them is read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os

In [3]:
#define variables here:
# Both values are used as plain string prefixes (file names are appended with f-strings),
# so each one must end with a '/'.
# NOTE(review): the paths are relative, so they only point inside the Drive mounted at
# /content/drive when the working directory is /content — TODO confirm for your runtime.
path_to_data = 'drive/MyDrive/netflix/' #change this to the path of your netflix competition data directory
model_dir = 'drive/MyDrive/Params/SVD/' #change this to the path of where you'd like to save model parameters

In [4]:
#Function to load all of the data into a pandas df
def load_netflix_data(file_paths):
    """Load Netflix rating files into a single DataFrame.

    Each file is read as a sequence of sections: a header line ``<movie_id>:``
    followed by one ``<customer_id>,<rating>,<third field>`` line per rating.
    The third field is parsed but discarded.

    Args:
        file_paths: Iterable of paths to the rating files, read in order.

    Returns:
        A DataFrame with the columns ``user``, ``item`` and ``label`` (the rating),
        one row per rating line, in file order.

    Raises:
        ValueError: If a rating line appears before any ``<movie_id>:`` header in
            its file, or if a line cannot be parsed.
    """
    data = {'user': [], 'item': [], 'label': []}
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            # The movie id is reset per file so a header never leaks across files.
            movie_id = None
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                if line.endswith(':'):
                    movie_id = int(line[:-1])
                    continue
                if movie_id is None:
                    # Without this check the rating would be stored with item=None.
                    raise ValueError(
                        f'{file_path}:{line_number}: rating line found before any movie id header'
                    )
                customer_id, rating, _ = line.split(',')
                data['item'].append(movie_id)
                data['user'].append(int(customer_id))
                data['label'].append(int(rating))
    return pd.DataFrame(data)

In [5]:
#parse the four combined_data_N.txt files (N = 1..4) from the data directory into one pandas df
data_files = ['{}combined_data_{}.txt'.format(path_to_data, part) for part in range(1, 5)]
df = load_netflix_data(data_files)

In [6]:
#print the number of rating rows, then preview the first rows of the df
print(df.shape[0])
df.head()

100480507


Unnamed: 0,user,item,label
0,1488844,1,3
1,822109,1,5
2,885013,1,4
3,30878,1,4
4,823519,1,3


In [7]:
#Function to load the probe.txt file (which contains a list of user/movie pairs to be held out and used for testing) into a pandas df
def load_probe(file_path):
    """Load `probe.txt` into a DataFrame of (item, user) pairs.

    The file is read as a sequence of sections: a header line ``<movie_id>:``
    followed by one customer id per line. Blank lines are ignored.

    Args:
        file_path: Path to the probe file.

    Returns:
        A DataFrame with the columns ``item`` and ``user``, one row per customer
        line, in file order. The columns are present even when the file holds
        no pairs, so the result can always be merged on them.

    Raises:
        ValueError: If a customer line appears before any ``<movie_id>:`` header,
            or if a line is not an integer.
    """
    data = []
    current_movie_id = None
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no data; int('') would raise otherwise.
                continue
            if line.endswith(':'):
                # Header line: remember the movie id for the customer lines that follow.
                current_movie_id = int(line[:-1])
                continue
            if current_movie_id is None:
                raise ValueError(
                    f'{file_path}:{line_number}: customer id found before any movie id header'
                )
            data.append({'item': current_movie_id, 'user': int(line)})
    # Explicit columns keep the schema stable for an empty file.
    return pd.DataFrame(data, columns=['item', 'user'])

In [8]:
#Parse probe.txt (stored in the data directory) into the df of test user/movie pairs
probe_path = '{}probe.txt'.format(path_to_data)
probe_df = load_probe(probe_path)

In [9]:
#Inner-join the test user/movie pairs with the full ratings df, so that every test pair carries its ground truth rating
df_test = pd.merge(probe_df, df, how='inner', on=['item', 'user'])

In [10]:
#print the number of test rows, then preview the first rows of the test df
print(df_test.shape[0])
df_test.head()

1408395


Unnamed: 0,item,user,label
0,1,30878,4
1,1,2647871,4
2,1,1283744,3
3,1,2488120,5
4,1,317050,5


In [11]:
# Perform an anti-join to create a df with the rows in df that are not in df_test.
# Only the key columns of df_test take part in the merge: bringing its `label` column
# along would duplicate it into label_x/label_y on every row of df and require a
# rename/drop afterwards.
df_train = pd.merge(
    df,
    df_test[['user', 'item']],
    on=['user', 'item'],
    how='left',
    indicator=True
)

# '_merge' is added by indicator=True; 'left_only' marks rows present in df but not in df_test
df_train = df_train[df_train['_merge'] == 'left_only'].drop(columns='_merge')

In [12]:
#print the number of training rows, then preview the first rows of the train df
print(df_train.shape[0])
df_train.head()

99072112


Unnamed: 0,user,item,label
0,1488844,1,3
1,822109,1,5
2,885013,1,4
4,823519,1,3
5,893988,1,3


In [13]:
#install the surprise library which will be used for matrix factorization with SVD
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505166 sha256=518852a2cb369cc911092fab0ca97a480e9a1bb7cebe237916a030e1a5c60d54
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Instal

In [14]:
#Dataset and Reader (from the surprise library) turn a pandas df into surprise-ready data
from surprise import Dataset, Reader

#the reader declares the rating scale of the labels: 1 to 5
reader = Reader(rating_scale=(1, 5))
#build the dataset from the training df, passing the columns in the order user, item, rating
data = Dataset.load_from_df(df_train.loc[:, ['user', 'item', 'label']], reader)

In [15]:
#Import the surprise libraries that will be used for training and testing
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline

In [16]:
#build a trainset from the data (every rating in `data` is used for training)
trainset = data.build_full_trainset()

#Create an SVD object with the specified hyperparameters(play around with these and see if you can find better hyperparams!)
#random_state seeds the random initialisation of the factors, so that re-running the
#notebook reproduces the same model and the same probe RMSE
svd = SVD(n_factors=15, reg_all=0.02, random_state=42)
#Factor the matrix with SVD
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7abe926967d0>

In [17]:
from sklearn.metrics import mean_squared_error

# Predict test set ratings using the SVD model.
# The three columns are iterated in lockstep instead of using iterrows(), which
# builds a full pandas Series for every one of the test rows.
predictions = [
    (svd.predict(user_id, item_id).est, true_rating)
    for user_id, item_id, true_rating in zip(df_test['user'], df_test['item'], df_test['label'])
]

# Extract predicted and actual ratings
y_pred, y_true = zip(*predictions)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE on probe set: {rmse:.4f}")

RMSE on probe set: 0.9303


Save parameters

In [18]:
import pickle

# Make sure the target directory exists: open(..., 'wb') does not create missing directories
os.makedirs(model_dir, exist_ok=True)

# Save the model
with open(f'{model_dir}svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)

# Load the model
# (pickle.load can execute arbitrary code, so only load files written by this notebook)
"""
with open(f'{model_dir}svd_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
"""

"\nwith open(f'{model_dir}svd_model.pkl', 'rb') as f:\n    loaded_model = pickle.load(f)\n"

In [19]:
#Save parameters
# Make sure the target directory exists: np.save does not create missing directories
os.makedirs(model_dir, exist_ok=True)

# One .npy file per learned array of the fitted SVD model:
# pu/qi are the user/item factors, bu/bi the user/item biases
np.save(f'{model_dir}user_factors.npy', svd.pu)
np.save(f'{model_dir}item_factors.npy', svd.qi)
np.save(f'{model_dir}user_biases.npy', svd.bu)
np.save(f'{model_dir}item_biases.npy', svd.bi)

# Load parameters
"""
user_factors = np.load(f'{model_dir}user_factors.npy')
item_factors = np.load(f'{model_dir}item_factors.npy')
user_biases = np.load(f'{model_dir}user_biases.npy')
item_biases = np.load(f'{model_dir}item_biases.npy')
"""

"\nuser_factors = np.load(f'{model_dir}user_factors.npy')\nitem_factors = np.load(f'{model_dir}item_factors.npy')\nuser_biases = np.load(f'{model_dir}user_biases.npy')\nitem_biases = np.load(f'{model_dir}item_biases.npy')\n"