In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
import scipy.sparse as sp
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader


In [2]:
movies = pd.read_csv('movies.csv')
ratings_full = pd.read_csv('ratings.csv')
# display(movies)
# display(ratings)
ratings = ratings_full[(ratings_full['userId'] <= 10000)]
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)



In [3]:
## Get number of users and movies
display(ratings)
ratings['userId'] = pd.to_numeric(ratings['userId'], errors='coerce')
num_users = ratings['userId'].max()
num_movies = movies['movieId'].max()
total_movies = movies['movieId'].shape[0]
print("num_users:", num_users)
print("num_movies", num_movies)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
1496607,10000,117176,5.0,1539043478
1496608,10000,118696,2.5,1539035406
1496609,10000,145150,5.0,1539036026
1496610,10000,157296,2.0,1539034792


num_users: 10000
num_movies 209171


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['userId'] = pd.to_numeric(ratings['userId'], errors='coerce')


In [4]:
# try this autoencoder setup first 
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(num_movies, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 1024)
        self.fc4 = nn.Linear(1024, 512)
        self.fc5 = nn.Linear(512, 512)
        self.fc6 = nn.Linear(512, num_movies)
        self.activation = nn.SELU()
        self.dropout = nn.Dropout(0.8)
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.dropout(x)
        x = self.activation(self.fc4(x))
        x = self.activation(self.fc5(x))
        x = self.fc6(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.SGD(sae.parameters(), lr = 0.001, momentum=0.9)

In [5]:
nb_epoch = 18
# batch_size = 50
# train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
unique_user_ids = train_data['userId'].unique()
# print(unique_user_ids)
# Specify the batch size
batch_size = 128 # Adjust this based on your preference
# display(train_data)
# Initialize the DataLoader with the unique user IDs
user_loader = DataLoader(dataset=unique_user_ids, batch_size=batch_size, shuffle=True)
# create the dataset of 32 and then just dataloader the whole set?
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(1, len(unique_user_ids), batch_size): 
        # Extract ratings for all users in mini-batch from ratings df 
        batch_user_ratings = train_data[(train_data['userId']>= id_user) & (train_data['userId'] < id_user + batch_size)] 
        # Transform df into matrix with users as rows and movieids as columns, with ratings as elements 
        batch_ratings_array = []
        batch_unique_user_ids = batch_user_ratings['userId'].unique()
        # print(batch_unique_user_ids)
        for userid in batch_unique_user_ids:  
            ratings_array = np.zeros(num_movies) # initialize zero array with number of movies length
            user_ratings = batch_user_ratings[(batch_user_ratings['userId'] == userid)] # get this user's df section only
            movie_ids = user_ratings['movieId'].values # get all the movies rated by this user 
            user_ratings_array = user_ratings['rating'].values # get all the ratings for those movies 
            ratings_array[movie_ids - 1] = user_ratings_array # place ratings in corresponding movie's index 
            # print("ratings_array: ", ratings_array)

            # non_zero_mean = np.mean(ratings_array[ratings_array != 0])
            # # Subtract mean from non-zero elements
            # ratings_array[ratings_array != 0] -= non_zero_mean
            # print("normalized: ", ratings_array)
            batch_ratings_array.append(ratings_array)
        # print(batch_ratings_array)
        user_loader = DataLoader(dataset=batch_ratings_array, batch_size=batch_size, shuffle=True)
        # user_ratings = ratings.loc[ratings['userId'] == id_user] # df for this specific user 
        # Create an array with zeros for all movies
        i = 0
        for batch in user_loader: 
        # input = torch.tensor(ratings_array, dtype=torch.float)
        # target = input.clone()
            # print(i)
            i+=1
            input = torch.tensor(batch, dtype=torch.float)
            target = input.clone()
            if torch.sum(target.data > 0) > 0:
                output = sae(input)
                target.require_grad = False
                output[target == 0] = 0 # mask
                loss = criterion(output, target)
                # print(loss)
                #  adjust the loss based on the density of the target values. 
                # If there are more zero values (sparse data), scale the loss to account for this sparsity.
                mean_corrector = total_movies/float(torch.sum(target.data > 0) + 1e-10)
                loss.backward()
                train_loss += np.sqrt(loss.item()*mean_corrector)
                # print("train_loss: ", train_loss)
                s += 1.
                # print("s: ", s)
                # sae.fc4.weight.data = sae.fc1.weight.data.t().clone()
                # sae.fc3.weight.data = sae.fc2.weight.data.t().clone()

                optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))

  input = torch.tensor(batch, dtype=torch.float)


epoch: 1 loss: 0.18264053018676424
epoch: 2 loss: 0.1822651043455633
epoch: 3 loss: 0.18168078380021177
epoch: 4 loss: 0.1809010857604327
epoch: 5 loss: 0.17958289038883324
epoch: 6 loss: 0.17721707305918702
epoch: 7 loss: 0.17293273720090768
epoch: 8 loss: 0.1653858336248635
epoch: 9 loss: 0.15347730783012212
epoch: 10 loss: 0.1400416936687967
epoch: 11 loss: 0.13686165614553858
epoch: 12 loss: 0.1325020398697864
epoch: 13 loss: 0.11121897541559107
epoch: 14 loss: 0.10917686787532886
epoch: 15 loss: 0.1114080721665828
epoch: 16 loss: 0.10516337917747083
epoch: 17 loss: 0.09710179696455688
epoch: 18 loss: 0.09601502797578128


In [9]:
import pickle
filename = 'trained_model.sav'
pickle.dump(sae,open(filename, 'wb'))
loaded_sae = pickle.load(open(filename, 'rb'))


In [12]:
test_loss = 0
s = 0.
# need to use train_data in above too 
# display(test_data)
unique_test_user_ids = test_data['userId'].unique()
rms_avg = []
counter = 0
sae.eval()
for id_user in unique_test_user_ids:
    test_ratings_array = np.zeros(num_movies) # initialize zero array with number of movies length
    test_user_ratings = test_data[(test_data['userId'] == id_user)] # get this user's df section only
    test_movie_ids = test_user_ratings['movieId'].values # get all the movies rated by this user 
    test_user_ratings_array = test_user_ratings['rating'].values # get all the ratings for those movies 
    test_ratings_array[test_movie_ids - 1] = test_user_ratings_array # place rating
    nonzero_indices = np.nonzero(test_ratings_array)
    # print("i count is: ", counter)
    counter+=1

    # print("nonzero indices: ", nonzero_indices)
    zeroed_ratings = []
    zeroed_ratings_indices = []
    for i in range(0, len(nonzero_indices[0]), 8): 
        zeroed_ratings.append(test_ratings_array[nonzero_indices[0][i]])
        zeroed_ratings_indices.append(nonzero_indices[0][i])
        test_ratings_array[nonzero_indices[0][i]] = 0
    # print("zeroed ratings indices", zeroed_ratings_indices)

    # print("zeroed ratings", zeroed_ratings)
    input = torch.tensor(test_ratings_array, dtype=torch.float)
    # print(input)
    target = input.clone()
    if torch.sum(target.data > 0) > 0:
        input.require_grad = False 
        output = sae(input)
        target.require_grad = False
        output_np = output.detach().numpy()
        predicted_ratings = output_np[zeroed_ratings_indices]
        predicted_capped_ratings = torch.clamp(torch.from_numpy(predicted_ratings), min=1, max=5)
        predicted_capped_ratings_np = predicted_capped_ratings.numpy()
        # print("predicted ratings: ", predicted_capped_ratings)
        rms = np.sqrt(np.sum((predicted_ratings - zeroed_ratings)**2)/len(predicted_ratings))
        # print("rms: ", rms)
        rms_avg.append(rms)
        # output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = total_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.item()*mean_corrector)
        s += 1.
    if counter == 50: 
        # print("in here")
        print("rms_avg: ", np.mean(rms_avg))
        counter = 0
        rms_avg = []
# print('test loss: '+str(test_loss/s))
print("i count: ", counter)
print("final avg rms: ", np.mean(rms))

rms_avg:  1.7112779033636827
rms_avg:  1.9375000917571854
rms_avg:  1.9494997627058108
rms_avg:  1.8918221925898926
rms_avg:  1.9480376375327375
rms_avg:  1.814888585646851
rms_avg:  1.9340226011454542
rms_avg:  1.8730611668334298
rms_avg:  1.8311533777787121
rms_avg:  1.7911163127693954
rms_avg:  1.8825558623030885
rms_avg:  1.9903816031045514
rms_avg:  1.8944121067091002
rms_avg:  1.7190404137391422
rms_avg:  1.9416817506153397
rms_avg:  1.8195304710849192
rms_avg:  1.7765829071851926
rms_avg:  1.6512702771993388
rms_avg:  1.7300126067935122
rms_avg:  1.8854929254388892
rms_avg:  1.9037463854661127
rms_avg:  1.7573743800068466
rms_avg:  1.7518519986732217
rms_avg:  1.8231412462294574
rms_avg:  1.687636913540427
rms_avg:  1.5731737438553384
rms_avg:  1.7511528555825635
rms_avg:  1.8021028061665836
rms_avg:  1.5586577505042698
rms_avg:  1.692544317296755
rms_avg:  1.6767507612044668
rms_avg:  1.6574508355639166
rms_avg:  1.8224929290713874
rms_avg:  1.8380648645357305
rms_avg:  1.68114