# Auto Encoders

In [1]:
from os.path import dirname, abspath, join, curdir

import numpy as np
import pandas as pd
import torch

from torch.nn import MSELoss
from torch.optim import RMSprop
from torch.autograd import Variable
from torch import FloatTensor

In [2]:
# Import the dataset
datapath = join(dirname(dirname(abspath(curdir))), "data", "raw", "rbm")

movies = pd.read_csv(join(datapath, "movielens-1m", "movies.dat"),
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1")

users = pd.read_csv(join(datapath, "movielens-1m", "users.dat"),
                    sep="::",
                    header=None,
                    engine="python",
                    encoding="latin-1")

ratings = pd.read_csv(join(datapath, "movielens-1m", "ratings.dat"),
                      sep="::",
                      header=None,
                      engine="python",
                      encoding="latin-1")

movies.shape, users.shape, ratings.shape

((3883, 3), (6040, 5), (1000209, 4))

In [3]:
# Prepare training and test sets
train_df = pd.read_csv(join(datapath, "movielens-100k", "u1.base"),
                        delimiter="\t",
                        header=None)

train_set = np.array(train_df, dtype="int")

test_df = pd.read_csv(join(datapath, "movielens-100k", "u1.test"),
                        delimiter="\t",
                        header=None)

test_set = np.array(test_df, dtype="int")

train_set.shape, test_set.shape

((80000, 4), (20000, 4))

In [4]:
# Create matrices of total number of users and movies for bi-fold cross validation
# The max user/movie ID may be present in the training or test data
nb_users = int(max(max(train_set[:, 0]), max(test_set[:, 0])))
nb_movies = int(max(max(train_set[:, 1]), max(test_set[:, 1])))

nb_users, nb_movies

(943, 1682)

In [5]:
def convert(data: np.ndarray) -> list:
    """Convert data into a matrix like structure.

    Args:
    ----
    data : np.ndarray
        The data to transform
    size : int
        The total number of items in the overall dataset

    Returns:
    -------
    list
        The data transformed
    """
    new_data = []

    for user_id in range(1, nb_users + 1):
        # Get user movies and ratings
        movie_ids = data[:, 1][data[:, 0] == user_id]
        rating_ids = data[:, 2][data[:, 0] == user_id]

        # Get all list of movie ratings by user, unrated movies = 0
        ratings = np.zeros(nb_movies)
        ratings[movie_ids - 1] = rating_ids # movie_ids starts at 1
        new_data.append(list(ratings))

    return new_data

In [6]:
train_set_converted = convert(train_set)
test_set_converted = convert(test_set)

In [7]:
# Convert data into tensors
train_set_ft = FloatTensor(train_set_converted)
test_set_ft = FloatTensor(test_set_converted)

In [15]:
train_set_ft.shape

torch.Size([943, 1682])

In [14]:
test_set_ft.shape

torch.Size([943, 1682])

In [9]:
# Initialize stacked AutoEncoder instance
from ae import SAE

sae = SAE(nb_movies)

In [10]:
# Define criteria for loss function
criterion = MSELoss()

# Initialize optimizer
optimizer = RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

In [11]:
# Train the SAE model
epochs = 200
bias = 1e-10

for epoch in range(epochs):
    train_loss = 0
    s = 0.

    for uid in range(nb_users):
        # Get user ratings
        input_vect = Variable(train_set_ft[uid]).unsqueeze(0)
        target_vect = input_vect.clone()

        # Skip any user with zero ratings
        if torch.sum(target_vect.data > 0) > 0:
            # Get vector of predicted ratings
            output_vect = sae(input_vect)

            # Don't compute gradient with respect to the targets (memory saver)
            target_vect.requires_grad = False

            # Freeze features where there was no rating by the user
            output_vect[target_vect == 0] = 0

            # Compute loss
            loss = criterion(output_vect, target_vect)

            # Compute average of error where there was a rating
            mean_corrector = nb_movies/float(torch.sum(target_vect.data > 0) + bias)

            # Get direction of gradient to update weights
            loss.backward()

            # Compute RMSE and update train_loss
            train_loss += np.sqrt(loss.data * mean_corrector)
            s += 1.

            # Get intensity of weight updates
            optimizer.step()
    
    print(f"Epoch: {epoch}, Train_Loss: {train_loss/s}")

Epoch: 0, Train_Loss: 1.771719217300415
Epoch: 1, Train_Loss: 1.0967650413513184
Epoch: 2, Train_Loss: 1.053282380104065
Epoch: 3, Train_Loss: 1.0383638143539429
Epoch: 4, Train_Loss: 1.0307718515396118
Epoch: 5, Train_Loss: 1.026605248451233
Epoch: 6, Train_Loss: 1.0237427949905396
Epoch: 7, Train_Loss: 1.0219579935073853
Epoch: 8, Train_Loss: 1.0207545757293701
Epoch: 9, Train_Loss: 1.0195647478103638
Epoch: 10, Train_Loss: 1.0189452171325684
Epoch: 11, Train_Loss: 1.0184614658355713
Epoch: 12, Train_Loss: 1.01790452003479
Epoch: 13, Train_Loss: 1.017511248588562
Epoch: 14, Train_Loss: 1.0172500610351562
Epoch: 15, Train_Loss: 1.0167913436889648
Epoch: 16, Train_Loss: 1.0165694952011108
Epoch: 17, Train_Loss: 1.016481637954712
Epoch: 18, Train_Loss: 1.0163096189498901
Epoch: 19, Train_Loss: 1.0160201787948608
Epoch: 20, Train_Loss: 1.0161055326461792
Epoch: 21, Train_Loss: 1.0157771110534668
Epoch: 22, Train_Loss: 1.0157653093338013
Epoch: 23, Train_Loss: 1.0157520771026611
Epoch: 24

In [16]:
# Evaluate the model using the test set
test_loss = 0
s = 0.

for uid in range(nb_users):
    input_vect = Variable(train_set_ft[uid]).unsqueeze(0)
    target_vect = Variable(test_set_ft[uid]).unsqueeze(0)

    if torch.sum(target_vect.data > 0) > 0:
        output_vect = sae(input_vect)
        target_vect.requires_grad = False
        output_vect[target_vect == 0] = 0
        loss = criterion(output_vect, target_vect)
        mean_corrector = nb_movies/float(torch.sum(target_vect.data > 0) + bias)
        test_loss += np.sqrt(loss.data * mean_corrector)
        s += 1.

print(f"Test_Loss: {test_loss/s}")

Test_Loss: 0.9517191052436829
