# Auto Encoders

In [1]:
from os.path import dirname, abspath, join, curdir

import numpy as np
import pandas as pd

from torch.nn import MSELoss
from torch.optim import RMSprop
from torch import FloatTensor, mean, abs

In [2]:
# Load the three MovieLens-1M tables. The files are "::"-delimited, have no
# header row and are read as latin-1 text; the multi-character separator is
# why the Python parsing engine is requested.
datapath = join(dirname(dirname(abspath(curdir))), "data", "raw", "rbm")

movies, users, ratings = (
    pd.read_csv(
        join(datapath, "movielens-1m", dat_file),
        sep="::",
        header=None,
        engine="python",
        encoding="latin-1",
    )
    for dat_file in ("movies.dat", "users.dat", "ratings.dat")
)

movies.shape, users.shape, ratings.shape

((3883, 3), (6040, 5), (1000209, 4))

In [3]:
# Read the MovieLens-100K "u1" split (tab-separated, no header row) ...
train_df, test_df = [
    pd.read_csv(join(datapath, "movielens-100k", split_file), sep="\t", header=None)
    for split_file in ("u1.base", "u1.test")
]

# ... and turn each frame into an integer NumPy array
train_set = np.array(train_df, dtype="int")
test_set = np.array(test_df, dtype="int")

train_set.shape, test_set.shape

((80000, 4), (20000, 4))

In [4]:
# Total number of users and movies, taken as the largest ID found in either
# split: the highest user/movie ID may appear only in the training data or
# only in the test data, so both arrays are inspected.
# Column 0 is read as the user ID, column 1 as the movie ID.
nb_users = int(max(train_set[:, 0].max(), test_set[:, 0].max()))
nb_movies = int(max(train_set[:, 1].max(), test_set[:, 1].max()))

nb_users, nb_movies

(943, 1682)

In [5]:
def convert(data: np.ndarray, n_users=None, n_movies=None) -> list:
    """Convert rating records into one dense list of ratings per user.

    Args:
    ----
    data : np.ndarray
        2-D integer array whose first three columns are user ID, movie ID
        and rating. User and movie IDs are 1-based.
    n_users : int, optional
        Number of users, i.e. number of rows to produce. Defaults to the
        notebook-level ``nb_users``.
    n_movies : int, optional
        Number of movies, i.e. length of every row. Defaults to the
        notebook-level ``nb_movies``.

    Returns:
    -------
    list
        ``n_users`` lists of ``n_movies`` floats. Entry ``[u - 1][m - 1]``
        holds the rating user ``u`` gave movie ``m``, or -1 when ``data``
        contains no rating for that pair.
    """
    # Fall back to the notebook globals so existing ``convert(data)`` calls
    # keep behaving exactly as before.
    if n_users is None:
        n_users = nb_users
    if n_movies is None:
        n_movies = nb_movies

    new_data = []

    for user_id in range(1, n_users + 1):
        # Build the row mask once and reuse it for both columns
        is_user = data[:, 0] == user_id
        movie_ids = data[is_user, 1]
        given_ratings = data[is_user, 2]

        # Start from "unrated" (-1) everywhere, then fill in the known ratings
        user_ratings = -np.ones(n_movies)
        user_ratings[movie_ids - 1] = given_ratings  # movie IDs start at 1
        new_data.append(list(user_ratings))

    return new_data

In [6]:
# Build the per-user rating lists for both splits
train_set_converted, test_set_converted = map(convert, (train_set, test_set))

# Preview the first five users of the converted training set
train_set_converted[:5]

[[5.0,
  3.0,
  4.0,
  3.0,
  3.0,
  -1.0,
  4.0,
  1.0,
  5.0,
  -1.0,
  2.0,
  -1.0,
  5.0,
  -1.0,
  5.0,
  5.0,
  -1.0,
  4.0,
  5.0,
  -1.0,
  1.0,
  4.0,
  -1.0,
  -1.0,
  4.0,
  3.0,
  -1.0,
  4.0,
  1.0,
  3.0,
  -1.0,
  5.0,
  -1.0,
  2.0,
  1.0,
  -1.0,
  2.0,
  3.0,
  -1.0,
  3.0,
  2.0,
  5.0,
  4.0,
  -1.0,
  5.0,
  4.0,
  -1.0,
  5.0,
  -1.0,
  5.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  5.0,
  -1.0,
  5.0,
  4.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  2.0,
  -1.0,
  -1.0,
  4.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  -1.0,
  -1.0,
  4.0,
  -1.0,
  4.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  4.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  2.0,
  4.0,
  -1.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  2.0,
  -1.0,
  -1.0,
  -1.0,
  2.0,
  4.0,
  -1.0,
  -1.0,
  5.0,
  1.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  3.0,
  -1.0,
  -1.0,
  5.0,
  -1.0,
  -1.0,
  3.0,
  4.0,
  5.0,
  -1.0,
  2.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  1.0,
  -1.0,
  4.0,
  -1.0

In [7]:
# Preview the first five users' rows of the converted test set (-1 marks an unrated movie)
test_set_converted[:5]

[[-1.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  5.0,
  -1.0,
  5.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  4.0,
  3.0,
  -1.0,
  -1.0,
  2.0,
  -1.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  2.0,
  -1.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  -1.0,
  -1.0,
  4.0,
  -1.0,
  3.0,
  -1.0,
  4.0,
  -1.0,
  3.0,
  3.0,
  -1.0,
  4.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  4.0,
  3.0,
  -1.0,
  5.0,
  4.0,
  -1.0,
  3.0,
  -1.0,
  3.0,
  3.0,
  -1.0,
  4.0,
  3.0,
  1.0,
  -1.0,
  4.0,
  -1.0,
  1.0,
  -1.0,
  4.0,
  5.0,
  5.0,
  -1.0,
  4.0,
  3.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  4.0,
  5.0,
  3.0,
  -1.0,
  -1.0,
  -1.0,
  5.0,
  3.0,
  4.0,
  -1.0,
  5.0,
  -1.0,
  2.0,
  1.0,
  1.0,
  -1.0,
  -1.0,
  4.0,
  5.0,
  -1.0,
  -1.0,
  -1.0,
  1.0,
  5.0,
  5.0,
  -1.0,
  -1.0,
  3.0,
  3.0,
  -1.0,
  1.0,
  4.0,
  -1.0,
  -1.0,
  -1.0,
  3.0,
  -1.0,
  -1.0,
  4.0,
  5.0,
  3.0,
  -1.0,
  4.0,
  -1.0,

In [8]:
# Wrap both converted rating lists in torch FloatTensors
train_set_ft, test_set_ft = (
    FloatTensor(rating_rows)
    for rating_rows in (train_set_converted, test_set_converted)
)

In [9]:
# Initialize stacked AutoEncoder instance
# SAE is imported from the project-local `ae` module, which is not shown in this notebook.
# NOTE(review): presumably the argument sets the model's input width to one unit
# per movie — confirm against the SAE constructor.
from ae import SAE

sae = SAE(nb_movies)

In [None]:
# Loss function: mean squared error
criterion = MSELoss()

# Optimizer: RMSprop over the parameters of the `sae` instance.
# `parameters()` is an instance method, so it must be called on the model
# object created earlier; calling it on the SAE class itself raises a
# TypeError because no `self` is supplied.
optimizer = RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)