**Config**

In [None]:
# When True, only the first 20000 rows of each input CSV are used;
# when False (the current setting), the full dataset is used
use_shorter_dataset = False

# Number of epochs to train the autoencoder for - around 30 seems to be ideal for poetry
# Some experimentation might be needed with other datasets, see a plot of the training below
# NOTE(review): the value below (75) differs from the ~30 suggested above - presumably tuned for another dataset; confirm
epochs = 75

# Specify the genre to use - this is used to construct the input and output CSV filenames
genre = "kaggle"

# Size of the embedding outputted by the encoder (string part of the embedding)
# The int part (4 numeric columns + the normalized book id) is concatenated to this
# embedding, so the final size will be string_embedding_size + 5
string_embedding_size = 251

# Name of the text column of the string metadata CSV that is fed to the TF-IDF vectorizer
embedding_field_name = "embedding"

**String embeddings**

In [None]:
import os
import time
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn

from google.colab import drive

Load data

In [None]:
# Mount Google Drive (forcing a remount) so the shared data folder is reachable
drive.mount("/content/drive", force_remount=True)
drive_data_path = "/content/drive/Shareddrives/KNN-Recommenders/data/"

# Text metadata of the books for the configured genre
data_string = pd.read_csv(
    drive_data_path + "book_metadata_string_" + genre + ".csv"
)

# Optionally keep only the first 20000 rows
if use_shorter_dataset:
    data_string = data_string.iloc[:20000]

data_string.head()

Create the TF-IDF vectorizer and extract features for the 10000 most frequent words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF over the text column: English stop words are removed, a term must
# occur in at least a 0.0001 fraction of the documents, and the vocabulary is capped
# at 10000 terms
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=0.0001,
    max_features=10000,
)
tfidf_matrix = tfidf.fit_transform(data_string[embedding_field_name])

# Convert the sparse matrix to a dense DataFrame that keeps the row labels of data_string
tfidf_data = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=data_string.index.tolist(),
)

print(tfidf_data.shape)
tfidf_data.head(3)

Autoencoder. Source: https://github.com/alineberry/my-movie-recommender

In [None]:
import torch
import numpy as np
import pandas as pd
import pickle
import multiprocessing
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


def get_cv_idxs(n, cv_idx=0, val_pct=0.2, seed=42):
    """Pick the indexes that form one validation fold of a dataset.

    The numbers 0..n-1 are shuffled reproducibly (the global numpy RNG is
    seeded with `seed`) and a contiguous window of the shuffled order is
    returned.

    Arguments:
        n : int, total number of elements in the data set.
        cv_idx : int, which fold to return; the window starts at
            cv_idx * int(val_pct * n) in the shuffled order.
        val_pct : (int, float), fraction of the data set placed in a fold.
        seed : seed passed to np.random.seed before shuffling.
    Returns:
        numpy array with the selected indexes (int(val_pct * n) of them,
        fewer if the window runs past the end).
    """
    np.random.seed(seed)
    fold_size = int(val_pct * n)
    shuffled = np.random.permutation(n)
    first = cv_idx * fold_size
    last = first + fold_size
    return shuffled[first:last]


def split_by_idx(idxs, *a):
    """
    Split each array passed as *a into a pair (elements selected by idxs, the remaining elements).
    This can be used to split multiple arrays containing training data to validation and training set.
    :param idxs [int]: list of indexes selected; may be empty, in which case
            nothing is selected and every element ends up in the "remaining" part
    :param a list: list of np.array (or other objects supporting boolean-mask indexing),
            each array should have same amount of elements in the first dimension
    :return: list of tuples, each containing a split of corresponding array from *a.
            First element of each tuple is an array composed from elements selected by idxs,
            second element is an array of remaining elements.
    """
    selected = np.array(idxs)
    if selected.size == 0:
        # np.array([]) defaults to float64, which numpy refuses to use as an
        # index; an empty integer array selects nothing, as intended
        selected = selected.astype(np.intp)

    # Boolean mask over the first dimension: True for selected positions
    mask = np.zeros(len(a[0]), dtype=bool)
    mask[selected] = True
    return [(o[mask], o[~mask]) for o in a]


class AutoEncoder(object):
    """Trains an encoder/decoder pair to reconstruct the rows of a DataFrame.

    The rows of `data` are split into a training part (served in shuffled
    batches of 64 by a DataLoader) and a validation part (kept as one tensor).
    The networks run on the GPU when one is available and on the CPU otherwise.

    Arguments:
        data : pandas DataFrame, one row per item and one column per feature.
        validation_perc : fraction of the rows held out for validation.
        lr : learning rate of both Adam optimizers.
        intermediate_size : width of the hidden layer of encoder and decoder.
        encoded_size : width of the encoding produced by the encoder.
    """

    def __init__(self, data, validation_perc=0.2, lr=0.001,
                 intermediate_size=1000, encoded_size=100):

        # use the GPU when present, otherwise fall back to the CPU instead of
        # failing on the first .cuda() call
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # create training dataloader and validation tensor
        self.data = data
        self.val_idxs = get_cv_idxs(n=data.shape[0], val_pct=validation_perc)
        [(self.val, self.train)] = split_by_idx(self.val_idxs, data)
        self.dataset = AETrainingData(self.train)
        self.dataloader = DataLoader(self.dataset, batch_size=64, shuffle=True,
                                     num_workers=multiprocessing.cpu_count())
        self.val = torch.from_numpy(self.val.values).\
            type(torch.FloatTensor).to(self.device)

        # instantiate the encoder and decoder nets
        size = data.shape[1]
        self.encoder = Encoder(
            size, intermediate_size, encoded_size).to(self.device)
        self.decoder = Decoder(
            size, intermediate_size, encoded_size).to(self.device)

        # instantiate the optimizers
        self.encoder_optimizer = optim.Adam(
            self.encoder.parameters(), lr=lr, weight_decay=1e-8)
        self.decoder_optimizer = optim.Adam(
            self.decoder.parameters(), lr=lr, weight_decay=1e-8)

        # instantiate the loss criterion
        self.criterion = nn.MSELoss(reduction='mean')

        # loss history, appended to by train_loop
        self.train_losses = []
        self.val_losses = []

    def train_step(self, input_tensor, target_tensor):
        """Run one optimisation step on a batch and return its loss as a float."""
        # clear the gradients in the optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Forward pass through
        encoded_representation = self.encoder(input_tensor)
        reconstruction = self.decoder(encoded_representation)

        # Compute the loss
        loss = self.criterion(reconstruction, target_tensor)

        # Compute the gradients
        loss.backward()

        # Step the optimizers to update the model weights
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        # Return the loss value to track training progress
        return loss.item()

    def reset(self, train=True):
        """Switch both networks to training mode (train=True) or evaluation mode."""
        # due to dropout the network behaves differently in training and
        # evaluation modes
        if train:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()

    def get_val_loss(self, input_tensor, target_tensor):
        """Return the reconstruction loss of `input_tensor` against `target_tensor`.

        The networks are left in evaluation mode afterwards.
        """
        self.reset(train=False)
        # no gradients are needed here; skipping the autograd graph saves memory
        with torch.no_grad():
            encoded = self.encoder(input_tensor)
            decoded = self.decoder(encoded)
            loss = self.criterion(decoded, target_tensor)
        return loss.item()

    def train_loop(self, epochs, print_every_n_batches=100):
        """Train for `epochs` passes over the training data.

        Every `print_every_n_batches` batches (except batch 0) the current
        training loss and the validation loss are printed and appended to
        self.train_losses / self.val_losses.
        """

        # Cycle through epochs
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')

            # Cycle through batches
            for i, batch in enumerate(self.dataloader):

                # get_val_loss leaves the nets in evaluation mode, so switch
                # back to training mode before every step
                self.reset(train=True)

                input_tensor = batch['input'].to(self.device)
                target_tensor = batch['target'].to(self.device)

                loss = self.train_step(input_tensor, target_tensor)

                if i % print_every_n_batches == 0 and i != 0:
                    val_loss = self.get_val_loss(self.val, self.val)
                    print(f'train loss: {round(loss, 8)} | ' +
                          f'validation loss: {round(val_loss, 8)}')
                    self.train_losses.append(loss)
                    self.val_losses.append(val_loss)

    def get_encoded_representations(self):
        """Encode every row of self.data and return the encodings as a numpy array."""
        to_encode = torch.from_numpy(self.data.values).type(
            torch.FloatTensor).to(self.device)
        self.reset(train=False)
        # inference only: no autograd graph is built for the whole dataset
        with torch.no_grad():
            encodings = self.encoder(to_encode).cpu().numpy()
        return encodings


class AETrainingData(Dataset):
    """
    Dataset wrapper that serves the rows of a dataframe to the auto encoder.
    Each item is the same row twice: once as the input and once as the
    reconstruction target, both as float PyTorch tensors.
    """

    def __init__(self, x_train):
        self.x = x_train

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """
        Returns the example at position idx as a dict of pytorch tensors.
        """
        # The row at idx as a numpy array; it is both the input and the target
        row = self.x.iloc[idx].values

        # Build one float tensor per role
        sample = {}
        for role in ('input', 'target'):
            sample[role] = torch.from_numpy(row).float()
        return sample


class Encoder(nn.Module):
    """Two-stage network mapping input_size features to encoding_size values.

    Each stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.2); the first
    stage outputs intermediate_size features, the second encoding_size.
    """

    def __init__(self, input_size, intermediate_size, encoding_size):
        super().__init__()
        stages = [
            # input -> hidden
            nn.Linear(input_size, intermediate_size),
            nn.BatchNorm1d(intermediate_size),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            # hidden -> encoding
            nn.Linear(intermediate_size, encoding_size),
            nn.BatchNorm1d(encoding_size),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
        ]
        self.encoder = nn.Sequential(*stages)

    def forward(self, x):
        return self.encoder(x)


class Decoder(nn.Module):
    """Network mapping an encoding back to output_size features.

    Layout: Linear -> BatchNorm1d -> ReLU -> Dropout(0.2) up to
    intermediate_size, then Linear -> BatchNorm1d -> Sigmoid to output_size.
    """

    def __init__(self, output_size, intermediate_size, encoding_size):
        super().__init__()
        stages = [
            # encoding -> hidden
            nn.Linear(encoding_size, intermediate_size),
            nn.BatchNorm1d(intermediate_size),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            # hidden -> reconstruction
            nn.Linear(intermediate_size, output_size),
            nn.BatchNorm1d(output_size),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*stages)

    def forward(self, x):
        return self.decoder(x)

Train the autoencoder and plot the training and validation loss. The size of the final encoding is given by the "encoded_size" argument, which is set from "string_embedding_size" in the Config section

In [None]:
# Build the autoencoder on the TF-IDF features; 10 % of the rows are held out for validation
ae = AutoEncoder(
    tfidf_data,
    validation_perc=0.1,
    lr=1e-3,
    intermediate_size=5000,
    encoded_size=string_embedding_size,
)

In [None]:
# Losses are recorded every `print_every` batches (100 is also train_loop's default)
print_every = 100
ae.train_loop(epochs=epochs, print_every_n_batches=print_every)

# One row per recorded (train loss, validation loss) pair, in recording order
losses = pd.DataFrame(data=list(zip(ae.train_losses, ae.val_losses)), columns=['train_loss', 'validation_loss'])

# train_loop records a pair at batch indexes print_every, 2 * print_every, ... of every
# epoch, so the number of records per epoch follows from the number of batches rather
# than being a fixed 3 (which only holds for one particular dataset size)
records_per_epoch = max((len(ae.dataloader) - 1) // print_every, 1)
losses['epoch'] = (losses.index + 1) / records_per_epoch

fig, ax = plt.subplots()
# The labels are required for ax.legend() to have something to show
ax.plot(losses['epoch'], losses['train_loss'], label='train loss')
ax.plot(losses['epoch'], losses['validation_loss'], label='validation loss')
ax.set_ylabel('MSE loss')
ax.set_xlabel('epoch')
ax.set_title('autoencoder loss over time')
ax.legend();

In [None]:
# Encode every row of the TF-IDF data with the trained encoder (returns a numpy array)
encoded = ae.get_encoded_representations()
# Print the shape of the array, then show the array itself as the cell output
print(encoded.shape)
encoded

Code for checking similarity between books. Useful for checking that the embeddings make sense. Source: https://github.com/alineberry/my-movie-recommender

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd

class SimilarityPredictions(object):
    '''Pairwise similarity model built from latent item embeddings.
    On construction a square item-by-item matrix is computed with the chosen
    metric; predict_similar_items then looks up the closest items of a seed.
    Input: embeddings - a pandas dataframe of items (rows) and latent dimensions (columns).
            similarity_metric - str, either 'cosine' or 'euclidean'.'''

    def __init__(self, embeddings, similarity_metric='cosine'):
        assert similarity_metric in ['cosine', 'euclidean'], "unsupported similarity metric."
        self.embeddings = embeddings
        self.ids = embeddings.index.tolist()
        self.similarity_metric = similarity_metric
        # pick the matrix builder that matches the metric
        builders = {
            'cosine': self.calculate_cosine_similarity_matrix,
            'euclidean': self.calculate_euclidean_distances_matrix,
        }
        build_matrix = builders.get(similarity_metric)
        if build_matrix is not None:
            self.similarity_matrix = build_matrix()

    def calculate_cosine_similarity_matrix(self):
        '''Returns the item-by-item cosine similarity matrix of the embeddings'''
        pairwise = cosine_similarity(X=self.embeddings)
        return pd.DataFrame(pairwise, index=self.ids, columns=self.ids)

    def calculate_euclidean_distances_matrix(self):
        '''Returns the item-by-item euclidean distance matrix of the embeddings'''
        pairwise = euclidean_distances(X=self.embeddings)
        return pd.DataFrame(pairwise, index=self.ids, columns=self.ids)

    def predict_similar_items(self, seed_item, n):
        '''Returns the n items closest to seed_item as a dict of columns
        ("item_id" and "similarity_score"). Cosine scores are ranked from
        highest to lowest, euclidean distances from lowest to highest.'''
        ranking = pd.DataFrame(self.similarity_matrix.loc[seed_item])
        ranking.columns = ["similarity_score"]

        # ascending order for distances, descending order for similarities
        ascending_by_metric = {'cosine': False, 'euclidean': True}
        if self.similarity_metric in ascending_by_metric:
            ranking = ranking.sort_values(
                'similarity_score',
                ascending=ascending_by_metric[self.similarity_metric])

        top = ranking.head(n).reset_index()
        top = top.rename(index=str, columns={"index": "item_id"})
        return top.to_dict()

This code can be used to check similar books.
It consumes a lot of RAM, so it is commented out for now

In [None]:
#content_embeddings = pd.DataFrame(encoded)
#sim_model_cont = SimilarityPredictions(content_embeddings, similarity_metric="cosine")
#cont_output = sim_model_cont.predict_similar_items(seed_item=4141, n=26744)
#similar_movies = pd.DataFrame(cont_output)
#similar_movies.set_index('item_id', inplace=True)
#sim_df_cont = similar_movies
#sim_df_cont.sort_values('similarity_score', ascending=False, inplace=True)
#sim_df_cont = sim_df_cont.rename(index=str, columns={"similarity_score": "content_similarity_score"})
#sim_df_cont.head()

In [None]:
#print(data_string["aggregated_string"][16146])

Drop the text data from the data_string data frame and create a new dataframe containing the generated string embedding

In [None]:
# Rebind instead of dropping in place: the cell can then be re-run without a KeyError
# (errors="ignore" skips an already removed column) and a sliced data_string is not mutated
data_string = data_string.drop(columns=[embedding_field_name], errors="ignore")

In [None]:
# pd.concat(axis=1) aligns rows by index label, so give the embedding frame the same
# index as data_string; row i of `encoded` then always lands next to row i of data_string,
# even if data_string does not carry a default 0..n-1 index
data_string_new = pd.concat(
    [data_string, pd.DataFrame(encoded, index=data_string.index)],
    axis=1,
)
data_string_new

**Numbers**

Read the file containing numeric book data

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Numeric metadata of the books for the configured genre
data_int = pd.read_csv(
    drive_data_path + "book_metadata_int_" + genre + ".csv"
)

# Optionally keep only the first 20000 rows
if use_shorter_dataset:
    data_int = data_int.iloc[:20000]

# Number of distinct book ids in the numeric data
num_books = len(data_int["book_id"].unique())
data_int.head()

Rename the columns so they can be merged with the string embeddings

In [None]:
hight, width = data_string_new.shape

# The numeric columns get consecutive integer labels starting at width - 1
# NOTE(review): this continues the embedding's 0..N numbering only if data_string_new
# has exactly one non-embedding column (presumably book_id) - confirm
int_feature_names = ("ratings_count", "num_pages", "average_rating", "text_reviews_count")
col_names = {
    name: width - 1 + position
    for position, name in enumerate(int_feature_names)
}
data_int.rename(columns=col_names, inplace=True)
data_int

Normalize the int data with min-max scaling, so that the values lie inside <0; string_embedding_max>, where string_embedding_max is the largest value found in the string embedding columns

In [None]:
def normalize(value, min, max, new_max):
    """Min-max scale `value` from the range [min, max] to the range [0, new_max].

    Arguments:
        value : number or pandas Series / numpy array of numbers to scale
        min : lower bound of the original range (mapped to 0)
        max : upper bound of the original range (mapped to new_max)
        new_max : upper bound of the target range
    Returns:
        the scaled value(s), same shape as `value`. When max equals min the
        range is degenerate and zeros are returned instead of dividing by zero.
    """
    span = max - min
    if span == 0:
        # constant input: every value sits on the lower bound, so map it to 0
        # (multiplying keeps the shape of `value` and any NaN in it)
        return (value - min) * 0.0
    return ((value - min) / span) * new_max

In [None]:
# Largest value of every string embedding column (columns 0 .. string_embedding_size - 1)
maxes = [data_string_new[column].max() for column in range(0, string_embedding_size)]
data_string_max = max(maxes)

# Rescale each renamed numeric column to the range 0 .. data_string_max
for i in col_names.values():
    column_values = data_int[i]
    data_int[i] = normalize(column_values, column_values.min(), column_values.max(), data_string_max)
data_int

Merge the created int embedding into string embedding data frame created previously

In [None]:
# Join the string embedding with the normalized numeric columns on the shared book_id key
merged_df = data_string_new.merge(data_int, on="book_id")

Add normalized ids at the end

In [None]:
# Last embedding column: the book id rescaled to the range 0 .. data_string_max
book_ids = merged_df["book_id"]
merged_df[width + 3] = normalize(book_ids, book_ids.min(), book_ids.max(), data_string_max)
merged_df.head()

Save the embeddings

In [None]:
# File name carries the genre and the total embedding size (string part + 5 numeric columns)
merged_df.to_csv(
    drive_data_path + "book_embedding_" + genre + "_" + str(string_embedding_size + 5) + ".csv",
    index=False,
)