# Import ratings data


In [1]:
import pandas as pd
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

raw_data = pd.read_csv("./data/Movielens100/u.data", sep = None, names=["userId", "movieId", "rating", "timestamp"])
raw_data = raw_data.loc[:, raw_data.columns != "timestamp"]
#make indices start at 0
raw_data["userId"] -= 1
raw_data["movieId"] -= 1
#make ratings center around 0
raw_data["rating"] -= 3

# create (943, 1682) matrix of user ratings per movie
user_ratings = pd.DataFrame(np.zeros((943,1682)))
for i in raw_data.index:
    user_ratings[raw_data["movieId"][i]][raw_data["userId"][i]] = raw_data["rating"][i]
user_ratings = user_ratings.to_numpy() 
user_ratings

  raw_data = pd.read_csv("./data/Movielens100/u.data", sep = None, names=["userId", "movieId", "rating", "timestamp"])


array([[2., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.]])

# Import user information matrix I made

First column is normalized age, second column is sex, the rest are the average ratings per genre. Genres are listed in u.genre file

In [2]:
user_info = pd.read_csv("./data/user_info.csv", index_col=0)
user_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-0.824422,1.0,-0.528577,0.163282,-0.643886,-0.442094,-1.133953,0.682175,-0.240302,-0.297957,...,-0.528577,-0.499749,-0.384440,-0.586232,-0.470922,0.624520,0.682175,0.365074,-0.067338,-0.442094
1,1.554043,0.0,-0.557404,0.549569,-0.003917,-0.419032,-0.557404,1.241427,0.411197,-0.557404,...,-0.557404,-0.142289,-0.557404,-0.557404,-0.280661,1.933286,-0.142289,0.411197,-0.280661,-0.557404
2,-0.906438,1.0,-0.557404,-1.036383,-0.238085,-0.557404,-0.557404,-1.355702,-0.557404,-0.238085,...,-0.557404,-0.717064,-1.036383,-0.876723,-0.238085,-0.238085,-0.876723,-2.154000,-0.717064,-0.557404
3,-0.824422,1.0,-0.557404,1.621949,0.065268,-0.557404,-0.557404,1.933286,1.621949,0.065268,...,-0.557404,-0.557404,-0.246068,0.065268,0.999277,0.687941,0.999277,2.555958,0.376605,-0.557404
4,-0.086278,0.0,-0.506365,-0.149094,-0.149094,0.004022,-1.374024,-0.557404,-0.149094,-0.557404,...,-0.608443,-0.455327,-1.220908,-0.353249,-0.557404,-1.220908,0.310254,-0.608443,-0.404288,-0.608443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,-0.660390,0.0,-0.557404,2.193939,0.166634,-0.412597,-0.412597,2.773170,0.021826,-0.557404,...,-0.412597,-0.557404,-0.557404,-0.412597,-0.412597,2.049132,0.745864,1.325094,0.311441,-0.557404
939,-0.168294,1.0,-0.557404,0.284045,-0.557404,-0.304969,-0.052535,1.293784,0.199900,-0.557404,...,-0.557404,-0.304969,-0.557404,-0.473259,-0.220824,0.536480,-0.473259,0.031610,-0.220824,-0.557404
940,-1.152486,1.0,-0.557404,1.933286,1.310613,0.999277,0.376605,1.933286,-0.557404,-0.557404,...,-0.557404,-0.557404,-0.557404,-0.246068,0.065268,0.065268,1.621949,1.621949,0.065268,-0.557404
941,1.143963,0.0,-0.557404,1.301320,1.208383,0.093149,1.022511,1.673064,-0.557404,-0.557404,...,-0.371532,-0.371532,-0.371532,0.093149,0.279022,1.673064,0.093149,1.301320,1.022511,-0.092723


# Create training and testing data


In [3]:
#this is the final labels as is
labels = user_ratings

#The features will be 80% of the users ratings concatenated with the user_info

# create a mask of 0 and 1 values where half are 0 and 0.8 (Default) are 1. 
#The ratio of masked values is something that can and should be optimized. 
mask_magnitude = 1.3
random_mask = np.clip((np.random.randn(1682) + mask_magnitude).round(), a_max = 1, a_min = 0)

user_ratings *= random_mask
features = np.concatenate((user_info, user_ratings), axis=1)

# Create model and dataset classes

In [4]:
from torch.utils.data import Dataset, DataLoader
from torch import nn 
import torch
from sklearn.metrics import precision_recall_fscore_support
device = "cuda"

class MovielensDataset(Dataset):
    def __init__(self, X, y):
        # input_noise The variance of the noise 
        
        
        self.X = X
        self.y = y
        
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


#noise layers for regularization
class GaussianNoise(nn.Module):
    def __init__(self, stddev):
        super().__init__()
        self.stddev = stddev

    def forward(self, x):
        if self.training:
            return x + torch.autograd.Variable(torch.randn(x.size()).cuda() * self.stddev)
        return x
    

# This is a standard VAE but using MSE as the reconstruction term. 

#Actually just rename this out and make loss function a parameter
class VAE(nn.Module):
    def __init__(self, recon_loss_fcn = "MSE", residual_user_info = False, dropout_rate = 0.0, nonLinearity = "LeakyRelu", fixed_variance = False, deterministicEval = False, noiseLayerStd = 0.2, hidden_size =1024, latent_size = 512):
        #fixed variance is false if regular or equal to the value of the parameter
        super().__init__()
        
        if nonLinearity == "LeakyRelu":
            nonLin = nn.LeakyReLU
        elif nonLinearity == "Relu":
            nonLin = nn.ReLU
        elif nonLinearity == "Tanh":
            nonLin = nn.Tanh
        elif nonLinearity == "Sigmoid":
            nonLin = nn.Sigmoid
        
        self.encoder = nn.Sequential(
            #Encoder
            nn.Linear(1703, hidden_size),
            nonLin(),
            GaussianNoise(noiseLayerStd),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size,hidden_size),
            nonLin(),
            GaussianNoise(noiseLayerStd),
            nn.Dropout(dropout_rate),
        )
        self.decoder = nn.Sequential(
            #Decoder
            nn.Linear(latent_size,hidden_size),
            nonLin(),
            GaussianNoise(noiseLayerStd),
            nn.Dropout(dropout_rate),
        )
        # distribution parameters
        self.fc_mu = nn.Linear(hidden_size, latent_size)
        self.fc_var = nn.Linear(hidden_size, latent_size)
        final_layer_size = hidden_size
        self.residual_user_info = residual_user_info
        if self.residual_user_info == True:
            final_layer_size += 22
        self.final_layer = nn.Linear(final_layer_size,1682)
        self.final_activation = nn.Tanh()
        
        
        self.log_scale = nn.Parameter(torch.Tensor([0.0]))
        #for evaluation purposes
        self.test_mse = 1000
        self.mse_loss_fcn = nn.MSELoss()
        self.recon_loss_fcn = recon_loss_fcn
        
        # for varieties of VAE
        self.fixed_variance = fixed_variance
        self.deterministicEval = deterministicEval
        
    def gaussian_likelihood(self, x_hat, logscale, x):
        scale = torch.exp(logscale)
        mean = x_hat
        dist = torch.distributions.Normal(mean, scale)

        # measure prob of seeing data under p(x|z)
        log_pxz = dist.log_prob(x)
        return log_pxz.sum(dim=1)
    
    def kl_divergence(self, z, mu, std):
        # --------------------------
        # Monte carlo KL divergence
        # --------------------------
        # 1. define the first two probabilities (in this case Normal for both)
        p = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(std))
        q = torch.distributions.Normal(mu, std)

        # 2. get the probabilities from the equation
        log_qzx = q.log_prob(z)
        log_pz = p.log_prob(z)

        # kl
        kl = (log_qzx - log_pz)
        kl = kl.sum(-1)
        return kl
    
    def forward(self, x):
        
        
        
        
        # encode x to get the mu and variance parameters
        x_encoded = self.encoder(x)
        mu, log_var = self.fc_mu(x_encoded), self.fc_var(x_encoded) if self.fixed_variance == False else self.fc_var(x_encoded) / self.fc_var(x_encoded) * self.fixed_variance
        
        # sample z from q
        std = torch.exp(log_var / 2)
        if self.training == False and self.deterministicEval == True:
            std = std * 0
    
        #perform the kernel trick to allow for backprop through sampling
        epsilon = torch.distributions.Normal(0, 1).rsample()
        z = mu + epsilon * std
        # decoded
        if self.residual_user_info == True:
            final_layer_input = torch.concat((self.decoder(z), x[:, :22]), axis = 1)
        else:
            final_layer_input = self.decoder(z)
        ratings = self.final_layer(final_layer_input) 
        ratings = self.final_activation(ratings) * 2
        return ratings, z, mu, std
    
    def vae_loss(self, x_hat, x, z, mu, std):
        # reconstruction loss
        if self.recon_loss_fcn == "MSE":
            recon_loss = self.mse_loss_fcn(x_hat, x) * 10000
        else: 
            recon_loss = -self.gaussian_likelihood(x_hat, self.log_scale, x)
        
        # kl
        if self.training == True and self.fixed_variance == False:
            kl = self.kl_divergence(z, mu, std)
        else:
            kl = 0
        
        # elbo
        elbo = (kl + recon_loss).mean()

        return elbo

def train(dataloader, model,  optimizer, epoch):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X,y) in enumerate(dataloader):
        X, y = X.to(device).to(torch.float32), y.to(device).to(torch.float32)
        
        #compute prediction error
        pred, z, mu, std = model(X)
        loss = model.vae_loss(pred, y, z, mu, std)
        mse_loss = mse_loss_fcn(pred, y)
        
        #Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if epoch % 100 == 0 and batch % 64 == 0:
            loss , current = loss.item(), (batch+1) * len(X)
            #print("Epoch : " + str(epoch))
            #print(f"loss: {loss:>7f}")
            #print(f"MSE loss: {mse_loss:>7f}")
            losses.append(loss)
            mse_losses.append(mse_loss.item())

def test(dataloader, model, epoch):
    size = len(dataloader.dataset)
    num_batches= len(dataloader)
    model.eval()
    test_loss, correct, test_mse_loss = 0,0,0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device).to(torch.float32), y.to(device).to(torch.float32)
            
            
            pred, z, mu, std = model(X)
            test_loss += model.vae_loss(pred, y, z, mu, std).item()
            test_mse_loss += mse_loss_fcn(pred, y)
    test_loss /= num_batches
    test_mse_loss /= num_batches
    if epoch % 100 ==0:
        test_losses.append(test_loss)
        #Calculate RMSE on just the rated ones like the papers do 
        nonzero_indices = y.nonzero().split( 1, dim=1)
        relevant_rmse = torch.sqrt(mse_loss_fcn(y[nonzero_indices], pred[nonzero_indices]))#torch.sqrt((y[nonzero_indices] - pred[nonzero_indices] ** 2).mean())
        
        #print(f"Test Error: \n Avg Loss : {test_loss:>8f} ")
        #print(f" Test MSE loss: {test_mse_loss:>7f}")
        #print("Relevant RMSE loss: " + str(relevant_rmse.item()))
        test_mse_losses.append(test_mse_loss.item())
        model.test_mse = relevant_rmse.item()
    
        
        

# Load model and test performance

In [5]:
from sklearn.utils import shuffle

model = VAE(recon_loss_fcn = "MSE", residual_user_info = True, nonLinearity = "Tanh", dropout_rate = 0.1, fixed_variance = 0.3, deterministicEval = True, noiseLayerStd = 0.0, hidden_size = 4096, latent_size = 512 ).to(device)
model.load_state_dict(torch.load("./models/real_best.pth"))
model.eval()

X, y = shuffle(features, labels, random_state=1)

test_inputs = X[843:]
relevance_labels = y[843:] > 0.5

with torch.no_grad():
    test_preds = model(torch.tensor(test_inputs).to(device).to(torch.float32))[0]
    nonzero_indices = torch.tensor(y[843:]).to("cuda").nonzero().split( 1, dim=1)
    print("RMSE: ")
    print(torch.sqrt(((torch.tensor(y[843:]).to("cuda")[nonzero_indices] - test_preds[nonzero_indices]) ** 2).mean()))
    print("Precision and Recall: ")
    test_pred_labels = test_preds > .4
    print(precision_recall_fscore_support(relevance_labels.flatten(), test_pred_labels.cpu().flatten(), average="binary"))

RMSE: 
tensor(0.8383, device='cuda:0', dtype=torch.float64)
Precision and Recall: 
(0.7470588235294118, 0.7298850574712644, 0.7383720930232558, None)


# Demo Program Setup

In [6]:
def normalize_age(age):
    age -= 34.05196182396607
    age /= 12.1927397
    return age

movie_info = pd.read_csv("./data/Movielens100/u.item", sep = "|", encoding='latin-1', names= ["movie id" , "movie title", "release date", "video release date",
                                                                                        "imdb_url", "unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime",
                                                                                        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", 
                                                                                        "War", "Western"], index_col = 0)
movie_info.index -= 1
movie_genres = movie_info.drop(["movie title", "release date", "video release date", "imdb_url"], axis=1)
movie_genres = movie_genres.to_numpy()


In [7]:
age = input("Enter your age \n")
age = normalize_age(int(age))
print(age)
sex = input("Enter your sex (M/F) \n")
sex = 1 if sex == "M" else 0
print(sex)
ratings = {}
user_ratings = np.zeros(1682)
num_ratings = 0
while(True):
    movie_index = np.random.randint(1682)
    response = input("How do you feel about " + movie_info.loc[movie_index]["movie title"] + " ? (1 to 5, 3 if unfamiliar) \n")
    if (response == "quit"):
        break
    else:
        response = int(response)
        response -= 3
        print(response)
        user_ratings[movie_index] = response
        num_ratings += 1 
user_tastes = np.dot(movie_genres.T, user_ratings) / num_ratings
pd.DataFrame([user_tastes], columns = ["unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime","Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])

Enter your age 
27
-0.5783738517739427
Enter your sex (M/F) 
M
1
How do you feel about Secret of Roan Inish, The (1994) ? (1 to 5, 3 if unfamiliar) 
3
0
How do you feel about Family Thing, A (1996) ? (1 to 5, 3 if unfamiliar) 
3
0
How do you feel about Jupiter's Wife (1994) ? (1 to 5, 3 if unfamiliar) 
3
0
How do you feel about One Night Stand (1997) ? (1 to 5, 3 if unfamiliar) 
5
2
How do you feel about Walking Dead, The (1995) ? (1 to 5, 3 if unfamiliar) 
5
2
How do you feel about Grace of My Heart (1996) ? (1 to 5, 3 if unfamiliar) 
5
2
How do you feel about 20,000 Leagues Under the Sea (1954) ? (1 to 5, 3 if unfamiliar) 
5
2
How do you feel about Family Thing, A (1996) ? (1 to 5, 3 if unfamiliar) 
5
2
How do you feel about Being There (1979) ? (1 to 5, 3 if unfamiliar) 
quit


Unnamed: 0,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.25,0.0,0.25,0.5,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0


In [46]:
user_data = np.array([age, sex])
user_info = np.concatenate((user_data, user_tastes))
demo_features = np.concatenate((user_info, user_ratings))
demo_features = np.array([demo_features])

In [47]:
with torch.no_grad():
    test_preds = model(torch.tensor(demo_features).to(device).to(torch.float32))[0]
test_preds = test_preds.cpu().numpy()[0]
test_preds

array([ 2.26588726e-01, -2.84439206e-01, -1.06312215e-01, ...,
       -3.24324071e-02,  1.38729000e-02,  4.37286217e-05], dtype=float32)

In [48]:
print("Your top 10 rated movies are: ")
top10_indices = np.flip(np.argsort(test_preds)[-10:])
movie_info.loc[top10_indices]["movie title"]

Your top 10 rated movies are: 


movie id
290                                 Absolute Power (1997)
468                                     Short Cuts (1993)
255     When the Cats Away (Chacun cherche son chat) (...
321                                 Murder at 1600 (1997)
180                             Return of the Jedi (1983)
301                              L.A. Confidential (1997)
1382       Second Jungle Book: Mowgli & Baloo, The (1997)
686                                  McHale's Navy (1997)
159                            Glengarry Glen Ross (1992)
124                                     Phenomenon (1996)
Name: movie title, dtype: object