In [29]:
import pdb

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data

from torch.nn import functional

from torch.autograd import Variable

## Exploration des données
### Importation et reformatage des données associées aux usagers

In [2]:
users = pd.read_csv('../data/ml-100k/u.user', sep='|', header=None, engine='python', encoding='latin-1')

users_age = np.matrix(users.loc[:, 1])

users_sex = np.matrix(users.loc[:, 2])
users_sex[users_sex == 'M'] = 0
users_sex[users_sex == 'F'] = 1

users_occupation = np.array(pd.read_csv('../data/ml-100k/u.occupation', sep='|', header=None, engine='python', encoding='latin-1').loc[:, 0])
users_occupation = np.array(users.loc[:, 3])
occupation_matrix = np.zeros((len(users), len(users_occupation)))

for i in np.arange(len(users)): 
    i_occupation = users_occupation[i]
    
    for j in np.arange(len(users_occupation)):
        
        if i_occupation == users_occupation[j]:
            occupation_matrix[i, j] = 1
            break            
users_occupation = occupation_matrix

user_attributes = np.concatenate((users_sex, users_age, users_occupation.T)).T.tolist()


Exploration des données et statistiques descriptives

### Importation et reformatage des données associées aux films

In [3]:
movies = pd.read_csv('../data/ml-100k/u.item', sep='|', header=None, engine='python', encoding='latin-1')

movie_names = np.array(movies.loc[:, 1])
movies_genre = np.matrix(movies.loc[:, 5:])
movies_genre_names = np.array(pd.read_csv('../data/ml-100k/u.genre', sep='|', header=None, engine='python', encoding='latin-1').loc[:, 0])

Exploration des données et statistiques descriptives

### Importation et reformatage des données associées aux évaluations

Quelques fonctions utilitaires avant de commencer afin de pouvoir traiter les données.

In [4]:
def convert(data, nb_users, nb_movies):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

def split(data, ratio, tensor=False):

    train, valid = np.zeros((len(data), len(data[0]))).tolist(), np.zeros((len(data), len(data[0]))).tolist()

    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] > 0:
                if np.random.binomial(1, ratio, 1):
                    train[i][j] = data[i][j]
                else:
                    valid[i][j] = data[i][j]

    return [train, valid]

Exploration des données et statistiques descriptives

### Créations des sous-ensembles d'entrainement et de validation

In [5]:
#  Preparing the training set and the test set
training_set = pd.read_csv('../data/ml-100k/u1.base', delimiter='\t')
training_set = np.array(training_set, dtype='int')
test_set = pd.read_csv('../data/ml-100k/u1.test', delimiter='\t')
test_set = np.array(test_set, dtype='int')

#  Getting the number of users and movies
nb_users = int(max(max(training_set[:, 0]), max(test_set[:, 0])))
nb_items = int(max(max(training_set[:, 1]), max(test_set[:, 1])))

train_set = convert(training_set, nb_users, nb_items)
test_set = convert(test_set, nb_users, nb_items)

#  Creation des ensembles d'entrainement et de validation
train = split(train_set, 0.8)

# Conversion des données sous forme de tenseurs
train = torch.FloatTensor(train)
test = torch.FloatTensor(test_set)

Exploration des données et statistiques descriptives

## Système de recommendation basé sur des architectures d'apprentissage profond

Mettre commentaires

### Classe d'auto-encodeurs (AE)

In [10]:
class AE(nn.Module):

    def __init__(self, nb_movies):
        super(AE, self).__init__()

        self.fc1 = nn.Linear(nb_movies, 10)
        self.fc2 = nn.Linear(10, nb_movies)

        self.activation = nn.Sigmoid()

    def forward(self, x):
        latent = self.activation(self.fc1(x))
        return self.fc2(latent)

### Classe d'auto-encodeurs variationnels (VAE)

In [91]:
class VAE(nn.Module):
    def __init__(self, nb_movies):
        super(VAE, self).__init__()

        self.fc1 = nn.Linear(nb_movies, 100)
        self.fc21 = nn.Linear(100, 20)
        self.fc22 = nn.Linear(100, 20)
        self.fc3 = nn.Linear(20, 100)
        self.fc4 = nn.Linear(100, nb_movies)
        
        self.mu = None
        self.logvar = None
        
    def encode(self, x):
        h1 = functional.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z): 
        h3 = functional.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        self.mu, self.logvar = self.encode(x)
        z = self.reparameterize(self.mu, self.logvar)
        return self.decode(z)
    
    def loss_function(self, recon_x, x, criterion):

        LLK = criterion(recon_x, x)
        KLD = -0.5 * torch.sum(1 + self.logvar - self.mu.pow(2) - self.logvar.exp())
        return LLK + KLD

In [92]:
learning_rate = 0.002
nb_epoch = 1
regularization = 0.02
stop_crit = 0.002

In [93]:
nb_movies = len(movies)
nb_users = len(users)
vae = VAE(nb_movies)
criterion = nn.MSELoss()
optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

In [96]:
epoch = 1 
train_loss = 0
s = 0.
id_user =0
input = Variable(train[0][id_user]).unsqueeze(0)
target = input.clone()
if torch.sum(target.data > 0) > 0:

    output = vae(input)
    target.require_grad = False  
    output[target == 0] = 0

    loss = vae.loss_function(output[0], target, criterion)
    loss.backward()

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1682]], which is output 0 of SigmoidBackward, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [95]:
loss

tensor(2.0063, grad_fn=<AddBackward0>)

#### Paramètres considérés

Mettre description

#### Hyperparamètres considérés

Mettre description

In [25]:
learning_rate = 0.002
nb_epoch = 2
regularization = 0.02
stop_crit = 0.002

nb_movies = len(movies)
nb_users = len(users)

### Initiatilisation et entrainement du modèle

Mettre description - Ça serait bien de superposer les différentes courbes en temps réel

In [22]:
ae = AE(nb_movies)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(ae.parameters(), lr=learning_rate, weight_decay=regularization)

Boucle d'apprentissage

In [1]:
for epoch in range(1, nb_epoch + 1):
    
    train_loss = 0
    s = 0.
    
    for id_user in range(nb_users):
        input = Variable(train[0][id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:

            output = ae(input)
            target.require_grad = False  # pourquoi false?
            output[target == 0] = 0

            loss = criterion(output, target)

            #optimizer.zero_grad()  # pourquoi est-ce qu'il ne faut pas le mettre?
            loss.backward()  # calcul gradient
            optimizer.step()  # descente du gradient

            mean_corrector = nb_movies / float(torch.sum(target.data > 0) + 1e-10)
            train_loss += np.sqrt(loss.data * mean_corrector)
            s += 1.
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss / s))

NameError: name 'nb_epoch' is not defined

#### Évaluation finale sur l'ensemble de test

Mettre description

In [12]:
test_loss = 0
s = 0.

for id_user in range(nb_users):
    input = Variable(train[0][id_user]).unsqueeze(0)
    target = Variable(test[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data * mean_corrector)
        s += 1.
print('test loss: ' + str(test_loss/s))

NameError: name 'sae' is not defined

### Analyse des performances générales et visualisation des résultats

Visualisation des résultats: parler des prédictions en fonction:
1. des attributs de l'utilisateur (sexe, age occupation)
2. des genre de films

### Au-delà des statistiques, un exemple concret

Fonction pour effectuer les meilleures recommendations en fonction de certains critères (intégrés dans la classe). Dans un premier temps, nous pouvons suggérer les 'k' meilleures recommendations pour un usager en particulier. Naturellement, les recommendations faites ne suggèrent que des films non visionnés par l'usager.

In [None]:
user_id = 0
ratings = train_set[user_id]
top_what = 10

model.predict_instance(user_id, ratings, top_what)

En s'attardant un peu aux nouvelles recommendations faites, on peut identifier le comportement de l'usager et ses préférences en terme de genre.

En fait, ça pourrait être intéressant de proposer à l'usager des films en fonction de ces préférences du moment en fonction du genre.

In [None]:
genre = 'Action'
model.predict_instance(user_id, ratings, top_what, genre)

Peut-être même pourrions-nous sonder son incontient (lire les couches latentes du modèle) et de lui proposer de nouveaux films que lui-même n'imaginait pas aimer. Ces recommendations sont faites au-delà des genres explicitement définis dans le jeu de données initial. Pour plus de détail, voir la section supplément.

## Comparaison des différentes techniques

Présenter discussion

### Supplément - Dans la psyché du modèle (ou l'exploration des couches latentes)

Mettre commentaires

In [50]:
def rearrange(items, ratings):
    attribute, scores = [], []
    ranking = np.argsort(ratings)

    for k in np.arange(len(ranking)):
        attribute.append(items[ranking[k]])
        scores.append(ratings[ranking[k]])

    return attribute, scores

In [52]:
nb = 3

ratings = recfact.Q[:, 0]
movie_names = movies.loc[:, 1]
sorted_names, sorted_scores = rearrange(movie_names, ratings)

low_names, low_scores = sorted_names[:nb], sorted_scores[:nb]

high_names, high_scores = sorted_names[-nb:], sorted_scores[-nb:]

center = int(len(names)/2)
center_names, center_scores = sorted_names[(center): (center+1)], sorted_scores[(center): (center+1)]

scores = low_scores + center_scores + high_scores
names = low_names + center_names + high_names

## VAE

In [60]:
learning_rate = 0.002
nb_epoch = 1
regularization = 0.02
stop_crit = 0.002

In [61]:
nb_movies = len(movies)
nb_users = len(users)
vae = VAE(nb_movies)
criterion = nn.MSELoss()
optimizer = optim.Adam(vae.parameters(), lr=learning_rate)

In [65]:
epoch = 1 
train_loss = 0
s = 0.
id_user =0
input = Variable(train[0][id_user]).unsqueeze(0)
target = input.clone()
if torch.sum(target.data > 0) > 0:

    output = vae(input)
    target.require_grad = False  
    output[target == 0] = 0

    #loss = loss_function(output[0], target, output.mu, output.logvar, criterion)

NameError: name 'mu' is not defined

In [58]:
loss.backward()

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1682]], which is output 0 of SigmoidBackward, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [51]:
torch.autograd.set_detect_anomaly(False)

for epoch in range(1, nb_epoch + 1):
    
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(train[0][id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:

            output = vae(input)
            target.require_grad = False  
            output[0][target == 0] = 0
            
            loss = loss_function(output[0][0], target, output[1], output[2], criterion)
            print(loss)
            loss.backward(retain_graph=True)  
            print('a')
            print('ider user: ', id_user, 'loss: ', loss)
            optimizer.step()

            mean_corrector = nb_movies / float(torch.sum(target.data > 0) + 1e-10)
            train_loss += np.sqrt(loss.data * mean_corrector)
            s += 1.
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss / s))

tensor(1.1872, grad_fn=<AddBackward0>)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1682]], which is output 0 of SigmoidBackward, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [52]:
loss.backward(retain_graph=True)

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 1682]], which is output 0 of SigmoidBackward, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).