# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

# Data Preprocessing

### Loading Dataset

In [2]:
# Read the anime catalogue and the (zipped) ratings table from ../data.
animes = pd.read_csv('../data/anime.csv')
ratings = pd.read_csv('../data/rating.csv.zip')

In [3]:
# Keep only ratings strictly greater than -1.
ratings = ratings[ratings['rating'] > -1]
# Keep only the catalogue entries whose type is 'TV'.
animes = animes[animes['type'] == 'TV']
# Drop ratings that point at shows removed by the filter above.
kept_anime_ids = animes['anime_id']
ratings = ratings[ratings['anime_id'].isin(kept_anime_ids)]

In [4]:
# Shapes of the filtered frames as (rows, columns).
# print(...) with a single argument behaves the same on Python 2 and 3.
print(animes.shape)
print(ratings.shape)

(3787, 7)
(4364294, 3)


### Transform Dataset

In [5]:
# Number of distinct users present in the filtered ratings
nb_users = ratings['user_id'].nunique()
# Number of anime shows kept in the catalogue
nb_shows = len(animes)
print(nb_users)
print(nb_shows)

68929
3787


In [None]:
# Renumber user_id so that the ids form a dense 0..N-1 sequence.
# The raw ids have gaps (there are fewer distinct users than the range of ids),
# and the per-user matrices built later are indexed by user id, so a gap would
# otherwise turn into an empty row.
#
# pd.factorize assigns codes in order of first appearance, which matches the
# previous row-by-row counter when rows are grouped by user and, unlike that
# counter, stays correct when the rows are not grouped. It is also vectorised
# instead of performing one iloc lookup per rating.
new_list_user_id = pd.factorize(ratings['user_id'])[0].tolist()

# Summary previously printed after the loop: last raw id and its new id.
actual_index = ratings['user_id'].iloc[-1] if new_list_user_id else None
new_index = new_list_user_id[-1] if new_list_user_id else 0
print(actual_index)
print(new_index)

In [None]:
# Map every raw anime_id to its positional index (0..nb_shows-1) in `animes`,
# built in one pass instead of one iloc row lookup per show.
mapped_anime_ids = dict(zip(animes['anime_id'], range(len(animes))))

In [None]:
# Preview the first rows of `ratings`.
ratings.head()

In [None]:
# Replace every raw anime_id with its positional index from mapped_anime_ids.
# Assigning through `ratings.iloc[i,].anime_id = ...` writes to a temporary row
# object and is not guaranteed to reach the frame, so map the whole column at
# once. The int cast fails loudly if an id has no mapping (map leaves it NaN).
ratings = ratings.assign(
    anime_id=ratings['anime_id'].map(mapped_anime_ids).astype('int64')
)

In [None]:
# Preview the last rows of `ratings`.
ratings.tail()

In [None]:
# Swap the raw user ids for the renumbered sequence computed earlier.
ratings = ratings.assign(user_id=new_list_user_id)

In [None]:
# Preview the last rows of `ratings` after the user_id column was replaced.
ratings.tail()

In [None]:
# Preview the first rows of `ratings` after the user_id column was replaced.
ratings.head()

### Splitting Data Set

In [None]:
# Export the remapped ratings to the working directory.
# The DataFrame index is written too (to_csv default); it shows up as the
# "Unnamed: 0" column when the file is read back and is dropped after the split.
ratings.to_csv('ratings_corrected.csv')

### Working with corrected data

In [6]:
# Reload the exported ratings so the slow remapping cells need not be re-run.
ratings_corrected = pd.read_csv('./ratings_corrected.csv')

#### Creating Training and Test Set with Numpy Arrays

In [20]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable),
# then discard the first column of each part.
training_set, test_set = train_test_split(
    ratings_corrected, random_state=42, test_size=0.2)
training_set = training_set.drop(training_set.columns[0], axis=1)
test_set = test_set.drop(test_set.columns[0], axis=1)

In [21]:
# Preview the first rows of the training split.
training_set.head()

Unnamed: 0,user_id,anime_id,rating
3190395,50219,673,7
2013110,31228,432,8
252341,4139,432,6
3172058,49945,364,8
4193400,66048,432,10


In [22]:
def convert_set(data, n_rows, n_cols):
    """Pivot (user, show, rating) triples into one dense rating row per user.

    Parameters
    ----------
    data : array-like with three columns
        Column 0 is the user index, column 1 the show index and column 2 the
        rating. A numpy array or a pandas DataFrame are both accepted.
    n_rows : int
        Number of users; only user indices in ``range(n_rows)`` are used.
    n_cols : int
        Number of shows, i.e. the length of every returned row.

    Returns
    -------
    list of list of float
        ``n_rows`` rows of length ``n_cols``; entry ``[u][a]`` is the rating of
        user ``u`` for show ``a`` and ``0.0`` when there is no such triple.

    Raises
    ------
    IndexError
        If a show index does not fit in ``n_cols``.
    """
    data = np.asarray(data)
    matrix = np.zeros((n_rows, n_cols))
    if data.size:
        users = data[:, 0].astype(int)
        shows = data[:, 1].astype(int)
        # Users outside range(n_rows) never matched a row before; skip them.
        keep = (users >= 0) & (users < n_rows)
        # A single fancy-indexed assignment replaces one full scan per user.
        matrix[users[keep], shows[keep]] = data[:, 2][keep]
    return matrix.tolist()

In [23]:
# Turn the training split into a plain integer ndarray.
training_set = np.array(training_set, dtype=int)

In [24]:
# Display the training rows whose first column (user index) is 0.
is_first_user = training_set[:, 0] == 0
training_set[is_first_user]

array([[  0, 408,  10],
       [  0, 568,  10],
       [  0, 882,  10]])

In [25]:
# Build one dense rating row per user from the training triples.
training_set = convert_set(data=training_set, n_rows=nb_users, n_cols=nb_shows)


KeyboardInterrupt


In [None]:
# test_set is still a DataFrame at this point, and convert_set slices its input
# with data[:, 0]; convert it to an integer ndarray first, exactly as was done
# for training_set.
test_set = convert_set(np.array(test_set, dtype='int'), nb_users, nb_shows)

# Creating Stacked AutoEncoder Model

### Creating SAE class

In [1]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from types import *
import numpy as np
import sys
from time import time

class SAE(nn.Module):
    """Stacked auto-encoder that reconstructs its own input row.

    The network is ``encoder -> [hidden layers / dropout] -> decoder``. A
    sigmoid follows the encoder and every hidden linear layer; the decoder
    output is returned without activation.

    Build the model with ``add_hiden_layer`` / ``add_dropout`` *before* calling
    ``compile``: the optimizer only receives the parameters that exist when
    ``compile`` runs. ``decoder_input`` must equal the output width of the last
    linear layer that feeds the decoder.

    Parameters
    ----------
    input_output_size : int
        Width of an input row, and therefore of the reconstructed output.
    encoder_input : int
        Output width of the encoder layer.
    decoder_input : int
        Input width of the decoder layer.
    """

    # Optimizer names accepted by ``compile``.
    _OPTIMIZERS = {
        'rmsprop': optim.RMSprop,
        'adadelta': optim.Adadelta,
        'adam': optim.Adam,
        'sgd': optim.SGD,
        'adagrad': optim.Adagrad,
    }

    def __init__(self, input_output_size, encoder_input=20, decoder_input=20):
        super(SAE, self).__init__()
        self.encoder = nn.Linear(input_output_size, encoder_input)
        # ModuleList (not a plain list) so that the hidden layers are registered
        # as sub-modules: otherwise self.parameters() skips them and the
        # optimizer never updates their weights.
        self.hidden_layers = nn.ModuleList()
        self.decoder = nn.Linear(decoder_input, input_output_size)
        self.activation = nn.Sigmoid()
        # Output width of the most recently added linear layer.
        self.last_out = encoder_input

    def forward(self, x):
        """Encode ``x``, run it through the hidden stack and decode it."""
        x = self.activation(self.encoder(x))
        for layer in self.hidden_layers:
            x = layer(x)
            # Only linear layers are followed by the activation; dropout
            # layers are applied as they are.
            if isinstance(layer, nn.Linear):
                x = self.activation(x)
        return self.decoder(x)

    def add_hiden_layer(self, out_features):
        """Append a linear layer mapping the current width to ``out_features``."""
        new_layer = nn.Linear(self.last_out, out_features)
        self.last_out = out_features
        self.hidden_layers.append(new_layer)

    # Correctly spelled alias; the original name is kept for existing callers.
    add_hidden_layer = add_hiden_layer

    def add_dropout(self, p=0.5):
        """Append a dropout layer with drop probability ``p``."""
        self.hidden_layers.append(nn.Dropout(p))

    def print_progress(self, message):
        """Overwrite the current console line with ``message``."""
        sys.stdout.write("\r" + message)
        sys.stdout.flush()

    def compile(self, optimizer='rmsprop', criterion='mse', lr=0.01, weight_decay=0.5):
        """Select the loss and the optimizer used by ``fit`` and ``perform``.

        Parameters
        ----------
        optimizer : str
            One of 'rmsprop', 'adadelta', 'adam', 'sgd', 'adagrad'.
        criterion : str or callable
            'mse' selects ``nn.MSELoss``; any other string selects
            ``nn.L1Loss``; a non-string object is used as the loss directly.
        lr : float
            Learning rate passed to the optimizer.
        weight_decay : float
            Weight decay passed to the optimizer.

        Raises
        ------
        ValueError
            If ``optimizer`` is not one of the supported names.
        """
        if isinstance(criterion, str):
            self.criterion = nn.MSELoss() if criterion == 'mse' else nn.L1Loss()
        else:
            self.criterion = criterion

        optimizer_cls = self._OPTIMIZERS.get(optimizer)
        if optimizer_cls is None:
            # Previously an unknown name silently left self.optimizer unset.
            raise ValueError(
                "unknown optimizer {0!r}; expected one of {1}".format(
                    optimizer, sorted(self._OPTIMIZERS)))
        self.optimizer = optimizer_cls(self.parameters(), lr=lr, weight_decay=weight_decay)

    def fit(self, X, nb_epoch):
        """Train on the rows of ``X`` one row at a time.

        ``X`` is a 2-D tensor. Rows without any value > 0 are skipped. For the
        other rows the outputs at positions where the row is 0 are zeroed
        before the loss is computed, so only the non-zero entries contribute.
        Progress and the mean per-row loss of every epoch are written to
        stdout. ``compile`` must have been called first.
        """
        init_time = time()
        rows, columns = X.size()
        rows = int(rows)
        for epoch in range(nb_epoch):
            train_loss = 0.
            s = 0.
            init_iteration_time = time()
            init_training_time = time()
            for index in range(rows):
                sample = Variable(X[index]).unsqueeze(0)
                # The target is a constant copy of the input.
                target = sample.clone().detach()
                n_rated = float(torch.sum(target.data > 0))
                if n_rated > 0:
                    # Gradients accumulate across backward() calls; reset them
                    # so each step uses the gradient of this row only.
                    self.optimizer.zero_grad()
                    output = self(sample)
                    output[target == 0] = 0
                    loss = self.criterion(output, target)
                    # The criterion averages over every column; rescale the
                    # reported value to an average over the entries > 0.
                    mean_corrector = columns / (n_rated + 1e-10)
                    loss.backward()
                    train_loss += np.sqrt(loss.item() * mean_corrector)
                    s += 1.
                    self.optimizer.step()
                end_training_time = time()
                self.print_progress("epoch: {0}/{1}, training: {2}/{3} - {4:.2%}, time: {5:.2f}s".format(epoch + 1, nb_epoch,index + 1, rows, (index + 1)/float(rows), end_training_time - init_training_time))
            end_iteration_time = time()
            # NaN instead of ZeroDivisionError when no row had any value > 0.
            epoch_loss = train_loss / s if s else float('nan')
            self.print_progress('epoch: {0}/{1}, training loss: {2}, total epoch time: {3:.2f}s\n'.format(epoch + 1, nb_epoch, epoch_loss, end_iteration_time - init_iteration_time))
        end_time = time()
        self.print_progress("Total Training Time: {0:.2f}m".format((end_time - init_time) / float(60)))

    def perform(self, X, y):
        """Print and return the mean per-row loss of predicting ``y`` from ``X``.

        ``X`` and ``y`` are 2-D tensors with matching rows. Rows of ``y``
        without any value > 0 are skipped. The model is evaluated in eval mode
        (dropout disabled) without gradient tracking, and the previous
        train/eval mode is restored afterwards. Returns NaN when every row was
        skipped.
        """
        test_loss = 0.
        s = 0.
        rows, columns = X.size()
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for index in range(int(rows)):
                    sample = Variable(X[index]).unsqueeze(0)
                    target = Variable(y[index]).unsqueeze(0)
                    n_rated = float(torch.sum(target.data > 0))
                    if n_rated > 0:
                        output = self(sample)
                        output[target == 0] = 0
                        loss = self.criterion(output, target)
                        mean_corrector = columns / (n_rated + 1e-10)
                        test_loss += np.sqrt(loss.item() * mean_corrector)
                        s += 1.
        finally:
            self.train(was_training)
        mean_loss = test_loss / s if s else float('nan')
        print("prediction loss {0}".format(mean_loss))
        return mean_loss

    def predict(self, X):
        """Return a list with one ``(1, n_columns)`` output per row of ``X``.

        Runs in eval mode (dropout disabled) without gradient tracking and
        restores the previous train/eval mode afterwards.
        """
        prediction = []
        rows, _ = X.size()
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for index in range(int(rows)):
                    sample = Variable(X[index]).unsqueeze(0)
                    prediction.append(self(sample))
        finally:
            self.train(was_training)
        return prediction
        
   

In [None]:
# SAE requires the width of an input row; every user row has nb_shows entries.
sae = SAE(nb_shows)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [None]:
# Convert the per-user rating rows into a float tensor for the training loop.
training_set = torch.FloatTensor(training_set)


In [None]:
# Train the auto-encoder one user row at a time.
nb_epoch = 200
for epoch in range(nb_epoch):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        # The target is a constant copy of the input row.
        target = input.clone().detach()
        n_rated = float(torch.sum(target.data > 0))
        # Users without any value > 0 contribute nothing and are skipped.
        if n_rated > 0:
            # Gradients accumulate across backward() calls; reset them so each
            # step only uses the gradient of the current user.
            optimizer.zero_grad()
            output = sae(input)
            # Ignore the shows this user has no rating for.
            output[target == 0] = 0
            loss = criterion(output, target)
            # The criterion averages over all shows; rescale the reported value
            # to an average over the rated shows only.
            mean_corrector = nb_shows / (n_rated + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item() * mean_corrector)
            s += 1.
            optimizer.step()
    # NaN instead of ZeroDivisionError when no user had any value > 0.
    epoch_loss = train_loss / s if s else float('nan')
    print('epoch: '+str(epoch)+' loss: '+str(epoch_loss))


In [1]:
# Kernel smoke test; the call form works on both Python 2 and Python 3.
print("Hello World")

Hello World


In [12]:
from sklearn.externals import joblib

In [None]:
# Persist the training data to 'tensor.pkl' in the working directory.
joblib.dump(training_set, 'tensor.pkl')