In [1]:
#import required modules.

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset

In [67]:
class AutoEncoder(nn.Module):
    """Fully connected encoder/decoder network with a ReLU bottleneck.

    Layer widths are
    ``input_size -> 256 -> 128 -> latent_size -> 64 -> 128 -> output_size``.
    The output width is a separate argument, so the decoder does not have to
    reproduce the input space.
    """

    def __init__(self, input_size, output_size, latent_size):
        super().__init__()

        # Encoder
        self.e1 = nn.Linear(input_size, 256)
        self.e2 = nn.Linear(256, 128)

        # Latent view (bottleneck)
        self.lv = nn.Linear(128, latent_size)

        # Decoder
        self.d1 = nn.Linear(latent_size, 64)
        self.d2 = nn.Linear(64, 128)

        # Final projection; no activation is applied after it.
        self.output_layer = nn.Linear(128, output_size)

    def latent(self, x):
        """Return the ReLU-activated bottleneck code for ``x``."""
        hidden = x
        for layer in (self.e1, self.e2, self.lv):
            hidden = F.relu(layer(hidden))
        return hidden

    def forward(self, x):
        """Encode ``x`` to the bottleneck, then decode it to ``output_size`` values."""
        hidden = self.latent(x)
        for layer in (self.d1, self.d2):
            hidden = F.relu(layer(hidden))
        return self.output_layer(hidden)

In [49]:
# Build the two tables the autoencoder is trained on.
# NOTE(review): dataset_tfid / dataset_word2vec are project helpers that are not
# visible here; what their arguments mean is inferred from the names — confirm
# against the `app` package.
from app import dataset_word2vec, dataset_tfid
# X becomes the network input; the second return value is discarded.
X, _ = dataset_tfid(['genres', 'rating', 'runtimes', 'year'], op='sum', n_features=2000, n_words=2000)
# y becomes the network target; the second return value is discarded.
y, _ = dataset_word2vec([], op='sum', n_features=300)

Time to build vocab: 0.03 mins
Time to train the model: 1.13 mins
Time to compute vectors: 0.66 mins


In [50]:
# Sanity check: display the 'year' column of the input table.
X['year']

0        0.818898
1        0.818898
2        0.818898
3        0.818898
4        0.818898
           ...   
27273    0.913386
27274    0.874016
27275    0.968504
27276    0.866142
27277    0.960630
Name: year, Length: 26585, dtype: float64

In [51]:
# Drop the 'title' column and index both tables by movieId; the remaining
# columns are what gets converted to tensors for the network.
# Despite the *_train names, no train/test split happens here: every row is used.
X_train = X.drop(columns=['title']).set_index('movieId')
y_train = y.drop(columns=['title']).set_index('movieId')

In [94]:
# Input and output widths are the column counts of X_train and y_train;
# 36 is the size of the latent (bottleneck) layer.
ae = AutoEncoder(X_train.shape[1], y_train.shape[1], 36)
# Display the module so the layer sizes can be checked.
ae

AutoEncoder(
  (e1): Linear(in_features=4705, out_features=256, bias=True)
  (e2): Linear(in_features=256, out_features=128, bias=True)
  (lv): Linear(in_features=128, out_features=36, bias=True)
  (d1): Linear(in_features=36, out_features=64, bias=True)
  (d2): Linear(in_features=64, out_features=128, bias=True)
  (output_layer): Linear(in_features=128, out_features=300, bias=True)
)

In [78]:
# Convert the feature and target frames to float32 tensors for PyTorch.
trn_x_torch = torch.from_numpy(X_train.values).float()
trn_y_torch = torch.from_numpy(y_train.values).float()

In [79]:
# Pair each input row with its target row and serve them in mini-batches of 100.
trn = TensorDataset(trn_x_torch, trn_y_torch)
# shuffle=True reshuffles the rows every epoch, so the optimizer does not see
# the same batches of consecutive rows in the same order on every pass.
# This loader is only consumed by the training loop; latent features are later
# computed from trn_x_torch directly, so their row order is not affected.
trn_dataloader = torch.utils.data.DataLoader(trn, batch_size=100, shuffle=True, num_workers=4)

In [95]:
# Define the loss function and the optimizer:
# mean-squared error between the network output and the target vectors,
# minimised with Adam (learning rate 1e-3) over all parameters of `ae`.
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)

In [96]:
# Train the autoencoder: one optimizer step per mini-batch, EPOCHS passes over
# the loader.
losses = []  # per-batch training loss, in iteration order
EPOCHS = 50
for epoch in range(EPOCHS):

    for batch_idx, (data, target) in enumerate(trn_dataloader):
        # Tensors track gradients themselves, so the deprecated
        # torch.autograd.Variable wrapper is not needed.
        optimizer.zero_grad()

        pred = ae(data)
        loss = loss_func(pred, target)

        # .item() extracts the scalar loss as a plain Python float.
        batch_loss = loss.item()
        losses.append(batch_loss)

        # Backpropagation
        loss.backward()

        optimizer.step()

        # Display: rewrite a single status line on batches 1, 101, 201, ...
        if batch_idx % 100 == 1:
            print('\r Train Epoch: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch+1,
                EPOCHS,
                batch_idx * len(data),
                len(trn_dataloader.dataset),
                100. * batch_idx / len(trn_dataloader),
                batch_loss),
                end='')



In [97]:
# Encode every row with the trained network. This is inference only, so it runs
# under no_grad to avoid building an autograd graph for the whole dataset.
with torch.no_grad():
    latent_feats = ae.latent(trn_x_torch).numpy()
# Rows keep the order of X (X_train was derived from X without reordering),
# so X's index and its movieId/title columns line up with the latent codes.
movies = pd.DataFrame(latent_feats, index=X.index)
movies['movieId'] = X.movieId
movies['title'] = X.title
#movies['timestamp'] = X.timestamp

In [98]:
from sklearn.metrics import precision_score, recall_score
from app.datasets import dataset_ratings_user
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV, BayesianRidge, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

def evaluate(X, user, index, df_ratings):
    """Fit and score the rating models for a single user.

    Parameters
    ----------
    X : movie feature table passed through to ``dataset_ratings_user``.
    user : user id; ``index`` is looked up with ``str(user)``.
    index : mapping whose ``index[str(user)]`` entry holds the ``'train'`` and
        ``'test'`` row labels used to split that user's rows.
    df_ratings : ratings table passed through to ``dataset_ratings_user``.

    Returns
    -------
    Whatever ``train_predict`` returns for this user's split.

    NOTE(review): ``dataset_ratings_user`` is a project helper not visible
    here; it is expected to return a frame that contains ``'rating_user'`` and
    ``'title'`` columns next to the features — confirm against its definition.
    """
    rated = dataset_ratings_user(X, df_ratings=df_ratings, user=user)
    user_split = index[str(user)]
    train_rows = rated.loc[user_split['train'], :]
    test_rows = rated.loc[user_split['test'], :]
    # Only the two slices are needed from here on; drop the reference to the
    # full per-user frame before the models are fitted.
    del rated

    non_feature_cols = ['rating_user', 'title']
    return train_predict(
        train_rows.drop(columns=non_feature_cols),
        train_rows['rating_user'],
        test_rows.drop(columns=non_feature_cols),
        test_rows['rating_user'],
    )




def train_predict(X_train, y_train, X_test, y_test, threshold=3):
    """Fit several regressors and score them as liked/not-liked classifiers.

    Each model is tuned with ``GridSearchCV``, fitted on the training split and
    used to predict ratings for the test split. True and predicted ratings are
    then binarised with ``rating >= threshold`` and compared.

    Parameters
    ----------
    X_train, y_train : training features and ratings.
    X_test, y_test : held-out features and ratings.
    threshold : rating cut-off at or above which an item counts as positive
        (default 3, the value that was previously hard-coded).

    Returns
    -------
    (precision, recall) : two dicts keyed by model name
        (``'ridge'``, ``'svr'``, ``'elastic'``).
    """
    precision = {}
    recall = {}
    y_true = (y_test >= threshold) * 1

    def get_precision_recall(name, model):
        # Fit, predict, binarise, and record both scores under `name`.
        model = model.fit(X_train, y_train)
        y_pred = (model.predict(X_test) >= threshold) * 1
        precision[name] = precision_score(y_true, y_pred)
        recall[name] = recall_score(y_true, y_pred)

    get_precision_recall('ridge', GridSearchCV(Ridge(), {'alpha':[1e-3, 1e-2, 1e-1, 1]}))
    #get_precision_recall('lasso', GridSearchCV(Lasso(), {'alpha':[1e-3, 1e-2, 1e-1, 1]}))
    get_precision_recall('svr', GridSearchCV(SVR(),{'kernel':('linear', 'rbf'), 'C':[1, 10]}))
    get_precision_recall('elastic', GridSearchCV(ElasticNet(), {'alpha':[1e-3, 1e-2, 1e-1, 1]}))
    #get_precision_recall('sgd', GridSearchCV(SGDRegressor(), {'alpha':[1e-3, 1e-2, 1e-1, 1]}))
    #get_precision_recall('ada',AdaBoostRegressor(random_state=0, n_estimators=100))
    #get_precision_recall('gbr',GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls'))
    return precision, recall


def evaluate_method1(X, users, index, df_ratings):
    """Run ``evaluate`` for every user and collect the scores in two tables.

    Parameters
    ----------
    X : movie feature table forwarded to ``evaluate``.
    users : iterable of user keys; each is converted with ``int(user)`` for
        ``evaluate`` and used unchanged as a row label of the result.
    index : per-user train/test split mapping forwarded to ``evaluate``.
    df_ratings : ratings table forwarded to ``evaluate``.

    Returns
    -------
    (df_precisions, df_recalls) : DataFrames with one row per user and one
        column per model name.
    """
    # `users` is iterated below and then reused as the DataFrame index, so it is
    # materialised once; a one-shot iterable would otherwise be exhausted by
    # the loop.
    users = list(users)
    # Columns with no matching key in the per-user score dicts (currently
    # 'lasso' and 'sgd', which train_predict does not evaluate) come out as
    # all-NaN.
    columns = ['ridge', 'lasso', 'elastic', 'svr', 'sgd']
    precisions, recalls = [], []
    for i, user in enumerate(users):
        precision, recall = evaluate(X, int(user), index, df_ratings)
        precisions.append(precision)
        recalls.append(recall)
        print('i:', i)

    df_precisions = pd.DataFrame(precisions, index=users, columns=columns)
    df_recalls = pd.DataFrame(recalls, index=users, columns=columns)
    return df_precisions, df_recalls

In [99]:
import json

# Load the per-user train/test split definitions (JSON despite the .txt name).
with open('./app/datasets/index_sample.txt') as json_file:
    index = json.load(json_file)
# Ratings file uses '::' as separator and has no header row; the multi-character
# delimiter requires the python parsing engine.
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')
# The keys of `index` are iterated as user ids and converted with int() during
# evaluation, so the non-numeric 'last_user' entry has to be removed first.
del index['last_user']

In [100]:
# Score the latent features: one fit/evaluate round per user key in `index`.
# Prints a running counter ('i: N') after each user finishes.
df_precisions, df_recalls = evaluate_method1(movies, index.keys(), index, df_ratings)



i: 0




i: 1




i: 2




i: 3




i: 4




i: 5




i: 6




i: 7




i: 8




i: 9




i: 10




i: 11


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


i: 12




i: 13




i: 14




i: 15




i: 16




i: 17


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


i: 18




i: 19




i: 20




i: 21




i: 22




i: 23




i: 24




i: 25




i: 26




i: 27
i: 28




i: 29




i: 30




i: 31




i: 32




i: 33




i: 34


  'precision', 'predicted', average, warn_for)


i: 35




i: 36




i: 37




i: 38




i: 39




i: 40




i: 41




i: 42




i: 43




i: 44


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


i: 45




i: 46




i: 47




i: 48




i: 49
i: 50




i: 51




i: 52




i: 53


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


i: 54


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


i: 55




i: 56




i: 57
i: 58




i: 59




i: 60




i: 61




i: 62




i: 63




i: 64




i: 65




i: 66




i: 67




i: 68




i: 69




i: 70




i: 71




i: 72




i: 73




i: 74




i: 75




i: 76




i: 77




i: 78




i: 79




i: 80




i: 81




i: 82




i: 83




i: 84




i: 85




i: 86




i: 87




i: 88




i: 89




i: 90




i: 91




i: 92




i: 93




i: 94




i: 95




i: 96




i: 97




i: 98
i: 99




In [102]:
# Mean recall per model across all evaluated users; columns that were never
# filled by the evaluation show up as NaN.
df_recalls.mean()

ridge      0.923297
lasso           NaN
elastic    0.927787
svr        0.926404
sgd             NaN
dtype: float64