In [21]:
import json
import numpy as np
import pandas as pd
from app import dataset_word2vec, dataset_tfid
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
new_feats = None

In [23]:
def get_diversity(solutions):
    sim = cosine_similarity(solutions, solutions)
    np.fill_diagonal(sim, 0)
    return (1 - sim).sum(axis=1) * 1/(solutions.shape[0] - 1)

def get_novelty(solutions, data):
    sim = cosine_similarity(solutions, data)
    
    return (1-sim).max(axis=1) #* 1/(data.shape[0] - 1)
def evaluate(user, df_movies, df_ratings, index, data):
    df_ratings_u = df_ratings[df_ratings['userId'] == user].set_index('movieId')['rating']
    test = df_ratings_u[index[str(user)]['test']]
    y_true = test.copy()
    y_true[y_true <= 3] = 0
    y_true[y_true > 3] = 1
    y_pred = pd.Series(np.zeros(test.shape[0]), index=test.index, dtype=int)
    y_pred[data[str(user)]] = 1
    solutions = df_movies.iloc[data[str(user)]].drop(columns=['title'])
    train_data  = df_movies.iloc[index[str(user)]['train']].drop(columns=['title'])
    res = {}
    res['size_train'] = len(index[str(user)]['train'])
    res['size_test'] = len(index[str(user)]['test'])
    res['precision'] = precision_score(y_true, y_pred)
    res['recall'] = recall_score(y_true, y_pred)
    res['diversity'] = get_diversity(solutions).mean()
    res['novelty'] = get_novelty(solutions, train_data).mean()
    return res

In [24]:
n_features = 1500
combination = 1
method = 'tfid'

In [25]:
import itertools
experiments_src = './app/recomendacoes/experiments/'+ method +'-'+ str(n_features) + '-'+str(combination) + '.txt'
all_feats = ['genres', 'rating', 'runtimes', 'year']
combinations = []
for i in range(5):
    for c in itertools.combinations(all_feats,i):
        combinations.append(list(c))

In [26]:
def get_movies(new_feats, combinations, combination):
    features = combinations[combination-1]
    if method == 'tfid':
        content = dataset_tfid
    if method == 'w2v':
        content = dataset_w2v
        
    if new_feats is  None:
        
            df_movies, new_feats = content(features, op='sum', n_features=n_features, n_words=n_features)
        if method == 'w2v':
            
    else:
        df_movies, new_feats = content(features, op='sum', n_features=n_features, new_feats=new_feats,n_words=n_features)
    return df_movies, new_feats
        
def get_rec(n_features, combination):
    data = {}
    experiments_src = './app/recomendacoes/experiments/'+ method +'-'+ str(n_features) + '-'+str(combination) + '.txt'
    with open(experiments_src) as json_file:
        data = json.load(json_file)
        return data

In [7]:
df_movies, new_feats = get_movies(new_feats, combinations, combination)

Time to build vocab: 0.02 mins
Time to train the model: 1.1 mins
Time to compute vectors: 0.53 mins


In [27]:
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [28]:
index = {}
with open('./app/datasets/index_sample.txt') as json_file:
    index = json.load(json_file)
index.keys()

dict_keys(['3609', '1883', '5734', '240', '4829', '2943', '5347', '2575', '2818', '2472', '4243', '3394', '1851', '3718', '3085', '3370', '2248', '5256', '1726', '5620', '1643', '4984', '5607', '3457', '175', '4088', '4732', '500', '5912', '2478', '133', '3517', '5886', '4918', '424', '1968', '2162', '1464', '3380', '5976', '3183', '4053', '2185', '3721', '2468', '2167', '2020', '4773', '2446', '1870', '3105', '5862', '728', '1606', '134', '3821', '3546', '1796', '2024', '4489', '5568', '2457', '5526', '385', '4532', '5270', '187', '4566', '1500', '1948', '4512', '5140', '5217', '5517', '23', '6023', '1072', '425', '2148', '1671', '3963', '3364', '1577', '5676', '4509', '1626', '5811', '5450', '4364', '3910', '4802', '2153', '6037', '2947', '1802', '268', '2935', '4908', '4621', '1329', 'last_user'])

In [29]:
recs = get_rec(n_features, combination)
recs.keys()

dict_keys(['3609', '1883', '5734', '240', '4829', '2943', '5347', '2575', '2818', '2472', '4243', '3394', '1851', '3718', '3085', '3370', '2248', '5256', '1726', '5620', '1643', '4984', '5607', '3457', '175', '4088', '4732', '500', '5912', '2478', '133', '3517', '5886', '4918', '424', '1968', '2162', '1464', '3380', '5976', '3183', '4053', '2185', '3721', '2468', '2167', '2020', '4773', '2446', '1870', '3105', '5862', '728', '1606', '134', '3821', '3546', '1796', '2024', '4489', '5568', '2457', '5526', '385', '4532', '5270', '187', '4566', '1500', '1948', '4512', '5140', '5217', '5517', '23', '6023', '1072', '425', '2148', '1671', '3963', '3364', '1577', '5676', '4509', '1626', '5811', '5450', '4364', '3910', '4802', '2153', '6037', '2947', '1802', '268', '2935', '4908', '4621', '1329'])

In [30]:
def get_results(n_features, combination):
    recs = get_rec(n_features, combination)
    results = pd.DataFrame([], columns=['size_train','size_test','precision', 'recall', 'diversity','novelty'])
    user = list(recs.keys())[0]
    for user in recs.keys():
        rec = evaluate(int(user), df_movies, df_ratings, index, recs)
        results = results.append(rec, ignore_index=True)

    results.index = list(recs.keys())
    results.to_csv('./app/recomendacoes/experiments/results/w2v-'+ str(n_features) + '-'+str(combination) + '.csv')
    return results
#results = get_results(n_features, combination)

In [12]:
results = pd.read_csv('./app/recomendacoes/experiments/results/w2v-' + str(n_features) + '-' +str(combination) + '.csv', index_col=0)
results.describe().loc['mean', ['precision', 'recall', 'diversity','novelty']]

precision    0.607319
recall       0.194157
diversity    0.097823
novelty      0.015717
Name: mean, dtype: float64

In [None]:
for i in range(1, 17):
    df_movies, new_feats = get_movies(new_feats, combinations, i)
    results = get_results(n_features, i)
    print('Combinação:', i)
    print(results.describe().loc['mean', ['precision', 'recall', 'diversity','novelty']])
    print('------------------------------')

Combinação: 1
precision    0.646260
recall       0.212763
diversity    0.103628
novelty      0.046088
Name: mean, dtype: float64
------------------------------
Combinação: 2
precision    0.643095
recall       0.208189
diversity    0.099753
novelty      0.163705
Name: mean, dtype: float64
------------------------------
Combinação: 3
precision    0.644725
recall       0.208814
diversity    0.101387
novelty      0.367858
Name: mean, dtype: float64
------------------------------
Combinação: 4
precision    0.665159
recall       0.216210
diversity    0.094839
novelty      0.047380
Name: mean, dtype: float64
------------------------------
Combinação: 5
precision    0.636771
recall       0.205162
diversity    0.126595
novelty      0.658996
Name: mean, dtype: float64
------------------------------
Combinação: 6
precision    0.632235
recall       0.202021
diversity    0.103215
novelty      0.379716
Name: mean, dtype: float64
------------------------------
Combinação: 7
precision    0.655363
reca

In [13]:
n_features

250