In [1]:
import json
import numpy as np
import pandas as pd
from app import dataset_word2vec, dataset_tfid
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
new_feats = None

In [13]:
def get_diversity(solutions):
    sim = cosine_similarity(solutions, solutions)
    np.fill_diagonal(sim, 0)
    return (1 - sim).sum(axis=1) * 1/(solutions.shape[0] - 1)

def get_novelty(solutions, data):
    sim = cosine_similarity(solutions, data)
    
    return (1-sim).max(axis=1) #* 1/(data.shape[0] - 1)
def evaluate(user, df_movies, df_ratings, index, data):
    df_ratings_u = df_ratings[df_ratings['userId'] == user].set_index('movieId')['rating']
    test = df_ratings_u[index[str(user)]['test']]
    y_true = test.copy()
    y_true[y_true <= 3] = 0
    y_true[y_true > 3] = 1
    y_pred = pd.Series(np.zeros(test.shape[0]), index=test.index, dtype=int)
    y_pred[data[str(user)]] = 1
    solutions = df_movies.iloc[data[str(user)]].drop(columns=['title'])
    train_data  = df_movies.iloc[index[str(user)]['train']].drop(columns=['title'])
    res = {}
    res['size_train'] = len(index[str(user)]['train'])
    res['size_test'] = len(index[str(user)]['test'])
    res['precision'] = precision_score(y_true, y_pred)
    res['recall'] = recall_score(y_true, y_pred)
    res['diversity'] = get_diversity(solutions).mean()
    res['novelty'] = get_novelty(solutions, train_data).mean()
    return res

In [14]:
n_features = 2000
combination = 16
method = 'tfid'

In [15]:
import itertools
experiments_src = './app/recomendacoes/'+ method +'-'+ str(n_features) + '-'+str(combination) + '.txt'
all_feats = ['genres', 'rating', 'runtimes', 'year']
combinations = []
for i in range(5):
    for c in itertools.combinations(all_feats,i):
        combinations.append(list(c))

In [23]:
def get_movies(new_feats, combinations, combination):
    features = combinations[combination-1]
    if method == 'tfid':
        content = dataset_tfid
    if method == 'w2v':
        content = dataset_w2v
        
    if new_feats is  None:
        
            df_movies, new_feats = content(features, op='sum', n_features=n_features, n_words=n_features)
            
    else:
        df_movies, new_feats = content(features, op='sum', n_features=n_features, new_feats=new_feats,n_words=n_features)
    return df_movies, new_feats
        
def get_rec(n_features, combination):
    data = {}
    #experiments_src = './app/recomendacoes/'+ method +'-'+ str(n_features) + '-'+str(combination) + '.txt'
    experiments_src = './app/recomendacoes/cf-recomendacoes.txt'
    with open(experiments_src) as json_file:
        data = json.load(json_file)
        return data

In [17]:
df_movies, new_feats = get_movies(new_feats, combinations, combination)

In [18]:
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [24]:
index = {}
with open('./app/datasets/index.txt') as json_file:
    index = json.load(json_file)
index.keys()

dict_keys(['4169', '1680', '4277', '1941', '1181', '889', '3618', '2063', '1150', '1015', '5795', '4344', '1980', '2909', '1449', '4510', '424', '4227', '5831', '3391', '3841', '4508', '1088', '5367', '3808', '549', '1285', '3224', '3539', '4543', '5643', '1448', '752', '3032', '3824', '524', '4448', '4064', '4725', '1010', '5954', '678', '1447', '4447', '2116', '550', '1605', '3526', '1880', '1698', '1912', '3778', '4808', '3292', '4425', '1019', '3507', '3311', '881', '2181', '6016', '2015', '2106', '4647', '5812', '2820', '6036', '710', '4312', '1647', '4979', '2777', '4386', '5026', '2304', '352', '531', '5046', '1676', '1051', '3163', '3626', '2507', '1958', '5100', '5394', '3675', '2907', '5636', '4041', '869', '5333', '1733', '5614', '3401', '3272', '4085', '3067', '2665', '5788', '482', '5256', '3471', '3829', '1383', '195', '4238', '3792', '5627', '2529', '1340', '5111', '1317', '1613', '2857', '216', '3705', '1635', '2793', '302', '3650', '4682', '1837', '4482', '1425', '1737

In [25]:
recs = get_rec(n_features, combination)
recs.keys()

dict_keys(['4169', '1680', '4277', '1941', '1181', '889', '3618', '2063', '1150', '1015', '5795', '4344', '1980', '2909', '1449', '4510', '424', '4227', '5831', '3391', '3841', '4508', '1088', '5367', '3808', '549', '1285', '3224', '3539', '4543', '5643', '1448', '752', '3032', '3824', '524', '4448', '4064', '4725', '1010', '5954', '678', '1447', '4447', '2116', '550', '1605', '3526', '1880', '1698', '1912', '3778', '4808', '3292', '4425', '1019', '3507', '3311', '881', '2181', '6016', '2015', '2106', '4647', '5812', '2820', '6036', '710', '4312', '1647', '4979', '2777', '4386', '5026', '2304', '352', '531', '5046', '1676', '1051', '3163', '3626', '2507', '1958', '5100', '5394', '3675', '2907', '5636', '4041', '869', '5333', '1733', '5614', '3401', '3272', '4085', '3067', '2665', '5788', '482', '5256', '3471', '3829', '1383', '195', '4238', '3792', '5627', '2529', '1340', '5111', '1317', '1613', '2857', '216', '3705', '1635', '2793', '302', '3650', '4682', '1837', '4482', '1425', '1737

In [26]:
def get_results(n_features, combination):
    recs = get_rec(n_features, combination)
    results = pd.DataFrame([], columns=['size_train','size_test','precision', 'recall', 'diversity','novelty'])
    user = list(recs.keys())[0]
    for user in recs.keys():
        rec = evaluate(int(user), df_movies, df_ratings, index, recs)
        results = results.append(rec, ignore_index=True)

    results.index = list(recs.keys())
    #print('./app/recomendacoes/experiments/results/'+ method + '-'+ str(n_features) + '-'+str(combination) + '.csv')
    #results.to_csv('./app/recomendacoes/experiments/results/'+ method + '-'+ str(n_features) + '-'+str(combination) + '.csv')
    return results
results = get_results(n_features, combination)

In [22]:
#results = pd.read_csv('./app/recomendacoes/experiments/results/'+ method + '+ str(n_features) + '-' +str(combination) + '.csv', index_col=0)
results.describe().loc[['mean', 'max'], ['precision', 'recall', 'diversity','novelty']]

Unnamed: 0,precision,recall,diversity,novelty
mean,0.642416,0.213342,0.096508,0.182812
max,1.0,0.666667,0.500094,0.402301


In [27]:
results.describe().loc[['mean', 'max'], ['precision', 'recall', 'diversity','novelty']]

Unnamed: 0,precision,recall,diversity,novelty
mean,0.778657,0.32564,0.074396,0.182244
max,1.0,1.0,0.149539,0.402298


In [41]:
for i in range(16, 17):
    df_movies, new_feats = get_movies(new_feats, combinations, i)
    results = get_results(n_features, i)
    print('Combinação:', i)
    print(results.describe().loc['mean', ['precision', 'recall', 'diversity','novelty']])
    print('------------------------------')

Combinação: 1
precision    0.661087
recall       0.211077
diversity    0.097062
novelty      0.046098
Name: mean, dtype: float64
------------------------------
Combinação: 2
precision    0.639331
recall       0.208399
diversity    0.098184
novelty      0.163688
Name: mean, dtype: float64
------------------------------


  after removing the cwd from sys.path.


Combinação: 3
precision    0.661167
recall       0.213234
diversity         inf
novelty      0.369050
Name: mean, dtype: float64
------------------------------
Combinação: 4
precision    0.632167
recall       0.205201
diversity    0.093548
novelty      0.047371
Name: mean, dtype: float64
------------------------------
Combinação: 5
precision    0.640670
recall       0.204563
diversity    0.112608
novelty      0.655535
Name: mean, dtype: float64
------------------------------
Combinação: 6
precision    0.651209
recall       0.210108
diversity    0.101647
novelty      0.378178
Name: mean, dtype: float64
------------------------------
Combinação: 7
precision    0.653948
recall       0.207473
diversity    0.098317
novelty      0.163604
Name: mean, dtype: float64
------------------------------
Combinação: 8
precision    0.643054
recall       0.203777
diversity    0.117209
novelty      0.653232
Name: mean, dtype: float64
------------------------------
Combinação: 9
precision    0.646947
reca

In [13]:
n_features

250