In [9]:
import json
import numpy as np
import pandas as pd
from app import dataset_word2vec, dataset_tfid
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
new_feats = None

In [11]:
def get_diversity(solutions):
    sim = cosine_similarity(solutions, solutions)
    np.fill_diagonal(sim, 0)
    return (1 - sim).sum(axis=1) * 1/(solutions.shape[0] - 1)

def get_novelty(solutions, data):
    sim = cosine_similarity(solutions, data)
    
    return (1-sim).max(axis=1) #* 1/(data.shape[0] - 1)
def evaluate(user, df_movies, df_ratings, index, data):
    df_ratings_u = df_ratings[df_ratings['userId'] == user].set_index('movieId')['rating']
    test = df_ratings_u[index[str(user)]['test']]
    y_true = test.copy()
    y_true[y_true <= 3] = 0
    y_true[y_true > 3] = 1
    y_pred = pd.Series(np.zeros(test.shape[0]), index=test.index, dtype=int)
    y_pred[data[str(user)]] = 1
    solutions = df_movies.iloc[data[str(user)]].drop(columns=['title'])
    train_data  = df_movies.iloc[index[str(user)]['train']].drop(columns=['title'])
    res = {}
    res['size_train'] = len(index[str(user)]['train'])
    res['size_test'] = len(index[str(user)]['test'])
    res['precision'] = precision_score(y_true, y_pred)
    res['recall'] = recall_score(y_true, y_pred)
    res['diversity'] = get_diversity(solutions).mean()
    res['novelty'] = get_novelty(solutions, train_data).mean()
    return res

In [12]:
n_features = 150
combination = 10
method = 'w2v'

In [13]:
import itertools
experiments_src = './app/recomendacoes/'+ method +'-'+ str(n_features) + '-'+str(combination) + '.txt'
all_feats = ['genres', 'rating', 'runtimes', 'year']
combinations = []
for i in range(5):
    for c in itertools.combinations(all_feats,i):
        combinations.append(list(c))

In [16]:
def get_movies(method, new_feats, combinations, combination):
    features = combinations[combination-1]
    if method == 'tfid':
        content = dataset_tfid
    if method == 'w2v':
        content = dataset_word2vec
        
    if new_feats is  None:
        
            df_movies, new_feats = content(features, op='sum', n_features=n_features, n_words=n_features)
            
    else:
        df_movies, new_feats = content(features, op='sum', n_features=n_features, new_feats=new_feats,n_words=n_features)
    return df_movies, new_feats
        
def get_rec(src_folder,name,  n_features, combination, model, nsga_iteractions):
    data = {}
    experiments_src = src_folder + name + '-'+str(n_features) + '-'+str(combination) + '-' + model + '-'+ str(nsga_iteractions) + '.txt'
    #experiments_src = './app/recomendacoes/cf-recomendacoes.txt'
    with open(experiments_src) as json_file:
        data = json.load(json_file)
        return data

In [18]:
df_movies, new_feats = get_movies(method, new_feats, combinations, combination)

Time to build vocab: 0.02 mins
Time to train the model: 1.04 mins
Time to compute vectors: 0.51 mins


In [19]:
df_ratings = pd.read_table('./app/datasets/ml-1m/ratings.dat', delimiter='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [27]:
index = {}
with open('./app/datasets/index_sample.txt') as json_file:
    index = json.load(json_file)
index.keys()
del index['last_user']
index.keys()

dict_keys(['3315', '4837', '3158', '3726', '1887', '5841', '2750', '5709', '5193', '816', '558', '3169', '2395', '4084', '2', '35', '1224', '5607', '4053', '3563', '5466', '1162', '1361', '2564', '5831'])

In [21]:
recs = get_rec(n_features, combination)
recs.keys()


TypeError: get_rec() missing 4 required positional arguments: 'n_features', 'combination', 'model', and 'nsga_iteractions'

In [39]:
def get_results(src_folder, name, nsga_iteractions, model, n_features, combination):
    recs = get_rec(src_folder, name, n_features, combination, model, nsga_iteractions)
    results = pd.DataFrame([], columns=['size_train','size_test','precision', 'recall', 'diversity','novelty'])
    if 'None' in recs.keys():
        del recs['None']
    df_movies, new_feats = get_movies(name, None, combinations, combination)
    for user in recs.keys():
        rec = evaluate(int(user), df_movies, df_ratings, index, recs)
        results = results.append(rec, ignore_index=True)

    results.index = list(recs.keys())
    #print('./app/recomendacoes/experiments/results/'+ method + '-'+ str(n_features) + '-'+str(combination) + '.csv')
    #results.to_csv('./app/recomendacoes/experiments/results/'+ method + '-'+ str(n_features) + '-'+str(combination) + '.csv')
    return results
#results = get_results(n_features, combination)

In [40]:
#results = pd.read_csv('./app/recomendacoes/experiments/results/'+ method + '+ str(n_features) + '-' +str(combination) + '.csv', index_col=0)
src_folder = './app/recomendacoes/experiments/moea-rs/'
name = 'w2v'
nsga_iteractions = 200
model ='ridge'
variations = [('w2v','ridge', 200, 150, 10), ('w2v','ridge', 300, 150, 10),
              ('w2v','ridge', 200, 250, 9), ('w2v','ridge', 300, 250, 9), 
              ('w2v','gbr', 200, 250, 13), ('w2v','gbr', 300, 250, 13),
              ('w2v','ridge', 200, 150, 15), ('w2v','ridge', 300, 150, 15),
              ('w2v','ridge', 200, 150, 9), ('w2v','ridge', 300, 150, 9)]
for name, model, nsga_iteractions, n_features, combination in variations:
    print('name:', name, 'model:', model, 'nsga_iteractions:', nsga_iteractions, 'n_features:',n_features)
    results = get_results(src_folder, name, nsga_iteractions, model, n_features, combination)
    print(results.describe().loc[['mean', 'max'], ['precision', 'recall', 'diversity','novelty']])

name: w2v model: ridge nsga_iteractions: 200 n_features: 150
Time to build vocab: 0.02 mins
Time to train the model: 1.07 mins
Time to compute vectors: 0.54 mins
      precision    recall  diversity   novelty
mean   0.726061  0.219244   0.093284  0.050929
max    1.000000  0.476190   0.114669  0.142721
name: tfid model: ridge nsga_iteractions: 200 n_features: 2000
      precision    recall  diversity   novelty
mean   0.703333  0.216747   0.093955  0.149922
max    1.000000  0.400000   0.165938  0.402239


In [34]:
results.describe().loc[['mean', 'max'], ['precision', 'recall', 'diversity','novelty']]

Unnamed: 0,precision,recall,diversity,novelty
mean,0.726061,0.219244,0.093287,0.05101
max,1.0,0.47619,0.114705,0.142936


In [41]:
for i in range(16, 17):
    df_movies, new_feats = get_movies(new_feats, combinations, i)
    results = get_results(n_features, i)
    print('Combinação:', i)
    print(results.describe().loc['mean', ['precision', 'recall', 'diversity','novelty']])
    print('------------------------------')

Combinação: 1
precision    0.661087
recall       0.211077
diversity    0.097062
novelty      0.046098
Name: mean, dtype: float64
------------------------------
Combinação: 2
precision    0.639331
recall       0.208399
diversity    0.098184
novelty      0.163688
Name: mean, dtype: float64
------------------------------


  after removing the cwd from sys.path.


Combinação: 3
precision    0.661167
recall       0.213234
diversity         inf
novelty      0.369050
Name: mean, dtype: float64
------------------------------
Combinação: 4
precision    0.632167
recall       0.205201
diversity    0.093548
novelty      0.047371
Name: mean, dtype: float64
------------------------------
Combinação: 5
precision    0.640670
recall       0.204563
diversity    0.112608
novelty      0.655535
Name: mean, dtype: float64
------------------------------
Combinação: 6
precision    0.651209
recall       0.210108
diversity    0.101647
novelty      0.378178
Name: mean, dtype: float64
------------------------------
Combinação: 7
precision    0.653948
recall       0.207473
diversity    0.098317
novelty      0.163604
Name: mean, dtype: float64
------------------------------
Combinação: 8
precision    0.643054
recall       0.203777
diversity    0.117209
novelty      0.653232
Name: mean, dtype: float64
------------------------------
Combinação: 9
precision    0.646947
reca

In [13]:
n_features

250