In [118]:
# LIBRARIES

import pandas as pd
import numpy as np
import itertools as it
from datetime import datetime
import networkx as nx
import math
# Documentacion de la libreria: http://networkx.readthedocs.io/en/networkx-1.11/

from operator import itemgetter
from itertools import groupby

In [119]:
# CONSTANTS 

#DATASET_SIZE = 100000
#HALF_DATASET_SIZE = int(DATASET_SIZE / 2)
RATING_THRESHOLD = 4
WEIGHT_THRESHOLD = 5
K = 10
MEASURES = ['aa', 'cn', 'ew', 'jn', 'pa', 'waa', 'wcn', 'wpa']

In [120]:
# FUNCTIONS AND SUBPROGRAMS

In [121]:
def compareNodes(f_list, s_list):
    """
        Function that returns the number of users that have interact with both items
        Funcion que devuelve el numero de usuarios que han interactuado con ambos items
    """
    peso = len(np.intersect1d(f_list, s_list))
    
    return peso
    
def createLinks(prob_us_set, nodos, threshold):
    """
        Function that creates graph links with the information about the set. The weight has to be grater or equal to threshold.
        
        Funcion que crea los enlaces del grafo a partir de la informacion contenida en el conjunto que se le
        pasa a la funcion. El peso tiene que ser mayor o igual al umbral.
        
        Format of links list -> [(Node1, Node2, weight), ......]
    """
    resultado = list() 
    
    # hago todas las posibles combinaciones de problemas
    for fst, snd in it.combinations(nodos, 2):
        # obtengo el peso pasando la lista de usuarios que ha hecho cada problema
        peso = compareNodes(prob_us_set[fst], prob_us_set[snd])
        if peso >= threshold:
            resultado.append((fst, snd, peso))
            
            
            
    return resultado

In [122]:
def create_graph_nx(list_nodes, list_links):
    """
        Function that creates a graph with the format from NetworkX 
        
        Funcion que crea un grafo de tipo Graph de la libreria NetworkX
        Construccion del grafo: http://networkx.readthedocs.io/en/networkx-1.11/tutorial/tutorial.html#what-to-use-as-nodes-and-edges
    """
    grafo = nx.Graph() # creo la variable grafo

    # incluyo los nodos del grafo 
    grafo.add_nodes_from(list_nodes)

    # se incluyen las tuplas de enlaces con el peso del enlace
    # es una lista de la forma [(Nodo1, Nodo2, peso), ......]
    grafo.add_weighted_edges_from(list_links)

    return grafo

In [123]:
def apply_aa(row, graph):
    """
        Funcion que devuelve para cada par de nodos, el sumatorio de 1/log(N(z)), siendo N(z) el grado del nodo z para todo z 
        perteneciente al conjunto de nodos en comun de ese par de nodos
    """
    
    # obtengo un iterador de un solo elemento que tiene en la tercera posicion el valor de AA para el par de nodos
    value = nx.adamic_adar_index(graph, [(row['one'], row['two'])])
    
    value_aa = 0
    for u, v, p in value:
        # itero el iterador, guardando el valor de adar adamic
        value_aa = p
    
    return value_aa

In [124]:
def apply_cn(row, graph):
    """
        Funcion que devuelve el numero de vecinos en comun de esos dos nodos
    """
    return len(list(nx.common_neighbors(graph, row['one'], row['two'])))

In [125]:
def apply_ew(row, graph):
    """
        Funcion que devuelve el peso del enlace en cada par
    """
    fst = row['one']
    snd = row['two']
    
    weight = graph.get_edge_data(fst, snd)
    
    # print(weight)
    
    if weight == None: # devuelve 0 en caso de que no exista enlace
        return 0
    else: # si si existe, devuelve el peso
        return weight['weight']

In [126]:
def apply_jn(row, graph):
    """
        Funcion que devuelve el numero de vecinos en comun de esos dos nodos
    """
    values_jn = nx.jaccard_coefficient(graph, [(row['one'], row['two'])])
    
    value_jn = 0
    for u, v, p in values_jn:
        value_jn = p # saco el valor
        
    return value_jn

In [127]:
def apply_pa(row, graph):
    """
        Funcion que devuelve el valor de preferential attachment
    """
    values_pa = nx.preferential_attachment(graph, [(row['one'], row['two'])])
    
    value_pa = 0
    for u, v, p in values_pa:
        value_pa = p # saco el valor
        
    return value_pa


In [128]:
def apply_waa(row, graph):
    """
        Funcion que devuelve para cada par de nodos, el valor de weighted adar adamic
    """
    
    # primero tengo que calcular los common neighbors de ambos items
    cn_list = nx.common_neighbors(graph, row['one'], row['two'])
    
    # ahora tengo que hacer el sumatorio del valor para cada elemento de cn_list
    value_waa = sum([((graph[row['one']][x]['weight'] + graph[row['two']][x]['weight']) / math.log(1 + graph.degree(x, weight="weight"), 10) )  for x in cn_list])    
    
    
    return value_waa

In [129]:
def apply_wcn(row, graph):
    """
        Funcion que devuelve el numero de vecinos en comun de esos dos nodos
    """
    cn_list = nx.common_neighbors(graph, row['one'], row['two'])
    
    value_wcn = sum([graph[row['one']][x]['weight'] + graph[row['two']][x]['weight'] for x in cn_list])
    
    return value_wcn

In [130]:
def apply_wpa(row, graph):
    """
        Funcion que devuelve el valor de weighted preferential attachment
    """
    value_wpa = graph.degree(row["one"], weight="weight") * graph.degree(row["two"], weight="weight")
        
    return value_wpa

In [131]:
def create_measure_data(graph):

    """
        Function that builds a dataframe with all the combinations between the nodes.
        
    """
    
    # Ahora voy a construir un DataFrame que tenga dos columnas con todas las posibles combinaciones de problemas, y otra 
    # columna con el valor de la medida especificada para ese par de problemas
    fst_column = list()
    snd_column = list()
    for fst, snd in it.combinations(nodes, 2):
        fst_column.append(fst)
        snd_column.append(snd)

    d = {'one' : fst_column,
        'two' : snd_column}
    dataFrame_measure = pd.DataFrame(d)

    return dataFrame_measure

In [132]:
def apply_measure(dataFrame_measure, measure):
     
    """
        Function that builds a dataframe with all the similarity values for two nodes.
        
    """
    # Aplico la funcion a cada fila
    if measure == 'aa':
        print("AA")
        dataFrame_measure['aa'] = dataFrame_measure.apply (lambda row: apply_aa(row, graph), axis=1)
    elif measure == 'cn':
        print("CN")
        dataFrame_measure['cn'] = dataFrame_measure.apply (lambda row: apply_cn(row, graph), axis=1)
    elif measure == 'ew':
        print("EW")
        dataFrame_measure['ew'] = dataFrame_measure.apply (lambda row: apply_ew(row, graph), axis=1)
    elif measure == 'jn':    
        print("JN")
        dataFrame_measure['jn'] = dataFrame_measure.apply (lambda row: apply_jn(row, graph), axis=1)
    elif measure == 'pa':    
        print("PA")
        dataFrame_measure['pa'] = dataFrame_measure.apply (lambda row: apply_pa(row, graph), axis=1)

    elif measure == 'waa':
        print("WAA")
        dataFrame_measure['waa'] = dataFrame_measure.apply (lambda row: apply_waa(row, graph), axis=1)
    elif measure == 'wcn':    
        print("WCN")
        dataFrame_measure['wcn'] = dataFrame_measure.apply (lambda row: apply_wcn(row, graph), axis=1)
    elif measure == 'wpa':    
        print("WPA")
        dataFrame_measure['wpa'] = dataFrame_measure.apply (lambda row: apply_wpa(row, graph), axis=1)
        
    return dataFrame_measure

In [133]:
def getrecommendations(row, measure_df, measure):
    """
        Funcion que devuelve la lista de k mejores problemas para el usuario
    """
    
    # primero saco los dos dataframes con problemas que se pueden recomendar
    column_result_one = measure_df[measure_df['one'] == row['item']]
    column_result_two = measure_df[measure_df['two'] == row['item']]
    
    
    tmp1 = column_result_two['two'].tolist()
    tmp2 = column_result_two['one'].tolist()
    tmp3 = column_result_two[measure].tolist()
   
    # creo un nuevo df
    df_tmp = pd.DataFrame({'one':tmp1, 'two':tmp2, measure: tmp3})
    
    frames = [column_result_one, df_tmp]
    
    # concateno los resultados
    column_result_tmp = pd.concat(frames, sort=True)
    
    # ordeno los problemas que se pueden recomendar
    column_result_tmp2 = column_result_tmp.sort_values(measure, ascending=False)
     
    tmp1 = column_result_tmp2['one'].tolist()
    tmp2 = column_result_tmp2['two'].tolist()
    tmp3 = column_result_tmp2[measure].tolist()
    
    # creo un nuevo df     
    column_result = pd.DataFrame({'one':tmp1, 'two':tmp2, measure: tmp3})
    
    return (column_result['two'].tolist())

In [134]:
def apply_getrecommendations(df_new, measure_df, measure):
    """
    Function to generate a new column with the list of recommendations for each user
    """
    df_new['list_recommendations'] = df_new.apply(lambda row: getrecommendations(row, measure_df, measure), axis=1)
    #print("Nueva lista recommendations")
    #print(df_new['list_recommendations'])
    #print(df_new)
    return df_new.copy()

In [135]:
def intersection(l1, l2): 
    linter = [e for e in l1 if e in l2] 
    return linter 

In [136]:
def getSimilarItems(item, user, df_measure, df_users):
    # saco la lista de items mas similares a item
    lista = df_measure[df_measure['item'] == item]['list_recommendations'].values 
    
    sim_items_list = list()
    
    if (len(lista) != 0):
        sim_items_list = lista[0]
    
    list_items_user = list()
    
    # ahora hago el filtro del usuario: solo me quedo con aquellos items con los que haya interactuado el usuario
    # saco la lista de items con los que ha interactuado el usuario
    if user in df_users:
        list_items_user = df_users[user]
    
    # ahora, de los items similares, me quedo con los que ha hecho el usuario, manteniendo el orden, ya que 
    # los primeros son los mas similares
    sim_items_user = intersection(sim_items_list, list_items_user)
    
    return sim_items_user

In [137]:
def getKsim(sim_items, k):
    """
        Funcion que saca las k mejores recomendaciones para el usuario
        Lo que hace es coger los primeros k valores de la lista de recomendaciones
    """
    if len(sim_items) == 0:
        return []
    else:
        return sim_items[:k]

In [138]:
def getSimilar(predicted_df, dataframe_measures, measure, df_users):
    """
         Calculo los items similares para una medida en concreto para un item en concreto, solo teneindo en cuenta aquellos
         items con los que ha interactuado el usuario en el conjunto de entrenamiento.
    """
    measure_idx = MEASURES.index(measure)
    print(predicted_df)
    predicted_df['sim_items'] = predicted_df.apply(lambda row: getSimilarItems(int(row['item']), row['user'], dataframe_measures[measure_idx], df_users), axis=1)
    
    
    # Crear una columna de 1 a 10 k
    for k in range(1,K+1):
        name = 'sim_items_' + str(k)
        predicted_df[name] = predicted_df.apply(lambda row: getKsim(row['sim_items'], k), axis=1)
    
    predicted_df_copy = predicted_df.drop(['sim_items'], axis=1)
    
    return predicted_df_copy
    

In [139]:
def avgPredRating(user, sim_items, df_measure):
    """
        Devuelve la media de los ratings predichos de los items similares
    """
    # si no hay items 
    if len(sim_items) == 0:
        return 0
    else:
        pred_ratings = list()
        # para todos los elementos de la lista de items similares, calculo su rating real en el conjunto de entrenamiento
        for elem in sim_items:
            df_measure_user = df_measure[df_measure['user'] == user]
            df_measure_user_item = df_measure_user[df_measure_user['item'] == elem]
            
            # si el user-item no esta en el conjunto de entrenamiento, pongo que su rating es cero
            if df_measure_user_item.empty:
                pred_ratings.append(0)
            else:
                pred_ratings.append(df_measure_user_item['rating'].values[0])
                
        # devuelvo la media de los valores de la lista
        return sum(pred_ratings) / len(pred_ratings)

In [140]:
def getPredRatings(dataframe_similar_items, measure, training_set):
    """
        Funcion para obtener la media de ratings predichos para una lista de items similares
    """
    # saco el dataframe para la medida seleccionada
    measure_idx = MEASURES.index(measure)
    df_measure = dataframe_similar_items[measure_idx]
    
    # de 1 a 10 similar items
    for k in range(1,K+1):
        name = 'avg_rating_' + str(k)
        name_sim = 'sim_items_' + str(k)
        # genero la media de los ratings reales en el conjunto de entrenamiento para los items similares
        df_measure[name] = df_measure.apply(lambda row: avgPredRating(int(row['user']), row[name_sim], training_set), axis=1)
        
        
    return df_measure

In [141]:
def getDifferenceRow(predicted, avg):
    return pow(abs(predicted - avg),2)

In [142]:
def getDiffs(dataframe_similar_items, measure):
    """
        Obtenemos tambien las diferencias entre el rating predicho del item de evaluacion y el avg de los items similares
    """
    # saco el dataframe para la medida seleccionada
    measure_idx = MEASURES.index(measure)
    df_measure = dataframe_similar_items[measure_idx]
    
    # de 1 a 10 similar items
    for k in range(1,K+1):
        name_dif = 'diff_' + str(k)
        name_avg = 'avg_rating_' + str(k)
        name_sim = 'sim_items_' + str(k)
        
        # elimino la columna con los items similares, ya que ya no lo necesitamos
        df_measure.drop([name_sim], axis=1, inplace=True)
        

        # genero la media de los ratings reales en el conjunto de entrenamiento para los items similares
        df_measure[name_dif] = df_measure.apply(lambda row: getDifferenceRow(row['predicted'], row[name_avg]), axis=1)
        
    return df_measure

In [143]:
def getRMSE(df_diffs, measure):
    """
        Funcion que obtiene la media de las diferencias: RMSE para k = 1..10 y medidas de similitud = MEASURES
    """
    # saco el dataframe para la medida seleccionada
    measure_idx = MEASURES.index(measure)
    df_diff_measure = df_diffs[measure_idx]
    
    rmse_avg = list()
    
    # de 1 a 10 similar items
    for k in range(1,K+1):
        name_dif = 'diff_' + str(k)
        # concateno a la lista el valor de rmse para k
        rmse_avg.append(math.sqrt(df_diff_measure[name_dif].mean()))
        
    return rmse_avg

In [144]:
############ MAIN

In [145]:
# I want to split the dataset into training set and evaluation set
# I am going to use the first half of ratings to build the training set and the second one to build the evaluation set

# I build the training set
training_set = pd.read_csv('trainset.csv')

# I delete the ratings with values < 4 (movies with ratings < 4 are not interested to users)
#training_set = training_set[training_set['rating'] >= RATING_THRESHOLD]

training_set.columns = ['user', 'item', 'rating', 'timestamp']

print(training_set)
training_set[training_set['user'] == 545]

       user  item  rating   timestamp
0         1     1     4.0   964982703
1         1    47     5.0   964983815
2         1    50     5.0   964982931
3         1    70     3.0   964982400
4         1   101     5.0   964980868
5         1   110     4.0   964982176
6         1   151     5.0   964984041
7         1   163     5.0   964983650
8         1   231     5.0   964981179
9         1   235     4.0   964980908
10        1   260     5.0   964981680
11        1   296     3.0   964982967
12        1   356     4.0   964980962
13        1   367     4.0   964981710
14        1   441     4.0   964980868
15        1   457     5.0   964981909
16        1   480     4.0   964982346
17        1   500     3.0   964981208
18        1   553     5.0   964984153
19        1   590     4.0   964982546
20        1   592     4.0   964982271
21        1   593     4.0   964983793
22        1   596     5.0   964982838
23        2   318     3.0  1445714835
24        4    21     3.0   986935199
25        4 

Unnamed: 0,user,item,rating,timestamp


In [146]:
# I build the evaluation_set
#evaluation_set = df[HALF_DATASET_SIZE:]

#evaluation_set = evaluation_set[evaluation_set['rating'] != 0.5]

#evaluation_set = pd.read_csv('testset.csv')
evaluation_set = pd.read_csv('testset_estratificado.csv')

evaluation_set.columns = ['user', 'item', 'rating', 'timestamp']

print(evaluation_set)

     user  item  rating   timestamp
0      58   344     1.0   847718434
1      68   357     1.0  1240092554
2      68   420     1.0  1158533188
3      71    17     1.0   864737933
4      81   367     1.0   845299844
5      91   374     1.0  1112716895
6     100   235     1.0  1100183797
7     110   374     1.0  1175329527
8     181   191     1.0   845470737
9     191    39     1.0   829760897
10    223    34     1.0  1226209953
11    262    48     1.0   840305912
12    294   255     1.0   966597066
13    297   170     1.0   900875950
14    307   434     1.0  1227538408
15    308   223     1.0  1421374410
16    314   327     1.0   834428752
17    381   586     1.0  1164877371
18    428   165     1.0  1111489756
19    431   158     1.0  1267051813
20    446   168     1.0   843839441
21    474   122     1.0   974669063
22    485   231     1.0   837943293
23    489   160     1.0  1334171163
24    494   344     1.0  1000384113
25    500   471     1.0  1005528017
26    536   357     1.0   83

In [147]:
# sin estratificar
#predicted_df = pd.read_csv('predicted_values.csv')

#predicted_df.columns = ['user', 'item', 'rating', 'predicted']

#print(predicted_df)

In [148]:
# Estratificado
predicted_df_original = pd.read_csv('predicted_values_clean.csv', sep=';')

predicted_df_original.columns = ['user', 'item', 'rating', 'predicted']

predicted_df = predicted_df_original.copy()
print(predicted_df)


     user  item  rating  predicted
0      71    17     1.0   2.265318
1      14    19     1.0   2.776735
2      58    19     1.0   3.559457
3     604    19     1.0   2.909599
4     223    34     1.0   3.162531
5     191    39     1.0   3.867352
6     262    48     1.0   2.997984
7     603   110     1.0   3.829871
8     474   122     1.0   3.541002
9     431   158     1.0   4.003648
10    489   160     1.0   2.646791
11    428   165     1.0   1.867516
12    446   168     1.0   3.043357
13    297   170     1.0   3.500043
14    181   191     1.0   3.519124
15    584   193     1.0   3.473004
16    308   223     1.0   2.169618
17    485   231     1.0   3.672982
18    100   235     1.0   4.003491
19    294   255     1.0   2.265807
20    314   327     1.0   2.839751
21     58   344     1.0   2.908802
22    494   344     1.0   2.333342
23    584   344     1.0   3.872642
24     68   357     1.0   3.192768
25    536   357     1.0   3.318538
26     81   367     1.0   3.442622
27     91   374     

In [149]:
# I get the list of nodes
nodes = training_set.item.unique()

print(len(nodes))
print(nodes)

164
[  1  47  50  70 101 110 151 163 231 235 260 296 356 367 441 457 480 500
 553 590 592 593 596 318  21  45 125 215 247 348 509 588 599  34  36  39
 150 153 253 261 266 344 349 357 364 380 515 527 594 597  10  11  15  16
  17  19  22  25  60  61  62  79  86  92  93  95 100 104 112 140 145 158
 160 161 165 168 170 180 185 191 196 208 224 225 254 292 303 315 317 327
 329 330 337 350 352 353 366 370 374 377 382 383 405 412 434 454 485 493
 494 497 508 510 516 546 587 589 586  41 223  44 105 193 376 477 355 111
 204 541 555  48 328 338 379 420 610  23 246 471 562 122 144  57 259 519
 532  18  69 387 482 255 523  20  14 534  81 533 175 233 188 132 393 294
 476 521]


In [150]:
# I create a dictionary: keys are the items, and values are the list of users that are interacted with this item
grouped = training_set.groupby('item')['user'].apply(list)

#print(grouped)

In [151]:
# I create the links with the suitable format for nx
links = createLinks(grouped, nodes, WEIGHT_THRESHOLD)

print(len(links))



7958


In [152]:
# I create the graph
graph = create_graph_nx(nodes, links)

In [153]:
grouped_user = training_set.groupby('user')['item'].apply(list)

# diccionario que va a contener como key el user, como value, los items con los que ha interactuado el user
df_users = {}

for i,j in zip(grouped_user.index.tolist(), grouped_user.values.tolist()):
    df_users[i] = j 


In [154]:
d = {'item': list(set(training_set['item'].tolist()))}
df_similar = pd.DataFrame(data=d)

In [155]:
# I create a dataframe that keeps the similarity values from the measures

# I add columns with the similarity values for each row
measure_df = create_measure_data(graph)

measure_df = apply_measure(measure_df, 'cn') 

#print(measure_df)


CN


In [156]:
measure_df = apply_measure(measure_df, 'ew') 

EW


In [157]:
measure_df = apply_measure(measure_df, 'aa')

AA


In [158]:
measure_df = apply_measure(measure_df, 'jn')

JN


In [159]:
measure_df = apply_measure(measure_df, 'pa')

PA


In [160]:
measure_df = apply_measure(measure_df, 'wcn')

WCN


In [161]:
measure_df = apply_measure(measure_df, 'waa')

WAA


In [162]:
measure_df = apply_measure(measure_df, 'wpa')

WPA


In [163]:
#print(measure_df)
#measure_df.to_csv("C:/hlocal/measure_df_items.csv")

In [164]:
# I build a list of list: rows for k, columns for measures
# k --> 1 a 10 (0 a 9)
# measures --> cn, ew, aa, jn, pa, wcn, waa, wpa (0 a 7)
# each cell has a dataframe for k = i and measure = j

dataframe_measures = list()

# creo una lista en cada posicion, en esa lista vamos a guardar los valores para las measures 
# obtenemos las recomendaciones para cada celda
# creo una copia par que no se modifiquen las referencias de los dataframes
dataframe_measures = [apply_getrecommendations(df_similar, measure_df, measure).copy() for measure in MEASURES] 

# en esta estructura tengo:
# por cada fila: las diferentes metricas de similitud
# guardo cuales son los items mas similares a uno dado 
print(dataframe_measures)




[     item                               list_recommendations
0       1  [296.0, 380.0, 588.0, 110.0, 457.0, 318.0, 593...
1     515  [380, 296, 318, 457, 593, 356, 480, 110, 587, ...
2     516  [380, 296, 318, 457, 356, 593, 480, 377, 110, ...
3     519  [318, 296, 380, 457, 356, 593, 480, 50, 260, 5...
4     521  [1, 376, 434, 454, 485, 493, 494, 497, 508, 51...
5      10  [318, 380, 377, 480, 457, 356, 296, 593, 165, ...
6      11  [380, 318, 296, 457, 356, 593, 480, 260, 50, 5...
7     523  [20, 497, 330, 337, 350, 352, 353, 366, 370, 3...
8      14  [380, 318, 296, 457, 593, 356, 480, 587, 50, 1...
9     527  [318, 380, 457, 593, 356, 260, 587, 150, 592, ...
10     15  [380, 296, 356, 593, 480, 150, 110, 50, 377, 1...
11     16  [380, 593, 377, 150, 318, 457, 356, 296, 260, ...
12     17  [318, 296, 380, 356, 587, 597, 590, 480, 50, 5...
13     19  [296, 208, 380, 593, 356, 480, 377, 50, 110, 5...
14    532  [266, 596, 485, 104, 95, 497, 546, 62, 25, 22,...
15     21  [380, 356, 1

In [165]:
### EVALUATION ########################################################### 

In [166]:
print(predicted_df)

     user  item  rating  predicted
0      71    17     1.0   2.265318
1      14    19     1.0   2.776735
2      58    19     1.0   3.559457
3     604    19     1.0   2.909599
4     223    34     1.0   3.162531
5     191    39     1.0   3.867352
6     262    48     1.0   2.997984
7     603   110     1.0   3.829871
8     474   122     1.0   3.541002
9     431   158     1.0   4.003648
10    489   160     1.0   2.646791
11    428   165     1.0   1.867516
12    446   168     1.0   3.043357
13    297   170     1.0   3.500043
14    181   191     1.0   3.519124
15    584   193     1.0   3.473004
16    308   223     1.0   2.169618
17    485   231     1.0   3.672982
18    100   235     1.0   4.003491
19    294   255     1.0   2.265807
20    314   327     1.0   2.839751
21     58   344     1.0   2.908802
22    494   344     1.0   2.333342
23    584   344     1.0   3.872642
24     68   357     1.0   3.192768
25    536   357     1.0   3.318538
26     81   367     1.0   3.442622
27     91   374     

In [167]:
# devolvemos una lista de dataframes en la que se incluyen los items similares
# .copy()
dataframe_similar_items = [getSimilar(predicted_df, dataframe_measures, measure, df_users) for measure in MEASURES]
dataframe_similar_items

     user  item  rating  predicted
0      71    17     1.0   2.265318
1      14    19     1.0   2.776735
2      58    19     1.0   3.559457
3     604    19     1.0   2.909599
4     223    34     1.0   3.162531
5     191    39     1.0   3.867352
6     262    48     1.0   2.997984
7     603   110     1.0   3.829871
8     474   122     1.0   3.541002
9     431   158     1.0   4.003648
10    489   160     1.0   2.646791
11    428   165     1.0   1.867516
12    446   168     1.0   3.043357
13    297   170     1.0   3.500043
14    181   191     1.0   3.519124
15    584   193     1.0   3.473004
16    308   223     1.0   2.169618
17    485   231     1.0   3.672982
18    100   235     1.0   4.003491
19    294   255     1.0   2.265807
20    314   327     1.0   2.839751
21     58   344     1.0   2.908802
22    494   344     1.0   2.333342
23    584   344     1.0   3.872642
24     68   357     1.0   3.192768
25    536   357     1.0   3.318538
26     81   367     1.0   3.442622
27     91   374     

     user  item  rating  predicted  \
0      71    17     1.0   2.265318   
1      14    19     1.0   2.776735   
2      58    19     1.0   3.559457   
3     604    19     1.0   2.909599   
4     223    34     1.0   3.162531   
5     191    39     1.0   3.867352   
6     262    48     1.0   2.997984   
7     603   110     1.0   3.829871   
8     474   122     1.0   3.541002   
9     431   158     1.0   4.003648   
10    489   160     1.0   2.646791   
11    428   165     1.0   1.867516   
12    446   168     1.0   3.043357   
13    297   170     1.0   3.500043   
14    181   191     1.0   3.519124   
15    584   193     1.0   3.473004   
16    308   223     1.0   2.169618   
17    485   231     1.0   3.672982   
18    100   235     1.0   4.003491   
19    294   255     1.0   2.265807   
20    314   327     1.0   2.839751   
21     58   344     1.0   2.908802   
22    494   344     1.0   2.333342   
23    584   344     1.0   3.872642   
24     68   357     1.0   3.192768   
25    536   

     user  item  rating  predicted  \
0      71    17     1.0   2.265318   
1      14    19     1.0   2.776735   
2      58    19     1.0   3.559457   
3     604    19     1.0   2.909599   
4     223    34     1.0   3.162531   
5     191    39     1.0   3.867352   
6     262    48     1.0   2.997984   
7     603   110     1.0   3.829871   
8     474   122     1.0   3.541002   
9     431   158     1.0   4.003648   
10    489   160     1.0   2.646791   
11    428   165     1.0   1.867516   
12    446   168     1.0   3.043357   
13    297   170     1.0   3.500043   
14    181   191     1.0   3.519124   
15    584   193     1.0   3.473004   
16    308   223     1.0   2.169618   
17    485   231     1.0   3.672982   
18    100   235     1.0   4.003491   
19    294   255     1.0   2.265807   
20    314   327     1.0   2.839751   
21     58   344     1.0   2.908802   
22    494   344     1.0   2.333342   
23    584   344     1.0   3.872642   
24     68   357     1.0   3.192768   
25    536   

     user  item  rating  predicted  \
0      71    17     1.0   2.265318   
1      14    19     1.0   2.776735   
2      58    19     1.0   3.559457   
3     604    19     1.0   2.909599   
4     223    34     1.0   3.162531   
5     191    39     1.0   3.867352   
6     262    48     1.0   2.997984   
7     603   110     1.0   3.829871   
8     474   122     1.0   3.541002   
9     431   158     1.0   4.003648   
10    489   160     1.0   2.646791   
11    428   165     1.0   1.867516   
12    446   168     1.0   3.043357   
13    297   170     1.0   3.500043   
14    181   191     1.0   3.519124   
15    584   193     1.0   3.473004   
16    308   223     1.0   2.169618   
17    485   231     1.0   3.672982   
18    100   235     1.0   4.003491   
19    294   255     1.0   2.265807   
20    314   327     1.0   2.839751   
21     58   344     1.0   2.908802   
22    494   344     1.0   2.333342   
23    584   344     1.0   3.872642   
24     68   357     1.0   3.192768   
25    536   

[     user  item  rating  predicted sim_items_1 sim_items_2      sim_items_3  \
 0      71    17     1.0   2.265318       [260]  [260, 589]   [260, 589, 62]   
 1      14    19     1.0   2.776735       [296]  [296, 593]  [296, 593, 356]   
 2      58    19     1.0   3.559457       [296]  [296, 208]  [296, 208, 380]   
 3     604    19     1.0   2.909599       [296]  [296, 208]  [296, 208, 380]   
 4     223    34     1.0   3.162531       [296]  [296, 457]  [296, 457, 110]   
 5     191    39     1.0   3.867352       [318]  [318, 296]  [318, 296, 380]   
 6     262    48     1.0   2.997984       [380]  [380, 296]  [380, 296, 318]   
 7     603   110     1.0   3.829871       [296]  [296, 380]  [296, 380, 593]   
 8     474   122     1.0   3.541002       [337]  [337, 317]  [337, 317, 261]   
 9     431   158     1.0   4.003648       [163]  [163, 485]  [163, 485, 555]   
 10    489   160     1.0   2.646791       [380]  [380, 296]  [380, 296, 356]   
 11    428   165     1.0   1.867516     

In [168]:
df_similar_items_rating = [getPredRatings(dataframe_similar_items, measure, training_set) for measure in MEASURES]
df_similar_items_rating

[     user  item  rating  predicted sim_items_1 sim_items_2      sim_items_3  \
 0      71    17     1.0   2.265318       [260]  [260, 589]   [260, 589, 62]   
 1      14    19     1.0   2.776735       [296]  [296, 593]  [296, 593, 356]   
 2      58    19     1.0   3.559457       [296]  [296, 208]  [296, 208, 380]   
 3     604    19     1.0   2.909599       [296]  [296, 208]  [296, 208, 380]   
 4     223    34     1.0   3.162531       [296]  [296, 457]  [296, 457, 110]   
 5     191    39     1.0   3.867352       [318]  [318, 296]  [318, 296, 380]   
 6     262    48     1.0   2.997984       [380]  [380, 296]  [380, 296, 318]   
 7     603   110     1.0   3.829871       [296]  [296, 380]  [296, 380, 593]   
 8     474   122     1.0   3.541002       [337]  [337, 317]  [337, 317, 261]   
 9     431   158     1.0   4.003648       [163]  [163, 485]  [163, 485, 555]   
 10    489   160     1.0   2.646791       [380]  [380, 296]  [380, 296, 356]   
 11    428   165     1.0   1.867516     

In [169]:
df_diffs = [getDiffs(df_similar_items_rating, measure) for measure in MEASURES]
df_diffs

[     user  item  rating  predicted  avg_rating_1  avg_rating_2  avg_rating_3  \
 0      71    17     1.0   2.265318           3.0          3.00      3.333333   
 1      14    19     1.0   2.776735           3.0          3.50      3.666667   
 2      58    19     1.0   3.559457           5.0          4.50      4.000000   
 3     604    19     1.0   2.909599           5.0          4.00      3.666667   
 4     223    34     1.0   3.162531           3.0          3.00      3.166667   
 5     191    39     1.0   3.867352           4.0          4.50      4.000000   
 6     262    48     1.0   2.997984           3.0          3.00      3.000000   
 7     603   110     1.0   3.829871           5.0          3.50      4.000000   
 8     474   122     1.0   3.541002           4.0          3.00      3.333333   
 9     431   158     1.0   4.003648           3.0          2.75      3.500000   
 10    489   160     1.0   2.646791           3.5          3.75      3.833333   
 11    428   165     1.0   1

In [170]:
RMSE = [getRMSE(df_diffs, measure) for measure in MEASURES]
RMSE

[[1.1455100214614298,
  0.9126299910845912,
  0.8079986784903905,
  0.7741304101359513,
  0.736024996733246,
  0.7022440820014434,
  0.6689466341835859,
  0.6555598807939002,
  0.6415182046747796,
  0.6312181496587426],
 [1.079920643397708,
  0.8605995018826581,
  0.8026507775207822,
  0.7238359323981435,
  0.6894456030660177,
  0.6780299692980655,
  0.6498463794399397,
  0.6414014485370758,
  0.6329711737835705,
  0.6317422766080596],
 [1.0599004741232445,
  0.9053369044473057,
  0.8253904452333939,
  0.7849637050415298,
  0.7510689081663086,
  0.7265057696668582,
  0.7045060961537878,
  0.6897209555961005,
  0.6670372329943164,
  0.6433423695936646],
 [1.1230686211236875,
  0.8943818097045688,
  0.8001033667437871,
  0.7659707930723384,
  0.7416381932861988,
  0.7227827564131781,
  0.6905705200851433,
  0.6738134345880337,
  0.6684943143080673,
  0.6634385663614099],
 [1.1219914836460603,
  0.9364523715649365,
  0.8279220850051903,
  0.7884921355738784,
  0.758797977572407,
  0.73059

In [171]:
df_RMSE = pd.DataFrame(RMSE)
df_RMSE.columns= [1,2,3,4,5,6,7,8,9,10]
df_RMSE['sim_measure'] = MEASURES
df_RMSE

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,sim_measure
0,1.14551,0.91263,0.807999,0.77413,0.736025,0.702244,0.668947,0.65556,0.641518,0.631218,aa
1,1.079921,0.8606,0.802651,0.723836,0.689446,0.67803,0.649846,0.641401,0.632971,0.631742,cn
2,1.0599,0.905337,0.82539,0.784964,0.751069,0.726506,0.704506,0.689721,0.667037,0.643342,ew
3,1.123069,0.894382,0.800103,0.765971,0.741638,0.722783,0.690571,0.673813,0.668494,0.663439,jn
4,1.121991,0.936452,0.827922,0.788492,0.758798,0.730595,0.71186,0.688156,0.674539,0.654649,pa
5,1.027918,0.944305,0.894113,0.808197,0.776613,0.745556,0.722397,0.702418,0.682965,0.664361,waa
6,1.027918,0.944305,0.893774,0.80621,0.779181,0.745037,0.723923,0.704094,0.685255,0.663497,wcn
7,1.027918,0.952338,0.895984,0.811011,0.780805,0.747292,0.721538,0.704158,0.686918,0.66417,wpa


In [172]:
df_RMSE.to_csv('RMSE_item_graph_no_filtro4_estratificado.csv', index=False)