In [1]:
# LIBRARIES

import pandas as pd
import numpy as np
import itertools as it
from datetime import datetime
import networkx as nx
import math
# Documentacion de la libreria: http://networkx.readthedocs.io/en/networkx-1.11/

from operator import itemgetter
from itertools import groupby

In [2]:
# CONSTANTS
# Tunable parameters shared by the cells below.

#DATASET_SIZE = 100000
#HALF_DATASET_SIZE = int(DATASET_SIZE / 2)
# Minimum rating for the (currently commented-out) training-set filter.
RATING_THRESHOLD = 4
# Minimum number of shared users for two items to be linked in the graph.
WEIGHT_THRESHOLD = 5
# Largest number of similar items evaluated; the evaluation loops run k = 1..K.
K = 10
# Similarity measure names. The position of a name in this list is also the index
# used to pick the matching DataFrame out of the per-measure lists.
MEASURES = ['aa', 'cn', 'ew', 'jn', 'pa', 'waa', 'wcn', 'wpa']

In [3]:
# FUNCTIONS AND SUBPROGRAMS

In [4]:
def compareNodes(f_list, s_list):
    """
        Count the distinct users that appear in both user lists.

        f_list, s_list: sequences of user ids, one per item.
        Returns the size of their intersection (duplicates are counted once).
    """
    shared_users = np.intersect1d(f_list, s_list)
    return len(shared_users)
    
def createLinks(prob_us_set, nodos, threshold):
    """
        Build the weighted links of the graph.

        prob_us_set: mapping item -> list of users that interacted with the item.
        nodos: iterable with the items to pair up.
        threshold: a pair is kept only when its weight is greater than or equal to this value.

        The weight of a pair is the number of users shared by both items.
        Returns a list with the format [(Node1, Node2, weight), ......]
    """
    # Lazily score every unordered pair of items ...
    scored_pairs = (
        (fst, snd, compareNodes(prob_us_set[fst], prob_us_set[snd]))
        for fst, snd in it.combinations(nodos, 2)
    )

    # ... and keep only the pairs that reach the threshold.
    return [link for link in scored_pairs if link[2] >= threshold]

In [5]:
def create_graph_nx(list_nodes, list_links):
    """
        Build an undirected NetworkX Graph from nodes and weighted links.

        list_nodes: iterable of node identifiers.
        list_links: list with the format [(Node1, Node2, weight), ......]

        Graph construction reference:
        http://networkx.readthedocs.io/en/networkx-1.11/tutorial/tutorial.html#what-to-use-as-nodes-and-edges
    """
    item_graph = nx.Graph()

    # Nodes are added first so that items without any link are still part of the graph.
    item_graph.add_nodes_from(list_nodes)

    # Each tuple becomes an edge whose 'weight' attribute is the third element.
    item_graph.add_weighted_edges_from(list_links)

    return item_graph

In [6]:
def apply_aa(row, graph):
    """
        Adamic-Adar index of the node pair (row['one'], row['two']).

        As described by the original note: the sum of 1/log(N(z)) over every common
        neighbour z of the pair, N(z) being the degree of z.
        Returns 0 if NetworkX yields no result for the pair.
    """
    pair = (row['one'], row['two'])

    # NetworkX yields (u, v, score) tuples; a single pair is requested, so at most one comes back.
    scores = [score for _, _, score in nx.adamic_adar_index(graph, [pair])]

    return scores[-1] if scores else 0

In [7]:
def apply_cn(row, graph):
    """
        Number of neighbours shared by the nodes row['one'] and row['two'].
    """
    shared = nx.common_neighbors(graph, row['one'], row['two'])
    return sum(1 for _ in shared)

In [8]:
def apply_ew(row, graph):
    """
        Weight of the direct edge between row['one'] and row['two'].

        row: mapping / Series with the keys 'one' and 'two'.
        graph: object exposing get_edge_data(u, v), which returns the edge attribute
               dict or None when the edge does not exist.
        Returns 0 when the two nodes are not linked, otherwise the edge's 'weight'.
    """
    edge_data = graph.get_edge_data(row['one'], row['two'])

    # A missing edge is reported as None; identity comparison is the idiomatic check.
    if edge_data is None:
        return 0

    return edge_data['weight']

In [9]:
def apply_jn(row, graph):
    """
        Jaccard coefficient of the node pair (row['one'], row['two']), as computed
        by nx.jaccard_coefficient. Returns 0 if NetworkX yields no result.
    """
    result = 0

    # Only one pair is requested, so the loop body runs at most once.
    for _, _, coefficient in nx.jaccard_coefficient(graph, [(row['one'], row['two'])]):
        result = coefficient

    return result

In [10]:
def apply_pa(row, graph):
    """
        Preferential attachment score of the node pair (row['one'], row['two']),
        as computed by nx.preferential_attachment. Returns 0 if NetworkX yields no result.
    """
    requested = [(row['one'], row['two'])]
    scores = [score for _, _, score in nx.preferential_attachment(graph, requested)]

    if not scores:
        return 0
    return scores[-1]


In [11]:
def apply_waa(row, graph):
    """
        Weighted Adamic-Adar value of the node pair (row['one'], row['two']).

        For every common neighbour z, the weights of the edges one-z and two-z are added
        and divided by log10(1 + weighted degree of z); the per-neighbour terms are summed.
    """
    first, second = row['one'], row['two']

    # Common neighbours of both items
    shared = nx.common_neighbors(graph, first, second)

    return sum(
        (graph[first][z]['weight'] + graph[second][z]['weight'])
        / math.log(1 + graph.degree(z, weight="weight"), 10)
        for z in shared
    )

In [12]:
def apply_wcn(row, graph):
    """
        Weighted common neighbours of the node pair (row['one'], row['two']).

        For every common neighbour z, the weights of the edges one-z and two-z are added;
        the result is the sum of those values over all common neighbours.
    """
    first, second = row['one'], row['two']

    total = 0
    for z in nx.common_neighbors(graph, first, second):
        total += graph[first][z]['weight'] + graph[second][z]['weight']

    return total

In [13]:
def apply_wpa(row, graph):
    """
        Weighted preferential attachment of the node pair (row['one'], row['two']):
        the product of the weighted degrees of both nodes.
    """
    strength_one = graph.degree(row["one"], weight="weight")
    strength_two = graph.degree(row["two"], weight="weight")

    return strength_one * strength_two

In [14]:
def create_measure_data(graph):
    """
        Build a DataFrame with every unordered pair of nodes of the graph.

        graph: object exposing nodes(), e.g. a NetworkX Graph.
        Returns a DataFrame with the columns 'one' and 'two', one row per pair. The
        similarity columns are added to this frame afterwards, one per measure.

        The nodes are taken from the graph that is passed in rather than from a
        module-level variable, so the function works for any graph.
    """
    node_pairs = list(it.combinations(list(graph.nodes()), 2))

    pairs = {'one': [fst for fst, _ in node_pairs],
             'two': [snd for _, snd in node_pairs]}

    return pd.DataFrame(pairs)

In [15]:
def apply_measure(dataFrame_measure, measure):
    """
        Add to the pairs DataFrame a column with the similarity value of each pair.

        dataFrame_measure: DataFrame with the columns 'one' and 'two'; it is modified in place.
        measure: name of the measure; the new column carries the same name.
                 A name that is not listed below leaves the frame untouched.

        The graph is read from the module-level variable `graph`.
        Returns the same DataFrame object that was passed in.
    """
    scorers = {
        'aa': apply_aa,
        'cn': apply_cn,
        'ew': apply_ew,
        'jn': apply_jn,
        'pa': apply_pa,
        'waa': apply_waa,
        'wcn': apply_wcn,
        'wpa': apply_wpa,
    }

    scorer = scorers.get(measure)
    if scorer is not None:
        # Apply the scoring function to every row
        dataFrame_measure[measure] = dataFrame_measure.apply(lambda row: scorer(row, graph), axis=1)

    return dataFrame_measure

In [16]:
def getrecommendations(row, measure_df, measure):
    """
        List the items paired with row['item'], ordered from most to least similar.

        row: mapping / Series with the key 'item'.
        measure_df: pairs DataFrame with the columns 'one', 'two' and `measure`.
        measure: name of the similarity column used for the ordering.

        Returns the complete ordered list of partner items; it is not truncated here.
    """
    target = row['item']

    # Pairs where the target item is in the first / second position
    as_first = measure_df[measure_df['one'] == target]
    as_second = measure_df[measure_df['two'] == target]

    # Mirror the second group so that the target item is always in column 'one'
    mirrored = pd.DataFrame({'one': as_second['two'].tolist(),
                             'two': as_second['one'].tolist(),
                             measure: as_second[measure].tolist()})

    # Join both groups and rank them by the similarity value, best first
    ranked = pd.concat([as_first, mirrored], sort=True).sort_values(measure, ascending=False)

    return ranked['two'].tolist()

In [17]:
def apply_getrecommendations(df_new, measure_df, measure):
    """
    Attach to every row the ordered list of items most similar to the row's 'item'.

    The 'list_recommendations' column is written on df_new itself (overwriting any
    previous value); the function returns an independent copy of the frame.
    """
    def _ranked_items(row):
        return getrecommendations(row, measure_df, measure)

    df_new['list_recommendations'] = df_new.apply(_ranked_items, axis=1)

    return df_new.copy()

In [18]:
def intersection(l1, l2):
    """
    Return the elements of l1 that also appear in l2, keeping the order (and the
    duplicates) of l1.

    Membership is tested against a set, so the cost is linear instead of
    len(l1) * len(l2). If any element is unhashable the function falls back to
    scanning l2 directly.
    """
    try:
        lookup = set(l2)
        return [e for e in l1 if e in lookup]
    except TypeError:
        # Unhashable elements (e.g. lists): use plain sequence membership.
        return [e for e in l1 if e in l2]

In [19]:
def getSimilarItems(item, user, df_measure, df_users):
    """
    Items similar to `item` that `user` has interacted with, most similar first.

    item: item identifier looked up in df_measure['item'].
    user: user identifier looked up in df_users.
    df_measure: DataFrame with the columns 'item' and 'list_recommendations'.
    df_users: mapping user -> list of items the user interacted with.

    Returns an empty list when the item or the user is unknown.
    """
    # Ordered list of the items most similar to `item` (first match only)
    matches = df_measure[df_measure['item'] == item]['list_recommendations'].values
    sim_items_list = matches[0] if len(matches) != 0 else list()

    # Items the user has interacted with
    list_items_user = df_users[user] if user in df_users else list()

    # Keep only the similar items the user knows; the similarity order is preserved
    return intersection(sim_items_list, list_items_user)

In [20]:
def getKsim(sim_items, k):
    """
        Return the first k elements of the list of similar items.
        An empty input always gives a new empty list.
    """
    return sim_items[:k] if len(sim_items) != 0 else []

In [21]:
def getKSize(max_list):
    """
        Number of elements in the given list.
    """
    size = len(max_list)
    return size

In [22]:
def getSimilar(predicted_df, dataframe_measures, measure, df_users):
    """
         Compute, for every evaluation row, the items most similar to the row's item
         according to `measure`, restricted to the items the row's user interacted
         with in the training set.

         predicted_df: DataFrame with (at least) the columns 'user' and 'item'.
                       It is NOT modified; the work is done on a copy.
         dataframe_measures: list with one recommendations DataFrame per measure,
                             in MEASURES order.
         measure: name of the similarity measure (must be in MEASURES).
         df_users: mapping user -> list of items the user interacted with.

         Returns a new DataFrame with the original columns plus 'sim_items_1' ..
         'sim_items_K' (the k most similar items) and 'list_size' (length of the
         'sim_items_K' list).
    """
    recommendations = dataframe_measures[MEASURES.index(measure)]

    # Work on a copy so that the caller's frame is not polluted with helper columns
    result = predicted_df.copy()

    result['sim_items'] = result.apply(
        lambda row: getSimilarItems(int(row['item']), row['user'], recommendations, df_users), axis=1)

    # One column per k = 1..K with the k most similar items
    for k in range(1, K + 1):
        name = 'sim_items_' + str(k)
        result[name] = result.apply(lambda row: getKsim(row['sim_items'], k), axis=1)

    # The longest list is the one for k = K; its name follows K instead of being hard-coded
    largest = 'sim_items_' + str(K)
    result['list_size'] = result.apply(lambda row: getKSize(row[largest]), axis=1)

    return result.drop(['sim_items'], axis=1)
    

In [23]:
def avgPredRating(user, sim_items, df_measure):
    """
        Average rating given by `user` to the items in `sim_items`.

        user: user identifier.
        sim_items: list of item identifiers.
        df_measure: ratings DataFrame with the columns 'user', 'item' and 'rating'.

        An item without a rating from the user counts as 0; when the same user-item
        pair appears more than once the first row is used.
        Returns 0 when sim_items is empty.
    """
    if len(sim_items) == 0:
        return 0

    # The user's rows do not depend on the item, so they are selected only once
    user_rows = df_measure[df_measure['user'] == user]

    ratings = list()
    for elem in sim_items:
        item_rows = user_rows[user_rows['item'] == elem]
        ratings.append(0 if item_rows.empty else item_rows['rating'].values[0])

    return sum(ratings) / len(ratings)

In [24]:
def getPredRatings(dataframe_similar_items, measure, training_set):
    """
        Add, for k = 1..K, the average training rating of the k most similar items.

        dataframe_similar_items: list with one DataFrame per measure, in MEASURES order;
                                 each one has the columns 'user' and 'sim_items_k'.
        measure: name of the similarity measure (must be in MEASURES).
        training_set: ratings DataFrame handed to avgPredRating.

        The 'avg_rating_k' columns are written on the selected DataFrame itself,
        which is also the value returned.
    """
    df_measure = dataframe_similar_items[MEASURES.index(measure)]

    for k in range(1, K + 1):
        suffix = str(k)
        name_sim = 'sim_items_' + suffix

        # Average of the ratings the user gave in the training set to the similar items
        df_measure['avg_rating_' + suffix] = df_measure.apply(
            lambda row: avgPredRating(int(row['user']), row[name_sim], training_set), axis=1)

    return df_measure

In [25]:
def getDifferenceRow(predicted, avg):
    """
        Squared difference between the predicted value and the average.
    """
    distance = abs(predicted - avg)
    return distance ** 2

In [26]:
def getDiffs(dataframe_similar_items, measure):
    """
        Add, for k = 1..K, the squared difference between the predicted rating of the
        evaluation item and the average rating of its k most similar items.

        dataframe_similar_items: list with one DataFrame per measure, in MEASURES order;
                                 each one has the columns 'predicted', 'avg_rating_k'
                                 and 'sim_items_k'.
        measure: name of the similarity measure (must be in MEASURES).

        The selected DataFrame is modified in place (the 'sim_items_k' columns are
        removed and the 'diff_k' columns are added) and is also the value returned.
    """
    df_measure = dataframe_similar_items[MEASURES.index(measure)]

    for k in range(1, K + 1):
        suffix = str(k)
        name_avg = 'avg_rating_' + suffix

        # The list of similar items is no longer needed once its average is known
        df_measure.drop(['sim_items_' + suffix], axis=1, inplace=True)

        # Squared error between the prediction and the average of the similar items
        df_measure['diff_' + suffix] = df_measure.apply(
            lambda row: getDifferenceRow(row['predicted'], row[name_avg]), axis=1)

    return df_measure

In [27]:
def mean_list(l):
    """
        Arithmetic mean of the values of the list, as a float division.
    """
    total = sum(l)
    count = float(len(l))
    return total / count

In [28]:
def getRMSE(df_diffs, measure):
    """
        RMSE of the selected measure for k = 1..K.

        df_diffs: list with one DataFrame per measure, in MEASURES order; each one has
                  the columns 'list_size' and 'diff_1' .. 'diff_K' (squared errors).
        measure: name of the similarity measure (must be in MEASURES).

        For every k only the rows whose 'list_size' is at least k are used.
        Returns a list with K values; a position is NaN when no row qualifies for
        that k (instead of failing with a division by zero).
    """
    df_diff_measure = df_diffs[MEASURES.index(measure)]
    lista_sizes = df_diff_measure['list_size'].tolist()

    rmse_avg = list()

    for k in range(1, K + 1):
        lista_diffs = df_diff_measure['diff_' + str(k)].tolist()

        # Rows with fewer than k similar items do not take part in the average
        usable = [diff for size, diff in zip(lista_sizes, lista_diffs) if size >= k]

        if usable:
            rmse_avg.append(math.sqrt(mean_list(usable)))
        else:
            rmse_avg.append(float('nan'))

    return rmse_avg

In [29]:
def make_mask(total_elems, num_elems):
    """
    Build a random boolean mask of length total_elems with num_elems True values.

    total_elems: length of the mask.
    num_elems: number of positions to select.

    When num_elems is greater than or equal to total_elems every position is
    selected. The mask always has total_elems entries, so it can be used to index
    a sequence of that length.
    """
    # Drawn before the branch, as in the original, so that the number of random
    # values consumed per call does not change.
    random_values = np.random.rand(total_elems)

    if num_elems >= total_elems:
        return np.full(total_elems, True)

    # The value at sorted position num_elems has exactly num_elems values below it
    border = np.sort(random_values)[num_elems]
    return random_values < border

In [30]:
# Randomly select a stratified test sample
def get_estratificate_populate(dataframe, measure, num_elems=35):
    """
    Draw a random sample that is stratified by rating.

    dataframe: DataFrame with a 'rating' column.
    measure: not used by this function; kept for interface compatibility.
    num_elems: number of rows drawn for each rating value.

    Rating values with fewer than num_elems rows are left out of the sample.
    Returns a new DataFrame with the selected rows, grouped by ascending rating;
    it is empty (same columns) when no rating value has enough rows.
    """
    sampled = list()

    for r in sorted(set(dataframe['rating'])):
        rating_df = dataframe[dataframe['rating'] == r]

        # Only ratings represented by at least num_elems rows are sampled
        if len(rating_df) >= num_elems:
            mask = make_mask(len(rating_df), num_elems)
            sampled.append(rating_df[mask])

    if not sampled:
        return pd.DataFrame(columns=dataframe.columns)

    # A single concat replaces the repeated DataFrame.append calls
    return pd.concat(sampled)

In [31]:
def get_estratificate_populate_100(df_diffs, measure, num_elems=35, n_rounds=100):
    """
    Repeat the random stratified sampling several times and stack the samples.

    df_diffs: list with one DataFrame per measure, in MEASURES order.
    measure: name of the similarity measure (must be in MEASURES).
    num_elems: number of rows drawn for each rating value in every round.
    n_rounds: number of sampling rounds (100 by default).

    The frame to sample is taken from the df_diffs argument, and num_elems is
    forwarded to every round.
    Returns a new DataFrame with the rows of all the rounds, one round after another.
    """
    dataframe = df_diffs[MEASURES.index(measure)]

    rounds = [get_estratificate_populate(dataframe, measure, num_elems=num_elems)
              for _ in range(n_rounds)]

    if not rounds:
        return pd.DataFrame(columns=dataframe.columns)

    # A single concat replaces the repeated DataFrame.append calls
    return pd.concat(rounds)

In [32]:
############ MAIN

In [33]:
# The training and evaluation sets are read from separate, already prepared CSV files;
# no splitting is done in this notebook.

# Load the training ratings
training_set = pd.read_csv('trainset.csv')

# Optional filter, currently disabled: keep only the ratings >= RATING_THRESHOLD
#training_set = training_set[training_set['rating'] >= RATING_THRESHOLD]

# Columns are renamed by position; assumes the file has exactly these four columns in this order -- TODO confirm
training_set.columns = ['user', 'item', 'rating', 'timestamp']

#print(training_set)
#training_set[training_set['user'] == 545]

In [34]:
# Load the evaluation set
#evaluation_set = df[HALF_DATASET_SIZE:]

# evaluation_set = evaluation_set[evaluation_set['rating'] != 0.5]

# The non-stratified file is kept here as a disabled alternative
# evaluation_set = pd.read_csv('testset.csv')
evaluation_set = pd.read_csv('testset_stratified.csv')

# Columns are renamed by position; assumes the file has exactly these four columns in this order -- TODO confirm
evaluation_set.columns = ['user', 'item', 'rating', 'timestamp']

# print(evaluation_set)

In [36]:
# Load the predicted ratings.
# Original note: fixed stratified sample with 35 rows for each rating -- not verifiable from this notebook.
# predicted_df_original = pd.read_csv('predicted_values_clean.csv', sep=';')
predicted_df_original = pd.read_csv('predicted_values.csv')

# Columns are renamed by position; assumes the file has exactly these four columns in this order -- TODO confirm
predicted_df_original.columns = ['user', 'item', 'rating', 'predicted']

# predicted_df is built by the merge with the evaluation set, so no intermediate copy is made here.


In [37]:
# Keep only the evaluation rows that have a prediction. The 'rating' column exists on
# both sides, so the merge suffixes it; the copy coming from the predictions file
# ('rating_y') is discarded.
predicted_df = (
    pd.merge(evaluation_set, predicted_df_original, how='inner', on=['user', 'item'])
    .drop(columns=['rating_y'])
)

# Restore the plain column names (by position)
predicted_df.columns = ['user', 'item', 'rating', 'timestamp', 'predicted']

In [38]:
# Graph nodes: the distinct item ids that appear in the training set
nodes = training_set.item.unique()

# print(len(nodes))
# print(nodes)

In [39]:
# Series indexed by item: each value is the list of users that interacted with that item
grouped = training_set.groupby('item')['user'].apply(list)

#print(grouped)

In [40]:
# Weighted links as (node1, node2, weight) tuples, the format used to build the NetworkX graph.
# Pairs of items that share fewer than WEIGHT_THRESHOLD users are left out.
links = createLinks(grouped, nodes, WEIGHT_THRESHOLD)

# print(len(links))



In [41]:
# Build the item graph. This module-level `graph` is also the one read by apply_measure.
graph = create_graph_nx(nodes, links)

In [42]:
# Series indexed by user: each value is the list of items the user interacted with
grouped_user = training_set.groupby('user')['item'].apply(list)

# Dictionary whose keys are the users and whose values are the items each user interacted with
df_users = dict(zip(grouped_user.index.tolist(), grouped_user.values.tolist()))

# Left empty; it was only filled by a filter that is disabled
df_users_l = {}

In [44]:
# One row per distinct training item; the list of recommendations of every item is
# attached to this frame later, once per similarity measure.
d = {'item': list(set(training_set['item'].tolist()))}
df_similar = pd.DataFrame(data=d)

In [45]:
# DataFrame that keeps the similarity values of every pair of items

# Start with one row per pair of items (columns 'one' and 'two') ...
measure_df = create_measure_data(graph)

# ... and add the 'cn' column: number of common neighbours of each pair
measure_df = apply_measure(measure_df, 'cn') 

#print(measure_df)


In [46]:
# Add the 'ew' column: weight of the direct edge of each pair (0 when the items are not linked)
measure_df = apply_measure(measure_df, 'ew') 

In [47]:
# Add the 'aa' column: Adamic-Adar index of each pair
measure_df = apply_measure(measure_df, 'aa')

In [48]:
# Add the 'jn' column: Jaccard coefficient of each pair
measure_df = apply_measure(measure_df, 'jn')

In [49]:
# Add the 'pa' column: preferential attachment score of each pair
measure_df = apply_measure(measure_df, 'pa')

In [50]:
# Add the 'wcn' column: sum, over the common neighbours, of the weights of the edges that join them to both items
measure_df = apply_measure(measure_df, 'wcn')

In [51]:
# Add the 'waa' column: like 'wcn', but each common neighbour's term is divided by log10(1 + its weighted degree)
measure_df = apply_measure(measure_df, 'waa')

In [52]:
# Add the 'wpa' column: product of the weighted degrees of both items
measure_df = apply_measure(measure_df, 'wpa')

In [53]:
# List with one DataFrame per similarity measure, in the order of MEASURES
# (position i belongs to MEASURES[i]).
# Each DataFrame has one row per item and, in 'list_recommendations', the other
# items ordered from most to least similar according to that measure.
# A copy is stored so that the entries do not share the df_similar reference.
dataframe_measures = [
    apply_getrecommendations(df_similar, measure_df, measure).copy()
    for measure in MEASURES
]




In [54]:
### EVALUATION ########################################################### 

In [55]:
# List with one DataFrame per measure (MEASURES order): the evaluation rows plus the
# columns 'sim_items_1' .. 'sim_items_K' and 'list_size'
# .copy()
dataframe_similar_items = [getSimilar(predicted_df, dataframe_measures, measure, df_users) for measure in MEASURES]
# dataframe_similar_items

In [56]:
# Add the 'avg_rating_k' columns. getPredRatings writes them on the frames held in
# dataframe_similar_items and returns those same objects.
df_similar_items_rating = [getPredRatings(dataframe_similar_items, measure, training_set) for measure in MEASURES]
# df_similar_items_rating

In [57]:
# Replace the 'sim_items_k' columns with the squared errors 'diff_k'. getDiffs modifies
# the frames in place, so running this cell twice fails (the columns are already gone).
df_diffs = [getDiffs(df_similar_items_rating, measure) for measure in MEASURES]
# df_diffs

In [58]:
# Random stratified sampling: 100 rounds with 35 rows per rating value, for every measure.
# NOTE: this rebinds df_diffs, so the frames from before the sampling are no longer
# reachable under that name, and the result changes on every run (no random seed is set).
df_diffs = [get_estratificate_populate_100(df_diffs, measure, num_elems=35) for measure in MEASURES]
# df_diffs

In [59]:
# List of lists: one entry per measure (MEASURES order), each with the RMSE for k = 1..K
RMSE = [getRMSE(df_diffs, measure) for measure in MEASURES]
# RMSE

In [60]:
# Results table: one row per similarity measure, one column per k
df_RMSE = pd.DataFrame(RMSE)
# The column labels follow K, so the table stays valid if K is changed
df_RMSE.columns = list(range(1, K + 1))
df_RMSE['sim_measure'] = MEASURES
# df_RMSE

In [61]:
#df_RMSE.to_csv('RMSE_item_graph_estratificado_p.csv', index=False)