In [1]:
# LIBRARIES

import pandas as pd
import numpy as np
import itertools as it
from datetime import datetime
import networkx as nx
import math
# Documentacion de la libreria: http://networkx.readthedocs.io/en/networkx-1.11/

from operator import itemgetter
from itertools import groupby

import plotly.graph_objects as go

In [2]:
# CONSTANTS

# Number of rating rows in the dataset used by this notebook
# DATASET_SIZE = 100000
DATASET_SIZE =  15954 # 200 most popular movies && users who have interacted with 50% of them
# HALF_DATASET_SIZE = int(90*DATASET_SIZE / 100)
# HALF_DATASET_SIZE = int(75*DATASET_SIZE / 100)
# SECOND_HALF_DATASET = int(DATASET_SIZE - HALF_DATASET_SIZE)
# Minimum rating kept when the training/evaluation split is built
# (only referenced by the commented-out split cells of this notebook)
RATING_THRESHOLD = 4
# Minimum number of shared users for two items to be linked (threshold passed to createLinks)
WEIGHT_THRESHOLD = 5
# presumably the size of the recommendation list (top-K); not used in the visible cells -- TODO confirm
K = 10
# range_K_10 = range(K)
# presumably abbreviations of the similarity measures to evaluate; not used in the visible cells -- TODO confirm
MEASURES = ['aa', 'cn', 'ew', 'jn', 'pa', 'waa', 'wcn', 'wpa']
# presumably the number of users to evaluate; not used in the visible cells -- TODO confirm
USERS_EVAL = 25

In [3]:
# FUNCTIONS AND SUBPROGRAMS

In [4]:
def compareNodes(f_list, s_list):
    """
        Count the users that have interacted with both items.

        f_list -- users that have interacted with the first item
        s_list -- users that have interacted with the second item

        Returns the number of distinct users present in both lists (a user
        repeated inside a list is counted only once).
    """
    shared_users = np.intersect1d(f_list, s_list)

    return shared_users.size
    
def createLinks(prob_us_set, nodos, threshold):
    """
        Build the weighted links of the item graph.

        Every unordered pair of nodes is weighted with the number of distinct
        users shared by both items; a pair becomes a link only when that weight
        is greater than or equal to threshold.

        prob_us_set -- mapping indexed by node (dict, pandas Series, ...) that
                       gives the users who have interacted with that item
        nodos       -- iterable with the nodes (items) to combine
        threshold   -- minimum weight a pair needs in order to be kept

        Returns a list of links with the format [(Node1, Node2, weight), ......],
        in the order produced by itertools.combinations.
    """
    nodos = list(nodos)
    if len(nodos) < 2:
        # fewer than two nodes means there are no pairs, so no user list is read
        return []

    # de-duplicate and sort every user list once, instead of once per pair
    unique_users = {nodo: np.unique(prob_us_set[nodo]) for nodo in nodos}

    resultado = list()
    for fst, snd in it.combinations(nodos, 2):
        # the arrays are already unique, so intersect1d can skip that step
        shared = np.intersect1d(unique_users[fst], unique_users[snd],
                                assume_unique=True)
        peso = len(shared)
        if peso >= threshold:
            resultado.append((fst, snd, peso))

    return resultado

In [5]:
def create_graph_nx(list_nodes, list_links):
    """
        Build a NetworkX Graph from a list of nodes and a list of weighted links.

        list_nodes -- iterable with the nodes of the graph
        list_links -- weighted links with the format [(Node1, Node2, weight), ......]

        Returns the nx.Graph that holds those nodes and links.
        Graph construction: http://networkx.readthedocs.io/en/networkx-1.11/tutorial/tutorial.html#what-to-use-as-nodes-and-edges
    """
    nx_graph = nx.Graph()

    # the nodes are added explicitly, so nodes without any link are part of the graph too
    nx_graph.add_nodes_from(list_nodes)
    # every (Node1, Node2, weight) tuple is added as a weighted edge
    nx_graph.add_weighted_edges_from(list_links)

    return nx_graph

In [6]:
def lenProblemsDone(row, set_filter):
    """
        Helper that counts how many items a user has interacted with in one set
        (training or evaluation).

        row        -- record of the user; only row['user_id'] is read
        set_filter -- dataframe with the columns 'user_id' and 'list_item_id'

        Returns the length of the item list of the first row of set_filter that
        belongs to the user, or 0 when the user does not appear in set_filter.
    """
    # rows of the set that belong to this user
    is_user = set_filter['user_id'] == row['user_id']
    user_rows = set_filter[is_user]

    # no rows: the user has not interacted with any item in this set
    return 0 if user_rows.empty else len(user_rows['list_item_id'].iloc[0])
    

In [7]:
def write_results_file(dir, result, k, measure):
    """
        Append one CSV line with the results of an evaluation to a file.

        dir     -- path of the results file (it is opened in append mode)
        result  -- mapping with the keys 'one_hit', 'precision', 'mrr', 'recall' and 'f1'
        k       -- value written in the first column
        measure -- string written in the second column

        Line format: k,measure,one_hit,precision,mrr,recall,f1
        Raises KeyError when result lacks one of the keys; the line is built
        before the file is opened, so the file is not touched in that case.
    """
    fields = [str(k), measure]
    fields.extend(str(result[key]) for key in ('one_hit', 'precision', 'mrr', 'recall', 'f1'))
    line = ','.join(fields) + '\n'

    # the context manager closes the file even if the write fails
    with open(dir, 'a') as f:
        f.write(line)

In [8]:
############ MAIN

In [9]:
# df contains the MovieLens ratings read from 'most_pop_200_users.csv'
# (one row per rating, with the columns user, item, rating and timestamp)
df = pd.read_csv('most_pop_200_users.csv')


print(df)

       user  item  rating  timestamp
0       305   451       3  886324817
1        62   257       2  879372434
2       194   274       2  879539794
3       299   144       4  877881320
4       308     1       4  887736532
...     ...   ...     ...        ...
15949   864   685       4  888891900
15950   279    64       1  875308510
15951   660   229       2  891406212
15952   880   476       3  880175444
15953   716   204       5  879795543

[15954 rows x 4 columns]


In [10]:
# df is re-bound to the MovieLens dataset ordered by ascending timestamp
df = df.sort_values(by='timestamp')

print(df)

# sanity check: number of distinct items and of distinct users in the dataset
num_items = len(df['item'].unique())
num_users = len(df['user'].unique())

print(num_items)
print(num_users)

       user  item  rating  timestamp
3590    276   258       5  874786337
15845   276   300       4  874786338
7398    276   328       4  874786366
145     276   294       4  874786366
2449    276   288       4  874786392
...     ...   ...     ...        ...
9791    796   393       4  893218933
9546    796   419       5  893219001
15928   407     7       4  893253637
6835    653   272       4  893275949
12346   653   245       4  893276091

[15954 rows x 4 columns]
200
129


In [11]:
# # I want to split the dataset into training set and evaluation set
# # I am going to use the first half of ratings to build the training set and the second one to build the evaluation set

# # I build the training set
# training_set = df[:HALF_DATASET_SIZE]

# print(len(training_set))

# # I delete the ratings with values < 4 (movies rated below 4 are not considered interesting to users)
# training_set = training_set[training_set['rating'] >= RATING_THRESHOLD]

# print(training_set)

# #training_set.to_csv("training.csv", index=False)

In [12]:
# # I build the evaluation_set
# evaluation_set = df[HALF_DATASET_SIZE:]

# print(evaluation_set)

# # I delete the ratings with values < 4 (movies rated below 4 are not considered interesting to users)
# evaluation_set = evaluation_set[evaluation_set['rating'] >= RATING_THRESHOLD]

# print(evaluation_set)

# #evaluation_set.to_csv("evaluation.csv", index=False)

In [13]:
# I load the training set and the evaluation set from their CSV files
# (the cells that would build them from df are commented out above)
training_set = pd.read_csv('training_most_pop.csv')
evaluation_set = pd.read_csv('evaluation_most_pop.csv')

In [14]:
# I get the list of nodes: the distinct items of the training set
nodes = training_set['item'].unique()

print(len(nodes))
print(nodes)

200
[258 300 328 294 288 508 298 181   7 273 628   1 117 151 127 276 100 284
 471 257 410 147  25 240 235 237 742 475 685 250 411 845 125 121 150 180
 175  12 135  64  28  23 523  22 318  11 357 182 197 227 143 451  70 732
 423 193  69 179 187 215  58 655   8 168   4  56 209 153 204 154 202 433
 692 173 186 210 393 174  89  24 144 176  82 183 172  96  79 684 195 265
 226 188 385 566  98 218 185 164 200 443  95 432  71 588 419  99 603  31
 479 326 286 289 321 322 748 678  50 222 255 111 597 118 275 515 124 283
 137 196  14 248 301 511 134 238 496 483 514 132 211 230 403 498 228 208
 327 474  15  13 199 161 191 194  97 203 216 234 427 582 480 527 133 282
 431  66  88 259 323 285   9  83 435 660 367 302 268 333 568 651 402 591
 546 269 405 229 319 252 472 739 274 476 690  77 245 307 271 340 751 313
 272 315]


In [15]:
# I build a mapping indexed by item (the result of groupby/apply, not a plain dict):
# each value is the list of users that have interacted with that item in the training set
grouped = training_set.groupby('item')['user'].apply(list)

#print(grouped)

In [16]:
# I create the links with the suitable format for nx, [(Node1, Node2, weight), ......],
# keeping only the pairs of items that share at least WEIGHT_THRESHOLD users
links = createLinks(grouped, nodes, WEIGHT_THRESHOLD)

print(len(links))



18125


In [17]:
# I create the graph: the items are the nodes and the weighted links are the edges
graph = create_graph_nx(nodes, links)

In [18]:
# I am going to define the target users. The intended list of target users is the users that have interacted with
# at least 5 items in the training set and in the evaluation set.
# NOTE(review): the filtering cells further down are commented out; the list is actually taken from evaluation_set.user.unique()

# The creation of the list...

# I build, from the training set, a mapping with the users as keys and the items
# each one has interacted with as values
grouped_user = training_set.groupby('user')['item'].apply(list)

# I convert the series into a dataframe
df_users = pd.DataFrame({'user_id':grouped_user.index, 'list_item_id':grouped_user.values})

print(df_users)

     user_id                                       list_item_id
0          1  [168, 172, 196, 187, 250, 14, 127, 181, 1, 50,...
1          6  [269, 302, 182, 275, 127, 100, 9, 124, 14, 515...
2          7  [300, 288, 286, 307, 64, 174, 603, 187, 180, 3...
3         13  [268, 302, 12, 56, 98, 514, 474, 168, 197, 433...
4         18  [269, 286, 319, 427, 275, 357, 56, 98, 216, 48...
..       ...                                                ...
124      892  [300, 357, 318, 100, 483, 393, 195, 523, 172, ...
125      896  [258, 182, 187, 234, 22, 273, 204, 496, 153, 9...
126      897  [288, 323, 22, 82, 98, 204, 69, 419, 96, 199, ...
127      916  [286, 268, 100, 150, 475, 1, 7, 250, 9, 14, 18...
128      919  [300, 288, 258, 327, 259, 323, 275, 100, 508, ...

[129 rows x 2 columns]


In [19]:
# I build a dataframe with the users in the evaluation set as key and the items they have interacted with as values

# I build, from the evaluation set, a mapping with the users as keys and the items
# each one has interacted with as values
grouped_user_eval = evaluation_set.groupby('user')['item'].apply(list)

# I convert the series into a dataframe
df_users_eval = pd.DataFrame({'user_id':grouped_user_eval.index, 'list_item_id':grouped_user_eval.values})

print(df_users_eval)

    user_id                                       list_item_id
0        18    [13, 97, 432, 186, 318, 238, 423, 137, 132, 64]
1        43  [204, 405, 402, 403, 393, 367, 313, 302, 269, ...
2       144    [196, 393, 127, 153, 180, 4, 173, 66, 197, 480]
3       184   [588, 451, 66, 210, 71, 132, 498, 275, 283, 357]
4       222  [182, 357, 127, 185, 431, 588, 433, 685, 313, ...
5       269  [514, 132, 200, 432, 151, 56, 496, 175, 191, 475]
6       279   [229, 228, 410, 319, 238, 153, 7, 474, 230, 269]
7       301    [58, 143, 66, 732, 196, 431, 443, 451, 546, 99]
8       305    [186, 7, 655, 183, 196, 475, 70, 151, 628, 660]
9       339     [660, 143, 25, 99, 88, 431, 82, 546, 240, 194]
10      347      [403, 77, 95, 4, 200, 188, 423, 28, 692, 227]
11      363   [235, 238, 8, 393, 227, 597, 298, 591, 248, 443]
12      393      [143, 97, 732, 95, 479, 4, 96, 651, 227, 302]
13      399   [588, 97, 393, 11, 188, 203, 218, 143, 566, 196]
14      417  [385, 651, 433, 402, 403, 451, 268, 326, 2

In [20]:
# # The filter of the target users list...

# # aqui voy a hacer el filtro de usuarios de forma que para hacer las recomendaciones solo tengamos en 
# # cuenta aquellos usuarios que han hecho 5 o mas problemas tanto antes de la fecha limite como despues

# # primero guardo la lista de usuarios
# user_list = df.user.unique()

# # la meto en un dataframe 
# column_user_filter = {'user_id': user_list}
# datraframe_user_filter = pd.DataFrame.from_dict(column_user_filter)


# # ahora tengo que calcular para cada fila, el numero de problemas que han hecho en el training_set, evaluation_set
# datraframe_user_filter['len_training'] = datraframe_user_filter.apply (lambda row: lenProblemsDone(row, df_users), axis=1)
# datraframe_user_filter['len_evaluation'] = datraframe_user_filter.apply (lambda row: lenProblemsDone(row, df_users_eval), axis=1)
# print(datraframe_user_filter)

In [21]:
# # ahora tengo que hacer el filtro en este dataframe, de forma que solo aparezcan las filas en las que len_training y 
# # len_evaluation sea >=5
# datraframe_user_filter = datraframe_user_filter[(datraframe_user_filter['len_training'] >= 10) & (datraframe_user_filter['len_evaluation'] >=10)]
# # datraframe_user_filter = datraframe_user_filter[(datraframe_user_filter['len_training'] <= 25) & (datraframe_user_filter['len_evaluation'] <= 25)]
# # datraframe_user_filter = datraframe_user_filter[(datraframe_user_filter['len_training'] >= 1) & (datraframe_user_filter['len_evaluation'] >= 1)]

# # datraframe_user_filter = datraframe_user_filter[(datraframe_user_filter['len_training'] <= 30)]
# # datraframe_user_filter = datraframe_user_filter[(datraframe_user_filter['len_training'] >= 1) & (datraframe_user_filter['len_evaluation'] >= 1)]


# print(datraframe_user_filter)

# # aqui voy a guardar la lista de usuarios a los que voy a recomendar
# user_list_to_recommend = sorted(datraframe_user_filter['user_id'].tolist())
# print(user_list_to_recommend)
# print(len(user_list_to_recommend))

In [22]:
# the users to recommend to are all the distinct users that appear in the evaluation set
user_list_to_recommend = list(evaluation_set.user.unique())

In [23]:
# now I filter df_users so that it only contains the rows whose user
# belongs to the previous list (user_list_to_recommend)

df_users = df_users[df_users['user_id'].isin(user_list_to_recommend)]
print(df_users)

     user_id                                       list_item_id
4         18  [269, 286, 319, 427, 275, 357, 56, 98, 216, 48...
6         43  [258, 286, 294, 328, 289, 301, 300, 50, 181, 2...
19       144  [286, 258, 313, 326, 294, 751, 50, 285, 117, 1...
23       184  [340, 286, 313, 321, 272, 127, 50, 181, 250, 5...
28       222  [258, 268, 328, 300, 326, 333, 100, 181, 7, 50...
32       269  [315, 268, 515, 340, 124, 285, 127, 137, 276, ...
35       279  [321, 100, 151, 248, 24, 257, 25, 154, 168, 17...
41       301  [7, 250, 127, 181, 237, 288, 24, 1, 150, 222, ...
43       305  [286, 690, 302, 269, 258, 289, 315, 181, 50, 2...
50       339  [483, 98, 327, 435, 357, 508, 56, 186, 234, 47...
54       347  [300, 333, 258, 328, 288, 268, 245, 222, 181, ...
55       363  [313, 302, 288, 307, 271, 496, 11, 173, 196, 8...
60       393  [315, 258, 272, 313, 690, 294, 328, 322, 259, ...
61       399  [302, 301, 328, 289, 147, 1, 475, 15, 742, 235...
66       417  [258, 127, 15, 147, 515, 2

In [24]:
# I keep only the users I have to recommend to. The explicit copy makes df_users_eval an
# independent dataframe, so the columns assigned to it in later cells do not raise
# SettingWithCopyWarning
df_users_eval = df_users_eval[df_users_eval['user_id'].isin(user_list_to_recommend)].copy()
print(df_users_eval)

    user_id                                       list_item_id
0        18    [13, 97, 432, 186, 318, 238, 423, 137, 132, 64]
1        43  [204, 405, 402, 403, 393, 367, 313, 302, 269, ...
2       144    [196, 393, 127, 153, 180, 4, 173, 66, 197, 480]
3       184   [588, 451, 66, 210, 71, 132, 498, 275, 283, 357]
4       222  [182, 357, 127, 185, 431, 588, 433, 685, 313, ...
5       269  [514, 132, 200, 432, 151, 56, 496, 175, 191, 475]
6       279   [229, 228, 410, 319, 238, 153, 7, 474, 230, 269]
7       301    [58, 143, 66, 732, 196, 431, 443, 451, 546, 99]
8       305    [186, 7, 655, 183, 196, 475, 70, 151, 628, 660]
9       339     [660, 143, 25, 99, 88, 431, 82, 546, 240, 194]
10      347      [403, 77, 95, 4, 200, 188, 423, 28, 692, 227]
11      363   [235, 238, 8, 393, 227, 597, 298, 591, 248, 443]
12      393      [143, 97, 732, 95, 479, 4, 96, 651, 227, 302]
13      399   [588, 97, 393, 11, 188, 203, 218, 143, 566, 196]
14      417  [385, 651, 433, 402, 403, 451, 268, 326, 2

In [25]:
# first I sort the list of users to recommend to (the list is sorted in place)
user_list_to_recommend.sort()

# item lists of the evaluation set, in the row order of df_users_eval
list_eval_items = df_users_eval['list_item_id'].tolist()

print(len(user_list_to_recommend))
print(len(list_eval_items))

25
25


In [26]:
# num_relevant: number of evaluation items of each user. The length is taken column-wise,
# which also yields an (empty) Series when the dataframe has no rows
df_users_eval["num_relevant"] = df_users_eval["list_item_id"].apply(len)

In [27]:
# dictionary with the user as key and, as value, the items the user has interacted with
# (built from grouped_user, i.e. from the training set)
df_users_simple = dict(zip(grouped_user.index.tolist(), grouped_user.values.tolist()))
#df_users_simple

In [28]:
# films_watched: the training-set items of each evaluation user, looked up in df_users_simple
# (a user_id that is not a key of df_users_simple raises KeyError)
df_users_eval["films_watched"] = df_users_eval.apply (lambda row: df_users_simple[row['user_id']], axis=1)

In [29]:
def nodes_connected(u, v, graph):
    """
        Tell whether node u is a neighbor of node v in graph.

        u, v  -- nodes to check
        graph -- NetworkX graph that holds the links

        Returns True when graph has the edge (v, u) and False otherwise, also
        when one of the nodes is not part of the graph.
    """
    # has_edge is a direct adjacency lookup, whereas testing membership in
    # graph.neighbors(v) walks the neighbors of v one by one. The (v, u) order
    # means "u is a successor of v" on directed graphs, which is the relation
    # graph.neighbors(v) describes; on undirected graphs the order is irrelevant.
    return graph.has_edge(v, u)

In [30]:
def areAccessible(relevant, possible, nx_graph=None):
    """
        Return the relevant items that are linked to at least one of the possible items.

        relevant -- items to check
        possible -- items that can give access to the relevant ones
        nx_graph -- graph in which the links are looked up; when omitted, the
                    module-level variable graph is used

        Returns a list without duplicates; its order is the iteration order of a set.
    """
    target_graph = graph if nx_graph is None else nx_graph

    # any() stops at the first linked item, so each relevant item is checked
    # only until one connection is found
    access = [r for r in relevant
              if any(nodes_connected(r, p, target_graph) for p in possible)]
    return list(set(access))

In [31]:
# rel_accessible: the evaluation items of each user that are linked in the graph
# to at least one of the items in the user's films_watched list
df_users_eval["rel_accessible"] = df_users_eval.apply (lambda row: areAccessible(row['list_item_id'], row['films_watched']), axis=1)

In [32]:
# num_accessible: number of accessible relevant items of each user. The length is taken
# column-wise, which also yields an (empty) Series when the dataframe has no rows
df_users_eval["num_accessible"] = df_users_eval["rel_accessible"].apply(len)

In [33]:
# I keep the users with fewer than 100 accessible items and display the result
# NOTE(review): 100 is a hard-coded limit with no named constant -- confirm it is still needed
df_users_eval = df_users_eval[df_users_eval['num_accessible'] < 100]
df_users_eval

Unnamed: 0,user_id,list_item_id,num_relevant,films_watched,rel_accessible,num_accessible
0,18,"[13, 97, 432, 186, 318, 238, 423, 137, 132, 64]",10,"[269, 286, 319, 427, 275, 357, 56, 98, 216, 48...","[64, 97, 132, 423, 137, 13, 238, 432, 186, 318]",10
1,43,"[204, 405, 402, 403, 393, 367, 313, 302, 269, ...",10,"[258, 286, 294, 328, 289, 301, 300, 50, 181, 2...","[393, 204, 269, 302, 367, 402, 403, 405, 313, ...",10
2,144,"[196, 393, 127, 153, 180, 4, 173, 66, 197, 480]",10,"[286, 258, 313, 326, 294, 751, 50, 285, 117, 1...","[480, 66, 196, 4, 197, 393, 173, 180, 153, 127]",10
3,184,"[588, 451, 66, 210, 71, 132, 498, 275, 283, 357]",10,"[340, 286, 313, 321, 272, 127, 50, 181, 250, 5...","[66, 451, 132, 357, 71, 588, 210, 498, 275, 283]",10
4,222,"[182, 357, 127, 185, 431, 588, 433, 685, 313, ...",10,"[258, 268, 328, 300, 326, 333, 100, 181, 7, 50...","[288, 357, 313, 588, 685, 431, 433, 182, 185, ...",10
5,269,"[514, 132, 200, 432, 151, 56, 496, 175, 191, 475]",10,"[315, 268, 515, 340, 124, 285, 127, 137, 276, ...","[514, 132, 200, 175, 432, 496, 151, 56, 475, 191]",10
6,279,"[229, 228, 410, 319, 238, 153, 7, 474, 230, 269]",10,"[321, 100, 151, 248, 24, 257, 25, 154, 168, 17...","[228, 229, 230, 7, 269, 238, 474, 153, 410, 319]",10
7,301,"[58, 143, 66, 732, 196, 431, 443, 451, 546, 99]",10,"[7, 250, 127, 181, 237, 288, 24, 1, 150, 222, ...","[66, 451, 196, 546, 99, 143, 431, 58, 443, 732]",10
8,305,"[186, 7, 655, 183, 196, 475, 70, 151, 628, 660]",10,"[286, 690, 302, 269, 258, 289, 315, 181, 50, 2...","[196, 70, 7, 655, 628, 660, 183, 151, 186, 475]",10
9,339,"[660, 143, 25, 99, 88, 431, 82, 546, 240, 194]",10,"[483, 98, 327, 435, 357, 508, 56, 186, 234, 47...","[546, 99, 194, 143, 431, 240, 82, 660, 88, 25]",10


In [34]:
# number of users left after the filter on num_accessible
len(df_users_eval)

25

In [35]:
# I drop the helper columns; user_id and rel_accessible are the ones that remain
helper_columns = ['list_item_id', 'films_watched', 'num_relevant', 'num_accessible']
df_users_eval = df_users_eval.drop(columns=helper_columns)
df_users_eval

Unnamed: 0,user_id,rel_accessible
0,18,"[64, 97, 132, 423, 137, 13, 238, 432, 186, 318]"
1,43,"[393, 204, 269, 302, 367, 402, 403, 405, 313, ..."
2,144,"[480, 66, 196, 4, 197, 393, 173, 180, 153, 127]"
3,184,"[66, 451, 132, 357, 71, 588, 210, 498, 275, 283]"
4,222,"[288, 357, 313, 588, 685, 431, 433, 182, 185, ..."
5,269,"[514, 132, 200, 175, 432, 496, 151, 56, 475, 191]"
6,279,"[228, 229, 230, 7, 269, 238, 474, 153, 410, 319]"
7,301,"[66, 451, 196, 546, 99, 143, 431, 58, 443, 732]"
8,305,"[196, 70, 7, 655, 628, 660, 183, 151, 186, 475]"
9,339,"[546, 99, 194, 143, 431, 240, 82, 660, 88, 25]"


In [36]:
# I save df_users_eval to a CSV file, without the dataframe index
df_users_eval.to_csv('items_accessible_mostact.csv', index=False)

In [37]:
# del df_users_eval['user_id']
# del df_users_eval['num_relevant']
# df_users_eval

In [38]:
# Hacer la cuenta de cuantos valores hay de accesibles 

# df = pd.DataFrame(data, columns = ['num_accessible'])

# df_count = df[['num_accesible']].groupby(['num_accessible']).count()
# df_count.head()

In [39]:
# trace = go.Histogram(x=df_count['nodeId'], # cambiar nodeId
#                       xbins=dict(
#                          # start=0,
#                          # end=100,
#                           size=5), 
#                       autobinx=False
#                      )

# fig = go.Figure(data=trace)

# fig.show()