In [1]:
import pandas as pd
import numpy as np
from datetime import *
import networkx as nx
from collections import *
import functionality_1 as fun_1

In [2]:
def dateparse(time_as_a_unix_timestamp):
    """Render Unix-epoch seconds as a ``YYYY-MM-DD HH:MM`` string.

    The input is interpreted as seconds since the epoch and formatted at
    minute resolution, so the seconds component is dropped.
    """
    minute_format = "%Y-%m-%d %H:%M"
    as_datetime = pd.to_datetime(time_as_a_unix_timestamp, unit="s")
    return as_datetime.strftime(minute_format)

In [3]:
class display(object):
    """Show several objects together, each labelled with its source expression.

    Every argument is a *string* (typically a variable name) that is
    evaluated when the instance is rendered, so pass names, not objects.

    NOTE(review): this class name shadows the ``display`` function that
    IPython normally makes available in notebooks.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _lookup(expr):
        """Evaluate ``expr`` against the module-level namespace only.

        Passing ``globals()`` explicitly keeps this method's own locals out
        of the lookup; previously a variable that happened to share its name
        with the internal loop variable resolved to the name string itself.
        """
        # SECURITY: eval executes arbitrary code; only pass trusted names.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one floated HTML ``<div>`` per expression, newline-joined."""
        return '\n'.join(
            self.template.format(expr, self._lookup(expr)._repr_html_())
            for expr in self.args
        )

    def __repr__(self):
        """Return the plain-text fallback: each expression above its ``repr``."""
        return '\n\n'.join(
            expr + '\n' + repr(self._lookup(expr))
            for expr in self.args
        )

## 1. Data

In [4]:
def _load_interactions(path):
    """Read one space-separated edge list into a DataFrame.

    Each row of the file is ``user_a user_b time`` with ``time`` in Unix
    seconds. The time column is converted in a single vectorised pass and
    truncated to the minute, which yields the same minute-resolution
    timestamps as routing every value through ``dateparse`` without relying
    on the deprecated ``date_parser`` argument of ``pd.read_csv``.

    Args:
        path: location of the text file (anything ``pd.read_csv`` accepts).

    Returns:
        DataFrame with columns ``user_a``, ``user_b`` and a datetime ``time``.
    """
    frame = pd.read_csv(path, sep=" ", header=None, names=["user_a", "user_b", "time"])
    frame["time"] = pd.to_datetime(frame["time"], unit="s").dt.floor("min")
    return frame


#Answers to questions
a2q = _load_interactions("sx-stackoverflow-a2q.txt")

#Comments to answers
c2a = _load_interactions("sx-stackoverflow-c2a.txt")

#Comments to questions
c2q = _load_interactions("sx-stackoverflow-c2q.txt")

In the merged graph the links are weighted. We assign the weights as follows:
-  we have assigned a score of 1.0 to "Answers to questions"
-  we have assigned a score of 0.7 to "Comments to questions"
-  we have assigned a score of 0.4 to "Comments to answers"

We consider answers to questions the most relevant interactions in the merged graph, because they are possible solutions to the user's question. Comments to questions are slightly less important, but they can help to clarify or refine the question. Comments to answers are the least relevant; however, we did not want to penalize them too much, because in some situations they can be useful.

In [5]:
# Score per interaction type, in order:
# answers to questions, comments to answers, comments to questions.
a2q["weights"], c2a["weights"], c2q["weights"] = 1, 0.4, 0.7

A side-by-side view of the three files:

In [6]:
# Render the three frames side by side. The names are passed as strings
# because the helper class evaluates them when it builds its output.
display("a2q", "c2a", "c2q")

Unnamed: 0,user_a,user_b,time,weights
0,9,8,2008-08-01 05:17:00,1
1,1,1,2008-08-01 06:56:00,1
2,13,1,2008-08-01 15:57:00,1
3,17,1,2008-08-01 19:07:00,1
4,48,2,2008-08-01 19:16:00,1
...,...,...,...,...
17823520,2773607,1048138,2016-03-06 12:16:00,1
17823521,6018278,1982354,2016-03-06 12:16:00,1
17823522,3187183,1404306,2016-03-06 12:17:00,1
17823523,6022341,1667278,2016-03-06 12:17:00,1

Unnamed: 0,user_a,user_b,time,weights
0,1,91,2008-09-06 15:07:00,0.4
1,3,91,2008-09-06 15:09:00,0.4
2,380,350,2008-09-06 15:42:00,0.4
3,4642,2257,2008-09-06 20:51:00,0.4
4,4642,1324220,2008-09-06 21:15:00,0.4
...,...,...,...,...
25405369,144088,347727,2016-03-06 14:08:00,0.4
25405370,5878860,1330341,2016-03-06 14:09:00,0.4
25405371,144088,98207,2016-03-06 14:09:00,0.4
25405372,4049257,3816212,2016-03-06 14:09:00,0.4

Unnamed: 0,user_a,user_b,time,weights
0,4550,4550,2008-09-06 19:26:00,0.7
1,242,184,2008-09-06 20:38:00,0.7
2,4213,4946,2008-09-07 06:15:00,0.7
3,91,91,2008-09-07 06:18:00,0.7
4,2658,1874,2008-09-07 07:18:00,0.7
...,...,...,...,...
20268146,1177890,4917254,2016-03-06 14:06:00,0.7
20268147,3345375,4022289,2016-03-06 14:08:00,0.7
20268148,3507137,1801524,2016-03-06 14:09:00,0.7
20268149,5617035,4368648,2016-03-06 14:10:00,0.7


In [8]:
# Stack the three interaction tables into a single edge list. Row labels are
# carried over from each source frame, so they are not unique in the result.
totDataframe = pd.concat(objs=[a2q, c2a, c2q], axis=0)
totDataframe

Unnamed: 0,user_a,user_b,time,weights
0,9,8,2008-08-01 05:17:00,1.0
1,1,1,2008-08-01 06:56:00,1.0
2,13,1,2008-08-01 15:57:00,1.0
3,17,1,2008-08-01 19:07:00,1.0
4,48,2,2008-08-01 19:16:00,1.0
...,...,...,...,...
20268146,1177890,4917254,2016-03-06 14:06:00,0.7
20268147,3345375,4022289,2016-03-06 14:08:00,0.7
20268148,3507137,1801524,2016-03-06 14:09:00,0.7
20268149,5617035,4368648,2016-03-06 14:10:00,0.7


In [9]:
class EDGE:
    """A single link from one node to another, carrying a time and a weight."""

    def __init__(self, from_n, to_n, time, w):
        # Endpoints first, then the payload attached to the link.
        self.from_n, self.to_n = from_n, to_n
        self.time, self.w = time, w

    def __repr__(self):
        """Return ``<from> -> <to> ::::: weight = <w> ::::: time: <time>``."""
        return "{} -> {} ::::: weight = {} ::::: time: {}".format(
            self.from_n, self.to_n, self.w, self.time
        )

In [10]:
class GRAPH:
    """Adjacency-list graph: ``nodes`` maps each node to its incident edges.

    An edge is stored in the list of both of its endpoints, so a self-loop
    appears twice in its node's list.

    Attributes:
        nodes: ``defaultdict(list)`` from node id to the edges touching it.
        num_nodes: number of distinct nodes currently in ``nodes``.
        num_edges: number of edges added so far.
    """

    def __init__(self, nodes=None, edges=None):
        """Build the graph from an iterable of node ids and one of edges.

        Args:
            nodes: node ids to register up front (may be omitted).
            edges: objects exposing ``from_n`` and ``to_n`` (may be omitted).
        """
        # ``None`` defaults avoid sharing one mutable list between instances.
        # ``is None`` rather than truthiness, so array-like inputs also work.
        initial_nodes = () if nodes is None else nodes
        initial_edges = () if edges is None else edges

        self.nodes = defaultdict(list, {k: [] for k in initial_nodes})
        self.num_edges = 0
        for edge in initial_edges:
            self._attach(edge)

        # Counted from the mapping so endpoints that were not listed in
        # ``nodes`` (and duplicates in ``nodes``) are reflected correctly.
        self.num_nodes = len(self.nodes)

    def _attach(self, edge):
        """Register ``edge`` under both of its endpoints and count it."""
        self.nodes[edge.from_n].append(edge)
        self.nodes[edge.to_n].append(edge)
        self.num_edges += 1

    def add_edge(self, from_n, to_n, w, time):
        """Create an edge ``from_n -> to_n`` and add it to the graph.

        ``w`` and ``time`` are passed by keyword because ``EDGE`` takes them
        in the opposite order (``time`` before ``w``); passing them
        positionally stored the weight as the time and vice versa.
        """
        self._attach(EDGE(from_n, to_n, time=time, w=w))
        self.num_nodes = len(self.nodes)

    def __repr__(self):
        return f"{self.nodes}"

## Functionality 1 - Get the overall features of the graph

In [12]:
# Unique ids found at either end of an a2q interaction (the graph's nodes)
nodi_grafo = pd.concat([a2q.user_a, a2q.user_b]).drop_duplicates().array


In [13]:
# One EDGE per a2q row, reading the first four columns positionally as
# (from, to, time, weight). itertuples walks the frame once instead of doing
# four separate .iloc lookups for every row.
edge_grafo = [
    EDGE(*row)
    for row in a2q.iloc[:, :4].itertuples(index=False, name=None)
]

In [14]:
# Assemble the a2q graph from the node array and edge list built above
a2q_graph = GRAPH(nodes=nodi_grafo, edges=edge_grafo)

In [15]:
# Number of users: the node count stored on the a2q graph object
num_node = a2q_graph.num_nodes
num_node

2464606

In [None]:
# Number of answers; GRAPH stores the edge count as `num_edges`
num_edge = a2q_graph.num_edges

In [None]:
# Overall features of a2q from the project helper module. Presumably node
# count, edge count, a directed/undirected label, mean links per user and
# density, judging by how the values are printed in the menu — TODO confirm.
number_nodes, number_edges, directed, mean, density = fun_1.functionality(a2q)

In [75]:
# Text menu: keep prompting until one of the three functionalities is chosen.
end = 0
while end == 0:

    try:
        user = int(input("Welcome user, what functionality do you want: "))
    except ValueError:
        # Non-numeric input would otherwise abort the cell; ask again.
        print("Please enter 1, 2 or 3")
        continue

    if user == 1:

        print("Functionality 1 - Get the overall features of the graph")
        # f-string so the label prints whether or not it is already a str
        print(f"The a2q is: {directed}")
        # The unpacked results are named number_nodes / number_edges
        print("Number of users: " + str(number_nodes))
        print("Number of answers: " + str(number_edges))
        print("Average number of links per user is: " + str(mean))
        print(f"The graph density is: {density}, so the graph is sparse")

        end = 1

    elif user == 2:

        print("result of func_2")
        print("Functionality 2 - Find the best users!")
        end = 1

    elif user == 3:
        print("Functionality 3 - Shortest Ordered Route")
        print("result of func_3")
        end = 1


Welcome user, what functionality do you want: 2
result of func_2
Functionality 2 - Find the best users!


In [None]:
import random as rd
def starting_nodes(df, seed=None):
    """Pick two node ids at random from the endpoints found in ``df``.

    Args:
        df: frame with ``user_a`` and ``user_b`` columns.
        seed: optional seed. When given, a dedicated generator is used so the
            draw is reproducible; with the default ``None`` the shared
            ``random`` module state is used, as before.

    Returns:
        ``(start_p, end_p)``. The two draws are independent, so both values
        may be the same node.
    """
    # Unique ids appearing at either end of an interaction
    nodi_grafo = pd.concat([df["user_a"], df["user_b"]], axis = 0).drop_duplicates().array
    picker = rd if seed is None else rd.Random(seed)
    start_p = picker.choice(nodi_grafo)
    end_p = picker.choice(nodi_grafo)
    return start_p, end_p

# Draw a random start and end node from the merged interaction table
p_start, p_end = starting_nodes(totDataframe)

In [38]:
# Show the start node that was drawn
p_start

812508

In [39]:
# Show the end node that was drawn
p_end

1213383

In [None]:
def functionality_3(start_time, end_time, p, p_1, p_n):
    """Placeholder for Functionality 3 (Shortest Ordered Route).

    The definition previously had no body, which is a syntax error. This
    stub keeps the signature in place and fails loudly when called.

    NOTE(review): parameter meanings are not established anywhere yet.
    Presumably ``start_time``/``end_time`` bound a time window, ``p`` is a
    sequence of nodes to visit in order, and ``p_1``/``p_n`` are the first
    and last nodes — TODO confirm when implementing.

    Raises:
        NotImplementedError: always, until the routine is written.
    """
    raise NotImplementedError("functionality_3 is not implemented yet")

In [None]:
def grafo(df):
    """Build a ``GRAPH`` from an interaction table.

    Args:
        df: frame with ``user_a`` and ``user_b`` columns whose first four
            columns are, positionally, (from node, to node, time, weight).

    Returns:
        A ``GRAPH`` holding every distinct endpoint as a node and one
        ``EDGE`` per row of ``df``.
    """
    # Distinct ids appearing at either end of an interaction
    nodi_grafo = pd.concat([df["user_a"], df["user_b"]], axis = 0).drop_duplicates().array

    # One EDGE per row. itertuples makes a single pass over the frame instead
    # of four .iloc lookups per row, which matters on multi-million-row input.
    edge_grafo = [
        EDGE(*row)
        for row in df.iloc[:, :4].itertuples(index=False, name=None)
    ]

    df_graph = GRAPH(nodi_grafo, edge_grafo)

    return df_graph

In [None]:
import math
def dijkstra(df, starting_point):
    
    #creation of the graph 
    tot_grafo = grafo(df)
    
    #total number of nodes
    n = len(tot_grafo.num_nodes)
    
    #set all the distances equal to infinite
    distances = [math.inf] * n
    
    #set the distance of the starting node equal to zero
    distances[list(a2q_graph.nodes.keys()).index(starting_point)] = 0 
    
    #
    visited = [0] * n
    
    while (0 in visited) == True:
    