In [1]:
import pandas as pd
import numpy as np
from datetime import *
import networkx as nx
from collections import *
from tqdm.notebook import tqdm
import swifter
import dask
from multiprocessing.dummy import Pool
import multiprocessing
import matplotlib.pyplot as plt
import networkx.drawing
import pickle

In [2]:
def dateparse(time_as_a_unix_timestamp):
    """Turn Unix epoch seconds into ``YYYY-MM-DD`` text.

    The input is interpreted by ``pd.to_datetime`` with ``unit="s"`` and the
    result is formatted with ``strftime``, so the time-of-day part is dropped.
    Whatever ``pd.to_datetime`` returns for the given input (a single
    timestamp or an index of them) is what gets formatted.
    """
    day_format = "%Y-%m-%d"
    as_datetime = pd.to_datetime(time_as_a_unix_timestamp, unit="s")
    return as_datetime.strftime(day_format)

def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` with the highest available protocol.

    The file is opened in ``'wb'`` mode, so any existing file at that path is
    overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def read_object(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [3]:
class display(object):
    """Display HTML representation of multiple objects

    Every positional argument is a string holding the name (or expression) of
    a module-level object, e.g. ``display("a2q", "c2q")``. The string is used
    as the caption and is evaluated to obtain the object that gets rendered.

    NOTE(review): the strings go through ``eval`` -- only pass names you typed
    yourself, never text coming from outside the notebook.
    NOTE(review): inside IPython this class presumably shadows the built-in
    ``display`` helper of the same name -- confirm that this is intended.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _lookup(expression):
        """Evaluate ``expression`` against this module's globals only."""
        # Passing globals() explicitly keeps method-local names (a loop
        # variable, ``self``) from shadowing a module-level variable that
        # happens to carry the same name.
        return eval(expression, globals())

    def _repr_html_(self):
        """Return one floating ``<div>`` per requested object."""
        import html  # local import: only needed for the plain-text fallback

        rendered = []
        for name in self.args:
            target = self._lookup(name)
            to_html = getattr(target, "_repr_html_", None)
            if to_html is None:
                # Objects without a rich HTML repr are shown as escaped text
                # instead of raising AttributeError.
                body = "<pre>" + html.escape(repr(target)) + "</pre>"
            else:
                body = to_html()
            rendered.append(self.template.format(name, body))
        return '\n'.join(rendered)

    def __repr__(self):
        """Return ``name`` followed by ``repr(object)`` for every argument."""
        return '\n\n'.join(name + '\n' + repr(self._lookup(name))
                           for name in self.args)

## 1. Data


```python
# The three raw files are read with identical options: space-separated, no
# header row, columns named user_a / user_b / time, and the "time" column
# parsed through dateparse.
#   a2q -> Answers to questions
#   c2a -> Comments to answers
#   c2q -> Comments to questions
a2q, c2a, c2q = (
    pd.read_csv(
        raw_path,
        sep=" ",
        header=None,
        names=["user_a", "user_b", "time"],
        parse_dates=["time"],
        date_parser=dateparse,
    )
    for raw_path in (
        "data/sx-stackoverflow-a2q.txt",
        "data/sx-stackoverflow-c2a.txt",
        "data/sx-stackoverflow-c2q.txt",
    )
)


```

```python
# Write each parsed frame to its CSV file without the index column.
for _frame, _csv_path in (
    (a2q, "data/a2q.csv"),
    (c2q, "data/c2q.csv"),
    (c2a, "data/c2a.csv"),
):
    _frame.to_csv(_csv_path, index=False)
```

In [4]:
# Read the three CSV files with the same options, parsing "time" as dates.
a2q, c2q, c2a = (
    pd.read_csv(csv_path, parse_dates=["time"], infer_datetime_format=True)
    for csv_path in ("data/a2q.csv", "data/c2q.csv", "data/c2a.csv")
)

display("a2q", "c2q", "c2a")

Unnamed: 0,user_a,user_b,time
0,9,8,2008-08-01
1,1,1,2008-08-01
2,13,1,2008-08-01
3,17,1,2008-08-01
4,48,2,2008-08-01
...,...,...,...
17823520,2773607,1048138,2016-03-06
17823521,6018278,1982354,2016-03-06
17823522,3187183,1404306,2016-03-06
17823523,6022341,1667278,2016-03-06

Unnamed: 0,user_a,user_b,time
0,4550,4550,2008-09-06
1,242,184,2008-09-06
2,4213,4946,2008-09-07
3,91,91,2008-09-07
4,2658,1874,2008-09-07
...,...,...,...
20268146,1177890,4917254,2016-03-06
20268147,3345375,4022289,2016-03-06
20268148,3507137,1801524,2016-03-06
20268149,5617035,4368648,2016-03-06

Unnamed: 0,user_a,user_b,time
0,1,91,2008-09-06
1,3,91,2008-09-06
2,380,350,2008-09-06
3,4642,2257,2008-09-06
4,4642,1324220,2008-09-06
...,...,...,...
25405369,144088,347727,2016-03-06
25405370,5878860,1330341,2016-03-06
25405371,144088,98207,2016-03-06
25405372,4049257,3816212,2016-03-06


In the merged graph the links are weighted, and we decided to build these weights in the following way:
-  we have assigned a score of 1.0 to "Answers to questions"
-  we have assigned a score of 0.7 to "Comments to questions"
-  we have assigned a score of 0.4 to "Comments to answers"

The reasoning is that answers to questions are the most relevant interactions in the merged graph, because they are possible solutions to the user's question. Comments to questions are slightly less important, but they can be relevant for clarifying the question or specifying some detail. Comments to answers are the least relevant, but we did not want to penalize them too much because in some situations they can be useful.

In [5]:
#Answers to questions
a2q["weights"] = 1

#Comments to answers
c2a["weights"] = 0.4

#Comments to questions
c2q["weights"] = 0.7

A nice, simultaneous visualization of the three files!

In [6]:
# Render the three frames in one output, each captioned with its variable name.
display("a2q", "c2a", "c2q")

Unnamed: 0,user_a,user_b,time,weights
0,9,8,2008-08-01,1
1,1,1,2008-08-01,1
2,13,1,2008-08-01,1
3,17,1,2008-08-01,1
4,48,2,2008-08-01,1
...,...,...,...,...
17823520,2773607,1048138,2016-03-06,1
17823521,6018278,1982354,2016-03-06,1
17823522,3187183,1404306,2016-03-06,1
17823523,6022341,1667278,2016-03-06,1

Unnamed: 0,user_a,user_b,time,weights
0,1,91,2008-09-06,0.4
1,3,91,2008-09-06,0.4
2,380,350,2008-09-06,0.4
3,4642,2257,2008-09-06,0.4
4,4642,1324220,2008-09-06,0.4
...,...,...,...,...
25405369,144088,347727,2016-03-06,0.4
25405370,5878860,1330341,2016-03-06,0.4
25405371,144088,98207,2016-03-06,0.4
25405372,4049257,3816212,2016-03-06,0.4

Unnamed: 0,user_a,user_b,time,weights
0,4550,4550,2008-09-06,0.7
1,242,184,2008-09-06,0.7
2,4213,4946,2008-09-07,0.7
3,91,91,2008-09-07,0.7
4,2658,1874,2008-09-07,0.7
...,...,...,...,...
20268146,1177890,4917254,2016-03-06,0.7
20268147,3345375,4022289,2016-03-06,0.7
20268148,3507137,1801524,2016-03-06,0.7
20268149,5617035,4368648,2016-03-06,0.7


In [7]:
# Stack the three edge lists row-wise into a single frame. Each piece keeps
# its own row labels, so labels repeat across the pieces.
totDataframe = pd.concat(objs=(a2q, c2a, c2q), axis=0)
totDataframe

Unnamed: 0,user_a,user_b,time,weights
0,9,8,2008-08-01,1.0
1,1,1,2008-08-01,1.0
2,13,1,2008-08-01,1.0
3,17,1,2008-08-01,1.0
4,48,2,2008-08-01,1.0
...,...,...,...,...
20268146,1177890,4917254,2016-03-06,0.7
20268147,3345375,4022289,2016-03-06,0.7
20268148,3507137,1801524,2016-03-06,0.7
20268149,5617035,4368648,2016-03-06,0.7


In [8]:
# Keep only the rows whose "time" lies between "2015" and "2017"; the same
# filter is applied to the merged frame and to each individual frame.
totDataframe_2y, a2q_2y, c2q_2y, c2a_2y = (
    frame[frame.time.between("2015", "2017")]
    for frame in (totDataframe, a2q, c2q, c2a)
)
display("a2q_2y", "c2a_2y", "c2q_2y", "totDataframe_2y")

Unnamed: 0,user_a,user_b,time,weights
14055617,335858,1045881,2015-01-01,1
14055618,3829874,3829874,2015-01-01,1
14055619,315228,3849055,2015-01-01,1
14055620,2182521,2182521,2015-01-01,1
14055621,3047078,4059893,2015-01-01,1
...,...,...,...,...
17823520,2773607,1048138,2016-03-06,1
17823521,6018278,1982354,2016-03-06,1
17823522,3187183,1404306,2016-03-06,1
17823523,6022341,1667278,2016-03-06,1

Unnamed: 0,user_a,user_b,time,weights
19731113,3516348,3516348,2015-01-01,0.4
19731114,2138993,2138993,2015-01-01,0.4
19731115,3040948,3040948,2015-01-01,0.4
19731116,307339,1115059,2015-01-01,0.4
19731117,67579,67579,2015-01-01,0.4
...,...,...,...,...
25405369,144088,347727,2016-03-06,0.4
25405370,5878860,1330341,2016-03-06,0.4
25405371,144088,98207,2016-03-06,0.4
25405372,4049257,3816212,2016-03-06,0.4

Unnamed: 0,user_a,user_b,time,weights
14184111,34397,4322629,2015-01-01,0.7
14184112,176075,3931833,2015-01-01,0.7
14184113,180329,180329,2015-01-01,0.7
14184114,1577580,8741,2015-01-01,0.7
14184115,2145018,2145018,2015-01-01,0.7
...,...,...,...,...
20268146,1177890,4917254,2016-03-06,0.7
20268147,3345375,4022289,2016-03-06,0.7
20268148,3507137,1801524,2016-03-06,0.7
20268149,5617035,4368648,2016-03-06,0.7

Unnamed: 0,user_a,user_b,time,weights
14055617,335858,1045881,2015-01-01,1.0
14055618,3829874,3829874,2015-01-01,1.0
14055619,315228,3849055,2015-01-01,1.0
14055620,2182521,2182521,2015-01-01,1.0
14055621,3047078,4059893,2015-01-01,1.0
...,...,...,...,...
20268146,1177890,4917254,2016-03-06,0.7
20268147,3345375,4022289,2016-03-06,0.7
20268148,3507137,1801524,2016-03-06,0.7
20268149,5617035,4368648,2016-03-06,0.7


In [9]:
class EDGE:
    """A link from one node to another, carrying a weight and a time.

    Attributes:
        from_n: node the link starts from.
        to_n: node the link points to.
        w: weight attached to the link.
        time: time attached to the link.
    """

    def __init__(self, from_n, to_n, time, w):
        # Note the parameter order: ``time`` comes before ``w``.
        self.from_n, self.to_n = from_n, to_n
        self.w, self.time = w, time

    def __repr__(self):
        """Return a one-line description of the link."""
        return f"{self.from_n} -> {self.to_n} ::::: weight = {self.w} ::::: time: {self.time}"

In [10]:
class GRAPH:
    """Adjacency-list graph whose links are EDGE objects.

    ``self.nodes`` maps a node id to the list of edges touching that node.
    Every edge is stored under both of its endpoints, so an edge whose two
    endpoints are the same node appears twice in that node's list.

    Attributes:
        nodes: ``defaultdict(list)`` from node id to its edges. Because it is
            a defaultdict, merely reading a missing key creates an empty entry.
        num_nodes: ``len(nodes)`` as passed to the constructor.
        num_edges: ``len(edges)`` as passed to the constructor.

    NOTE: ``num_nodes`` and ``num_edges`` are recorded once at construction;
    ``add_edge`` and ``add_edge_object`` do not update them.
    """

    def __init__(self, nodes=None, edges=None):
        """Build the graph from an optional node collection and edge collection.

        Args:
            nodes: sized iterable of node ids to pre-register (default: none).
            edges: sized iterable of EDGE-like objects exposing ``from_n`` and
                ``to_n`` (default: none).
        """
        # ``None`` sentinels instead of mutable ``[]`` defaults; ``is None``
        # (not truthiness) so array-like inputs are accepted as well.
        nodes = [] if nodes is None else nodes
        edges = [] if edges is None else edges

        self.nodes = defaultdict(list, {k: [] for k in nodes})

        # Skip the progress bar entirely when there is nothing to insert.
        if len(edges) > 0:
            for edge in tqdm(edges):
                self.add_edge_object(edge)

        self.num_nodes = len(nodes)
        self.num_edges = len(edges)

    def add_edge(self, from_n, to_n, w, time):
        """Create an edge ``from_n -> to_n`` with weight ``w`` at ``time`` and store it.

        NOTE(review): EDGE's constructor takes ``time`` before ``w``. The
        values used to be forwarded positionally in the wrong order, which
        stored the weight as the time and vice versa; they are now passed by
        keyword. Callers that compensated by passing time before weight need
        to be checked.
        """
        self.add_edge_object(EDGE(from_n, to_n, time=time, w=w))

    def add_edge_object(self, edge):
        """Store an already-built edge under both of its endpoints."""
        self.nodes[edge.from_n].append(edge)
        self.nodes[edge.to_n].append(edge)

    def __repr__(self):
        return f"{self.nodes}"

## Functionality 2 - Find the best users!