In [2]:
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from collections import Counter

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## preprocessing the Enron txt file, in which each line is a (time, sender, receiver) tuple

In [3]:
# Load the Enron executive e-mail log: space-separated (time, from, to) records with no
# header row; `time` is parsed as seconds since the Unix epoch.
emails = pd.read_csv('datasets/execs.email.linesnum.txt', delimiter=' ', header=None, names=['time', 'from', 'to'])
emails['time'] = pd.to_datetime(emails['time'], unit='s')
# Keep only messages in [1998-11-01, 2002-07-01). `.copy()` turns the filtered result into
# an independent frame, so adding columns to it afterwards is unambiguous and does not
# trigger pandas' SettingWithCopyWarning.
emails = emails[(emails['time'] >= '1998-11-01') & (emails['time'] < '2002-07-01')].copy()

In [998]:
# Tag every message with its ISO-8601 week as 'YYYY-WW'.
# Both parts are taken from isocalendar(): combining the *calendar* year with the ISO week
# number mislabels days around New Year (e.g. 2000-01-01 belongs to ISO week 1999-52, but
# calendar-year + ISO-week would tag it '2000-52' and merge it with the last week of
# December 2000). isocalendar() also replaces the deprecated `Series.dt.week` accessor.
iso_calendar = emails['time'].dt.isocalendar()
week_label = iso_calendar['year'].astype(str).str.cat(
    iso_calendar['week'].astype(str).str.zfill(2),  # zero-pad so labels have a fixed width
    sep='-',
)
# assign() returns a new frame, so this works whether or not `emails` is a filtered slice.
emails = emails.assign(week=week_label)

In [1014]:
# Build one weighted, directed graph per week label, in order of first appearance of the label.
unique_weeks = emails['week'].unique()
# Group once up front instead of re-scanning the whole frame with a boolean mask per week.
emails_by_week = emails.groupby('week', sort=False)
enron_glist = []
for week in unique_weeks:
    print(f"----- {week} -----")
    # One (from, to) tuple per row of this week; identical pairs are counted below and the
    # count becomes the edge weight.
    edges = list(map(tuple, emails_by_week.get_group(week)[['from', 'to']].values))
    g = nx.DiGraph((x, y, {'weight': v}) for (x, y), v in Counter(edges).items())
    enron_glist.append(g)
# NOTE(review): `save_object` is not defined in the visible cells — presumably a pickle
# helper from another cell; confirm it is defined before this cell on a fresh kernel.
save_object(enron_glist, 'datasets/enron_weekly.pkl')

----- 1998-46 -----
----- 1998-47 -----
----- 1998-48 -----
----- 1998-49 -----
----- 1998-50 -----
----- 1998-51 -----
----- 1998-52 -----
----- 1998-53 -----
----- 1999-01 -----
----- 1999-02 -----
----- 1999-03 -----
----- 1999-04 -----
----- 1999-05 -----
----- 1999-06 -----
----- 1999-08 -----
----- 1999-09 -----
----- 1999-10 -----
----- 1999-11 -----
----- 1999-12 -----
----- 1999-13 -----
----- 1999-15 -----
----- 1999-18 -----
----- 1999-19 -----
----- 1999-20 -----
----- 1999-21 -----
----- 1999-22 -----
----- 1999-23 -----
----- 1999-24 -----
----- 1999-25 -----
----- 1999-26 -----
----- 1999-27 -----
----- 1999-28 -----
----- 1999-29 -----
----- 1999-30 -----
----- 1999-31 -----
----- 1999-32 -----
----- 1999-33 -----
----- 1999-34 -----
----- 1999-35 -----
----- 1999-36 -----
----- 1999-37 -----
----- 1999-38 -----
----- 1999-39 -----
----- 1999-40 -----
----- 1999-41 -----
----- 1999-42 -----
----- 1999-43 -----
----- 1999-44 -----
----- 1999-45 -----
----- 1999-46 -----


## preprocessing SFHH dataset

In [None]:
# Load the SFHH conference contact list: comma-separated (person1, person2, time) records.
conf = pd.read_csv('datasets/SFHH-conf-sensor.edges', names=['person1', 'person2', 'time'])
# Bucket contacts into windows of 480 time units
# (8 minutes, assuming `time` is expressed in seconds — TODO confirm against the dataset).
conf['8min'] = conf['time'] // 480
unique_times = conf['8min'].unique()
# Group once up front instead of re-scanning the whole frame with a boolean mask per window.
conf_by_window = conf.groupby('8min', sort=False)
conf_glist = []
for time in unique_times:
    print(f"----- {time} -----")
    edges = list(map(tuple, conf_by_window.get_group(time)[['person1', 'person2']].values))
    g = nx.Graph()
    for (x, y), v in Counter(edges).items():
        # The graph is undirected, so (x, y) and (y, x) are the same edge even though
        # Counter keeps them as separate keys. Accumulate the counts; building the graph
        # straight from the Counter would let the later orientation overwrite the earlier
        # one's weight.
        if g.has_edge(x, y):
            g[x][y]['weight'] += v
        else:
            g.add_edge(x, y, weight=v)
    conf_glist.append(g)
# NOTE(review): `save_object` is not defined in the visible cells — presumably a pickle
# helper from another cell; confirm it is defined before this cell on a fresh kernel.
save_object(conf_glist, 'datasets/conf.pkl')

## preprocessing stackoverflow dataset

In [None]:
# Load the StackOverflow interaction log: space-separated (from, to, time) records,
# with `time` parsed as seconds since the Unix epoch.
stackoverflow = pd.read_csv('datasets/sx-stackoverflow-a2q.txt', delimiter=' ', names=['from', 'to', 'time'])
stackoverflow['time'] = pd.to_datetime(stackoverflow['time'], unit='s')
# Keep the window (2015-12-08, 2015-12-13]. `.copy()` makes the filtered result an
# independent frame so the column assignment below does not raise SettingWithCopyWarning.
stackoverflow = stackoverflow[(stackoverflow['time'] > '2015-12-08') & (stackoverflow['time'] <= '2015-12-13')].copy()
# Half-hour bin label 'D-HH-b' with b = 0 for minutes 0-29 and 1 for minutes 30-59.
# The day-of-month alone identifies the day only because the window above lies within
# a single month.
stackoverflow['30min'] = (
    stackoverflow['time'].dt.day.astype(str)
    + '-' + stackoverflow['time'].dt.hour.astype(str).str.zfill(2)
    + '-' + (stackoverflow['time'].dt.minute // 30).astype(str)
)
unique_times = stackoverflow['30min'].unique()

# Group once up front instead of re-scanning the whole frame with a boolean mask per bin.
stackoverflow_by_bin = stackoverflow.groupby('30min', sort=False)
stackoverflow_glist = []
for time in unique_times:
    print(f"----- {time} -----")
    # One (from, to) tuple per row of this bin; identical pairs are counted below and the
    # count becomes the edge weight.
    edges = list(map(tuple, stackoverflow_by_bin.get_group(time)[['from', 'to']].values))
    g = nx.DiGraph((x, y, {'weight': v}) for (x, y), v in Counter(edges).items())
    stackoverflow_glist.append(g)
# NOTE(review): `save_object` is not defined in the visible cells — presumably a pickle
# helper from another cell; confirm it is defined before this cell on a fresh kernel.
save_object(stackoverflow_glist, 'datasets/stackoverflow.pkl')