In [103]:
from collections import defaultdict
from os import listdir

import networkx as nx
import numpy as np
import pandas as pd
from networkx.convert_matrix import to_scipy_sparse_matrix
from networkx.readwrite.edgelist import read_edgelist

from spektral.data import Dataset
from spektral.data.graph import Graph

In [3]:
class WICO(Dataset):
    """Spektral ``Dataset`` wrapper around the WICO graph collection (work in progress).

    The dataset directory is expected to contain one sub-directory per graph
    type; each of those holds numerically named sub-graph directories.

    Args:
        path (str, optional): Root directory of the dataset.
            Defaults to "./dataset/WICO/".
        **kwargs: Forwarded unchanged to ``spektral.data.Dataset.__init__``.

    Attributes:
        labels (dict): Maps each graph-type directory name to an integer class
            index (alphabetical order, so the mapping is reproducible).
    """

    def __init__(self, path="./dataset/WICO/", **kwargs):
        # Everything read() needs must exist *before* super().__init__():
        # spektral's Dataset initialiser calls self.read() itself.
        # The location is kept in ``_path`` because the base class exposes
        # ``path`` as a read-only property (overridden below).
        self._path = path
        self.labels = {x: i for i, x in enumerate(sorted(listdir(path)))}
        super().__init__(**kwargs)

    @property
    def path(self):
        """Root directory of the dataset on disk."""
        return self._path

    @path.setter
    def path(self, value):
        self._path = value

    def read(self):
        """Return the list of graphs of the dataset.

        Returns:
            list: Currently always empty -- the per-graph loading step is
            still a stub (see the TODO in the loop body).
        """
        graphs = []

        # sorted() keeps ``y`` consistent with ``self.labels``.
        for y, graph_type in enumerate(sorted(listdir(self.path))):
            subgraphs_list = list(filter(str.isnumeric, listdir(f"{self.path}/{graph_type}/")))
            for graph_id in subgraphs_list:
                # TODO: load the sub-graph ``graph_id`` and append a
                # spektral Graph labelled ``y`` to ``graphs``.
                pass

        return graphs

In [100]:
# Directory of one single sub-graph. The functions defined below take their
# location as an argument and do not read this global; it is re-bound to the
# dataset root by a later cell.
# NOTE(review): presumably kept for manual experimentation -- confirm.
path = "./dataset/WICO/5G_Conspiracy_Graphs/1"

def load_edge_list(filename):
    """Read a whitespace-separated edge list from a text file.

    Every non-blank line must hold exactly two integer node ids
    (``source target``). Blank or whitespace-only lines are skipped.

    Args:
        filename (str): Path of the edge-list file.

    Returns:
        list[tuple[int, int]]: The ``(u, v)`` pairs in file order; empty when
        the file contains no edges.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            integer tokens.
    """
    edges = []

    with open(filename, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on the unpacking below.
                continue
            u, v = fields
            edges.append((int(u), int(v)))

    return edges

def load_graph_from_file(pathname, root_edges=False, time_delay_edges=False, hours=1):
    """Build a directed graph from one sub-graph directory.

    The edges come from ``{pathname}/edges.txt`` (read with
    ``load_edge_list``). ``{pathname}/nodes.csv`` must provide a ``time``
    column and an ``id`` column; the root node is the first-column value of
    the row with the smallest ``time``.

    Args:
        pathname (str): Directory containing ``edges.txt`` and ``nodes.csv``.
        root_edges (bool, optional): Add an edge from the root to every other
            node already in the graph. Defaults to False.
        time_delay_edges (bool, optional): Add an edge from the root to every
            node of ``nodes.csv`` whose ``time / 3600`` is below ``hours``.
            Defaults to False.
        hours (float, optional): Threshold used by ``time_delay_edges``.
            Defaults to 1.

    Returns:
        networkx.DiGraph or None: The graph, or ``None`` when the edge list
        is empty.
    """
    edge_list = load_edge_list(f"{pathname}/edges.txt")

    if len(edge_list) == 0:
        return None

    G = nx.from_edgelist(edge_list, create_using=nx.DiGraph)

    df = pd.read_csv(f"{pathname}/nodes.csv")

    # Both options need the root, so resolve it once up front instead of only
    # inside the ``root_edges`` branch.
    root_id = df.sort_values("time").iloc[0, 0]

    if root_edges:
        # Connect the root to every other node currently in the graph.
        G.add_edges_from((root_id, x) for x in list(G) if x != root_id)

    if time_delay_edges:
        # Connect the root to every node seen less than ``hours`` hours in.
        # NOTE(review): assumes "time" holds seconds elapsed since the root
        # tweet -- confirm against the dataset description.
        elapsed_hours = df["time"] / 3600
        early_ids = df.loc[elapsed_hours < hours, "id"].tolist()
        # The root itself is skipped to avoid a self-loop, mirroring the
        # ``root_edges`` branch.
        G.add_edges_from((root_id, x) for x in early_ids if x != root_id)

    return G

def load_users_info(path, max_graphs=50):
    """Collect per-user statistics across the sub-graphs of every graph type.

    For each graph-type directory under ``path`` the numerically named
    sub-graph directories are visited in increasing numeric order and their
    ``nodes.csv`` is read. Each CSV must have exactly four columns, taken
    positionally as (user id, <ignored>, friends, followers).

    Args:
        path (str): Root directory of the dataset.
        max_graphs (int or None, optional): Maximum number of sub-graphs read
            per graph type; ``None`` reads all of them. Defaults to 50.

    Returns:
        tuple: ``(user_info, user_tweet_count)`` where ``user_info`` is a dict
        mapping a user id to the ``(friends, followers)`` pair of the last
        row read for that user, and ``user_tweet_count`` is a
        ``defaultdict(int)`` mapping a user id to the number of rows in which
        it appeared (0 for unseen users).
    """
    user_tweet_count = defaultdict(int)
    # A plain dict: unknown users must raise KeyError, not be created silently.
    user_info = {}

    for graph_type in listdir(path):
        pathname = f"{path}/{graph_type}/"
        subgraphs_list = list(map(int, filter(str.isnumeric, listdir(pathname))))

        for graph_id in sorted(subgraphs_list)[:max_graphs]:
            df = pd.read_csv(f"{pathname}/{graph_id}/nodes.csv")

            for _, user_id, _, friends, followers in df.itertuples():
                user_tweet_count[user_id] += 1
                user_info[user_id] = (friends, followers)

    return user_info, user_tweet_count

In [107]:
path = "./dataset/WICO"
hours = 1

# Per-user (friends, followers) pairs and per-user row counts, gathered over
# the same dataset root that is walked below.
user_info, user_tweet_count = load_users_info(path)

graph_spektral_list = []
graph_nx_list = []

# listdir() returns entries in arbitrary order; sorting makes the integer
# label ``y`` of each graph type reproducible between runs and machines.
for y, graph_type in enumerate(sorted(listdir(path))):
    pathname = f"{path}/{graph_type}/"
    subgraphs_list = list(map(int, filter(str.isnumeric, listdir(pathname))))

    # The limit of 50 must not exceed the one used by load_users_info,
    # otherwise the user_info lookup below raises KeyError.
    for graph_id in sorted(subgraphs_list)[:50]:
        graph_dir = f"{pathname}/{graph_id}"
        G = load_graph_from_file(graph_dir, root_edges=True,
                                 time_delay_edges=True, hours=hours)

        if G is None:
            # Sub-graph with an empty edge list: nothing to build.
            continue

        df = pd.read_csv(f"{graph_dir}/nodes.csv")

        # Row k of the feature matrix describes the k-th row of nodes.csv, so
        # the adjacency matrix has to use that very same node order. Nodes
        # listed in nodes.csv but absent from the edge list are added as
        # isolated nodes so both matrices have the same size; nodes that only
        # occur in the edge list are left out of A.
        node_ids = df.iloc[:, 0].tolist()
        G.add_nodes_from(node_ids)
        A = to_scipy_sparse_matrix(G, nodelist=node_ids)

        # features are (friends, followers, number of tweets, tweet_timestamp)
        x = np.zeros((len(df), 4), dtype=int)

        for i, user_id, tweet_time, _, _ in df.itertuples():
            friends, followers = user_info[user_id]
            n_tweets = user_tweet_count[user_id]

            x[i] = [friends, followers, n_tweets, tweet_time]

        graph_spektral_list.append(Graph(a=A, x=x, y=y))
        graph_nx_list.append(G)