In [None]:
from typing import Iterable, Literal
import pandas as pd

import networkx as nx

## Data preprocessing

Considering the [wiki-meta](https://snap.stanford.edu/data/wiki-meta.html) (user-talk) dataset, we identified 10354576 entries available for preprocessing, of which 1275002 (around 12%) are bot-generated traffic. Removing these left us with 9079574 entries, a number of which (to be identified) are self-links that also need to be removed, as they are uninformative about the interactions between users.

## Filter nodes

In [None]:
def filter_nodes(data: pd.DataFrame, nodes: Iterable[int]) -> pd.DataFrame:
    """
    Keep the rows of ``data`` that touch at least one of the given nodes.

    A row is retained when its "source" value OR its "target" value is
    contained in ``nodes``; the opposite endpoint may lie outside the set.

    Args:
        data: Frame with "source" and "target" columns.
        nodes: Node identifiers to keep. Any iterable is accepted,
            including one-shot iterables such as generators.

    Returns:
        The subset of ``data`` whose rows match, with the original index.
    """
    # NOTE(review): if the intent is "both endpoints must be in ``nodes``"
    # the operator below should be ``&`` -- confirm against callers.

    # Materialise once: ``isin`` is evaluated twice, and a one-shot iterable
    # would be exhausted by the first call, leaving the "target" test with
    # nothing to match against.
    wanted = list(nodes)
    touches_source = data["source"].isin(wanted)
    touches_target = data["target"].isin(wanted)
    return data[touches_source | touches_target]

## Graph building

In [None]:
def build_social_graph(data: pd.DataFrame) -> nx.Graph:
    """
    Build an undirected social graph from the DataFrame.

    Every row contributes one edge between its "source" and "target"
    values. The result is a simple ``nx.Graph``, so repeated pairs (in
    either direction) collapse into a single edge.

    Args:
        data: Frame with "source" and "target" columns.

    Returns:
        The graph containing one node per distinct endpoint and one edge
        per distinct unordered pair.
    """
    G = nx.Graph()
    # An empty frame yields an empty graph without touching the columns.
    if len(data) == 0:
        return G
    # Walk the two columns in lockstep instead of ``iterrows()``: building a
    # Series per row is very slow on large frames and may upcast the node
    # ids when the frame mixes dtypes.
    G.add_edges_from(zip(data["source"], data["target"]))
    return G


def build_activity_graph(
    data: pd.DataFrame, interaction: Literal["all", "RT", "MT", "RE"] = "all"
) -> nx.Graph:
    """
    Build an undirected activity graph from the DataFrame.

    With ``interaction="all"`` every row becomes an edge carrying the row's
    "interaction" value as an edge attribute. With any other value only the
    rows whose "interaction" column equals it are used, and the edges carry
    no attribute.

    The result is a simple ``nx.Graph``: repeated pairs collapse into one
    edge, so in "all" mode the attribute reflects the last row seen for
    that pair.

    Args:
        data: Frame with "source", "target" and "interaction" columns.
        interaction: "all" to keep every row, or the interaction type to
            restrict the graph to.

    Returns:
        The graph built from the selected rows.
    """
    G = nx.Graph()
    # An empty frame yields an empty graph without touching the columns.
    if len(data) == 0:
        return G

    # The mode is loop-invariant, so decide once rather than per row, and
    # iterate columns directly instead of the much slower ``iterrows()``.
    if interaction == "all":
        G.add_edges_from(
            (source, target, {"interaction": kind})
            for source, target, kind in zip(
                data["source"], data["target"], data["interaction"]
            )
        )
    else:
        selected = data[data["interaction"] == interaction]
        G.add_edges_from(zip(selected["source"], selected["target"]))
    return G