In [3]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer

from scipy.sparse import csr_matrix
import igraph as ig
import xnetwork as xn

In [4]:
# Dataset identifier; it is used to build both the input and the output file names.
datasetName = "debate2024_Jun_bluesky"

# Input directory and the sub-directory where generated networks are written.
dataPath = Path("Data")
networksPath = dataPath.joinpath("Networks")
networksPath.mkdir(exist_ok=True, parents=True)

# Users with fewer activities than this are discarded.
minUserActivities = 5

In [5]:
# Read the gzip-compressed Feather file of the selected dataset into a DataFrame.
datasetFile = dataPath / f"{datasetName}.feather.gz"
with gzip.open(datasetFile, mode="rb") as f:
    df = pd.read_feather(f)

In [6]:
# Number of rows (activities) per user.
userActivityCount = df["user_id"].value_counts()
# Users that reach the minimum activity threshold.
hasMinActivities = userActivityCount >= minUserActivities
usersWithMinActivities = set(userActivityCount[hasMinActivities].index)
# Keep only the rows authored by those users.
dfFiltered = df.loc[df["user_id"].isin(usersWithMinActivities)]

In [7]:
def obtainBipartiteEdgesReposts(df):
    """Extract (user_id, linked_post) pairs for every repost row of `df`.

    Returns an empty list when any of the columns "linked_post", "post_type"
    or "user_id" is missing; otherwise a 2-column array with one row per
    repost, in the order the rows appear in `df`.
    """
    requiredColumns = ("linked_post", "post_type", "user_id")
    if any(column not in df for column in requiredColumns):
        return []
    isRepost = df["post_type"] == "repost"
    return df.loc[isRepost, ["user_id", "linked_post"]].values

In [8]:
# User -> reposted-post edges, taken from the activity-filtered DataFrame.
bipartiteEdges = obtainBipartiteEdgesReposts(dfFiltered)

In [9]:
def _filterByDegree(edges, column, minDegree):
    """Keep the edges whose endpoint in `column` appears in >= minDegree distinct edges."""
    if minDegree <= 1 or len(edges) == 0:
        return edges
    # Degrees are counted over distinct (left, right) pairs so that repeated
    # edges do not inflate a node's degree.
    uniqueEdges = set(map(tuple, edges))
    # Count straight from the tuples: converting them back with np.array()
    # would coerce mixed-type labels (e.g. int users, str posts) to strings,
    # and the lookups below would then never match.
    degrees = Counter(edge[column] for edge in uniqueEdges)
    mask = np.fromiter(
        (degrees[edge[column]] >= minDegree for edge in edges),
        dtype=bool,
        count=len(edges),
    )
    return edges[mask]


def filterNodes(bipartiteEdges, minRightDegree=1, minLeftDegree=1):
    """Drop edges attached to low-degree nodes of a bipartite edge list.

    Parameters
    ----------
    bipartiteEdges : sequence of (left, right) pairs
        Edge list; repeated pairs are allowed and are preserved in the output.
    minRightDegree : int
        Minimum number of distinct edges a right node needs to be kept.
        Values <= 1 disable this pass.
    minLeftDegree : int
        Minimum number of distinct edges a left node needs to be kept.
        Values <= 1 disable this pass.

    Returns
    -------
    numpy.ndarray
        The surviving edges in their original order, with shape (k, 2).
        Empty input yields an array of shape (0, 2).

    Notes
    -----
    The right-side pass runs first and the left-side pass runs once on its
    result. The passes are not iterated, so right nodes may end up below
    `minRightDegree` after the left-side pass.
    """
    bipartiteEdges = np.array(bipartiteEdges)
    if bipartiteEdges.size == 0:
        # Normalise to a 2-column shape so callers can still slice [:, 0] / [:, 1].
        return bipartiteEdges.reshape(0, 2)

    # goes from right to left
    bipartiteEdges = _filterByDegree(bipartiteEdges, 1, minRightDegree)
    # goes from left to right
    bipartiteEdges = _filterByDegree(bipartiteEdges, 0, minLeftDegree)
    return bipartiteEdges

In [10]:
# Keep posts with at least 5 distinct reposting users, then users with at least
# 5 distinct reposted posts.
# NOTE: this rebinds `bipartiteEdges` from its own previous value, so re-running
# the cell filters the already-filtered edges again; re-run the cell that builds
# `bipartiteEdges` first.
bipartiteEdges = filterNodes(bipartiteEdges, minRightDegree=5, minLeftDegree=5)

In [11]:
# Map node labels to contiguous integer indices, with a separate index space per side.
bipartiteEdges = np.array(bipartiteEdges)

# np.unique returns the sorted labels and, via return_inverse, the position of
# each original entry within that sorted label list.
leftLabels, leftPositions = np.unique(bipartiteEdges[:, 0], return_inverse=True)
rightLabels, rightPositions = np.unique(bipartiteEdges[:, 1], return_inverse=True)

leftIndex2Label = list(leftLabels)
rightIndex2Label = list(rightLabels)
leftLabel2Index = {label: index for index, label in enumerate(leftIndex2Label)}
rightLabel2Index = {label: index for index, label in enumerate(rightIndex2Label)}

# create indexed edges in a numpy array integers
bipartiteIndexedEdges = np.zeros(bipartiteEdges.shape, dtype=int)
bipartiteIndexedEdges[:, 0] = leftPositions.reshape(-1)
bipartiteIndexedEdges[:, 1] = rightPositions.reshape(-1)

leftCount = len(leftIndex2Label)
rightCount = len(rightIndex2Label)

# Number of edges (repeated edges included) incident to each indexed node.
leftIndexedDegree = np.bincount(bipartiteIndexedEdges[:, 0])
rightIndexedDegree = np.bincount(bipartiteIndexedEdges[:, 1])


In [12]:
# Preview the first five indexed (left, right) edges.
bipartiteIndexedEdges[:5]

array([[4513, 4469],
       [2546, 4469],
       [1818, 1461],
       [4269, 1461],
       [4172, 1868]])

In [13]:
# Sparse left-by-right matrix of edge counts; repeated (row, column) entries are
# summed when the CSR matrix is built.
edgeRows = bipartiteIndexedEdges[:, 0]
edgeColumns = bipartiteIndexedEdges[:, 1]
edgeCounts = np.ones(len(bipartiteIndexedEdges))
weightsMatrix = csr_matrix(
    (edgeCounts, (edgeRows, edgeColumns)),
    shape=(leftCount, rightCount),
)

# Re-weight the counts with TF-IDF (default TfidfTransformer settings).
vectorizer = TfidfTransformer()
weightsMatrix = vectorizer.fit_transform(weightsMatrix)


In [14]:
# Pairwise cosine similarity between the rows of weightsMatrix (one row per left
# node), giving a square leftCount x leftCount matrix.
similarities = cosine_similarity(weightsMatrix)

In [15]:
# Similarities below this value are discarded.
threshold = 0.1
# Remove self-similarity (the diagonal) ...
np.fill_diagonal(similarities, 0)
# ... and zero out, in place, every remaining entry under the threshold.
similarities[np.less(similarities, threshold)] = 0

In [16]:
# Build an undirected graph from the thresholded similarity matrix; the matrix
# values are stored in the "weight" edge attribute.
g = ig.Graph.Weighted_Adjacency(similarities, mode="undirected", attr="weight")

In [17]:
# Basic size statistics of the similarity network.
nodeCount = g.vcount()
edgeCount = g.ecount()
{
    "Number of nodes": nodeCount,
    "Number of edges": edgeCount,
    "Avg. degree": 2.0 * edgeCount / nodeCount,
}

{'Number of nodes': 4808,
 'Number of edges': 173448,
 'Avg. degree': 72.14975041597337}

In [27]:
# Attach the original left-node labels (column 0 of the edge list) to the vertices.
g.vs["Label"] = leftIndex2Label
# Per-vertex number of repost edges that survived filtering (bincount over the
# left column of bipartiteIndexedEdges) -- not the user's original post count.
g.vs["RepostsCount"] = leftIndexedDegree

In [28]:
# Lookup from post id to post text, built from the unfiltered DataFrame.
postID2Text = dict(zip(df["post_id"], df["text"]))
# For every row (left node), the column index holding the largest TF-IDF weight.
topRepostIndices = weightsMatrix.argmax(axis=1)
# Text of that highest-weighted post, one entry per vertex.
g.vs["MostUniqueRepost"] = [
    postID2Text[rightIndex2Label[index]]
    for index in np.asarray(topRepostIndices).ravel()
]

In [29]:
# Maximum number of repost texts shown per user.
sampleRepostsCount = 3
# Seeded generator, recreated on every run of this cell, so the sampled texts
# (and therefore the saved network attribute) are reproducible.
sampleRng = np.random.default_rng(42)


def sampleRepostTexts(repostIDs):
    """Join the text of up to `sampleRepostsCount` randomly chosen reposts with "<br>".

    Sampling is without replacement over the entries of `repostIDs`; a post id
    that is absent from `postID2Text` raises KeyError.
    """
    texts = [postID2Text[postID] for postID in repostIDs]
    chosen = sampleRng.choice(texts, size=min(len(texts), sampleRepostsCount), replace=False)
    return "<br>".join(chosen)


# List of reposted post ids per user, restricted to repost rows.
user2repostIDs = dfFiltered[dfFiltered["post_type"]=="repost"].groupby("user_id")["linked_post"].apply(list)
user2samplePostText = user2repostIDs.apply(sampleRepostTexts)
# Users without an entry get an empty string.
g.vs["SampleReposts"] = [user2samplePostText.get(label, "") for label in leftIndex2Label]

In [30]:
# Keep only the vertices that have at least one edge (drops isolated nodes).
connectedVertices = g.vs.select(_degree_gt=0)
gFiltered = g.subgraph(connectedVertices)

In [31]:
# save to XNET
# Write the graph with isolated nodes removed (gFiltered). The unfiltered `g`
# was being saved before, which left gFiltered computed but unused.
xn.save(gFiltered, networksPath/f"{datasetName}_corepost.xnet")