In [None]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
import igraph as ig
import xnetwork as xn
import re
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
# Notebook configuration: the dataset being analysed and the folders it is read from.
datasetName = "debate2024_Jun_bluesky"
dataPath = Path("Data")  # relative to the notebook's working directory
networksPath = dataPath.joinpath("Networks")  # folder holding the .xnet network files

In [None]:
# Read the gzip-compressed Feather file for the selected dataset into `df`.
compressedFeatherPath = dataPath / f"{datasetName}.feather.gz"
with gzip.open(compressedFeatherPath, "rb") as featherStream:
    df = pd.read_feather(featherStream)

In [None]:
# Load the "corepost" network of the selected dataset from its .xnet file.
corepostNetworkFile = networksPath / f"{datasetName}_corepost.xnet"
g = xn.load(corepostNetworkFile)

In [None]:
# Network size as a (vertex count, edge count) tuple.
g.vcount(), g.ecount()

In [None]:
# Peek at the strength (weighted degree, using the "weight" edge attribute)
# of the first 10 vertices.
g.strength(weights="weight")[:10]

In [None]:
# List the names of the attributes stored on the vertices.
g.vs.attributes()

In [None]:
# Peek at the "MostUniqueRepost" vertex attribute for the first 10 vertices.
g.vs["MostUniqueRepost"][:10]

In [None]:
# Find the user whose vertex has the largest strength (sum of incident edge weights).
gstrenghts = g.strength(weights="weight")
strongestVertexIndex = int(np.argmax(gstrenghts))  # first vertex reaching the maximum
userHighestStrength = g.vs[strongestVertexIndex]["Label"]
userHighestStrength

In [None]:
# Find the user whose vertex has the largest betweenness centrality.
# NOTE(review): betweenness is computed without edge weights here, unlike the
# strength computation which uses "weight" - confirm this is intended.
centrality = g.betweenness()
mostCentralVertexIndex = int(np.argmax(centrality))  # first vertex reaching the maximum
userHighestCentrality = g.vs[mostCentralVertexIndex]["Label"]
userHighestCentrality

In [None]:
# Sample up to 10 posts written by the user with the highest strength.
# The sample size is capped at the number of available posts because
# `sample(n)` raises ValueError when n exceeds the population size.
highestStrengthPosts = df.loc[df["user_id"] == userHighestStrength, "text"]
highestStrengthPosts.sample(min(10, len(highestStrengthPosts))).values

In [None]:
# Inspect a pair of highly similar users, i.e. the endpoints of one of the
# heaviest edges. `edgeRank` picks which edge: 1 = heaviest, 2 = second
# heaviest, ... (3 reproduces the edge this cell has always inspected).
edgeRank = 3
mostSimilarEdgeIndices = np.argsort(g.es["weight"])  # edge indices, ascending by weight
edgeIndex = int(mostSimilarEdgeIndices[-edgeRank])  # plain int for igraph indexing
selectedEdge = g.es[edgeIndex]
user1Index = selectedEdge.source
user2Index = selectedEdge.target

# Translate the endpoint vertex indices into user labels.
user1 = g.vs[user1Index]["Label"]
user2 = g.vs[user2Index]["Label"]
print(user1, user2, selectedEdge["weight"])

In [None]:
# Show up to 5 sampled posts for each user of the selected pair.
# The sample size is capped at the number of available posts because
# `sample(n)` raises ValueError when n exceeds the population size.
for pairUser in (user1, user2):
    pairUserPosts = df.loc[df["user_id"] == pairUser, "text"]
    display(pairUserPosts.sample(min(5, len(pairUserPosts))).values)

In [None]:
# Detect communities with the Leiden algorithm and store each vertex's
# community id in the "community" vertex attribute.
# igraph's defaults (objective_function="CPM", resolution 1, no edge weights)
# only reward groups that are essentially cliques, which fragments the network
# into mostly tiny communities. Optimise weighted modularity instead so the
# edge weights drive the partition.
# Note: Leiden is stochastic, so community ids can differ between runs.
g.vs["community"] = g.community_leiden(
    objective_function="modularity",
    weights="weight",
).membership

In [None]:
# Count vertices per community and keep the 10 largest as (community id, size) pairs.
communitySizes = Counter(g.vs["community"])
topCommunities = communitySizes.most_common(10)

In [None]:
# For each of the top communities, print its member labels and up to 5 sampled posts.
# The vertex attributes are read once, outside the loop, instead of filtering
# the vertex sequence with a Python lambda on every iteration.
vertexLabels = g.vs["Label"]
vertexCommunities = g.vs["community"]
for community, _ in topCommunities:
    print(f"Community {community}")
    users = {
        label
        for label, membership in zip(vertexLabels, vertexCommunities)
        if membership == community
    }
    print(users)
    communityPosts = df.loc[df["user_id"].isin(users), "text"]
    # Cap the sample size: `sample(n)` raises ValueError when the community
    # has fewer than n posts in `df`.
    posts = communityPosts.sample(min(5, len(communityPosts))).values
    print(posts)