In [None]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
import igraph as ig
import xnetwork as xn

In [None]:
# Notebook configuration: dataset identifier, input/output locations and the
# user-activity cutoff used when filtering the posts table.
datasetName = "debate2024_Jun_mastodon"
dataPath = Path("Data")
networksPath = dataPath/"Networks"
# Create the output directory (and any missing parents); a no-op if it exists.
networksPath.mkdir(parents=True, exist_ok=True)
# Minimum number of activities (rows in the posts table) for a user to be considered
minUserActivities = 5

In [None]:
# Load the posts table: the Feather file is stored gzip-compressed, so it is
# decompressed on the fly and handed to pandas as a binary file object.
with gzip.open(dataPath / f"{datasetName}.feather.gz", 'rb') as f:
    df = pd.read_feather(f)

In [None]:
# Number of rows (activities) recorded for each user.
userActivityCount = df["user_id"].value_counts()
# Users that reach the configured activity cutoff.
activeUserMask = userActivityCount.to_numpy() >= minUserActivities
usersWithMinActivities = set(userActivityCount.index[activeUserMask])
# Keep only the rows authored by those users.
dfFiltered = df.loc[df["user_id"].isin(usersWithMinActivities)]

In [None]:
def obtainBipartiteEdgesHashtags(df,removeRetweets=True,removeReplies=False):
    """Build the user -> hashtag edge list of a posts table.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts table. It must provide the columns "user_id", "post_type" and
        "hashtags"; each "hashtags" cell is iterated as a collection of
        hashtag strings. Missing cells (None / NaN) are treated as "no
        hashtags" instead of raising.
    removeRetweets : bool, default True
        Drop rows whose "post_type" equals "repost".
    removeReplies : bool, default False
        Drop rows whose "post_type" equals "reply".

    Returns
    -------
    list of tuple
        One ``(user_id, hashtag)`` pair per hashtag occurrence, in row order.
        Hashtags are lower-cased and stripped of surrounding whitespace;
        entries that are not strings or that are blank after stripping are
        skipped. Repeated pairs are kept (one per occurrence). An empty list
        is returned when any required column is absent.
    """
    requiredColumns = ("hashtags", "post_type", "user_id")
    if any(column not in df for column in requiredColumns):
        return []

    if removeRetweets:
        df = df[df["post_type"] != "repost"]
    if removeReplies:
        df = df[df["post_type"] != "reply"]

    def normalizeHashtags(hashtagList):
        # A missing cell shows up as None or a float NaN; neither is iterable
        # in a meaningful way, so treat it as an empty hashtag list.
        if hashtagList is None or isinstance(hashtagList, float):
            return []
        normalized = (
            hashtag.lower().strip()
            for hashtag in hashtagList
            if isinstance(hashtag, str)
        )
        # Drop blanks so no empty-string hashtag node is created.
        return [hashtag for hashtag in normalized if hashtag]

    # create edges list users -> hashtags (rows without hashtags add nothing)
    edges = [
        (user, hashtag)
        for user, hashtagList in zip(df["user_id"], df["hashtags"])
        for hashtag in normalizeHashtags(hashtagList)
    ]
    return edges
  

In [None]:
# User -> hashtag edges of the activity-filtered posts (defaults: reposts
# excluded, replies kept).
bipartiteEdges = obtainBipartiteEdgesHashtags(dfFiltered)

In [None]:
def _edgeDegreeMask(edges, column, minDegree):
    """Boolean mask of the rows of `edges` whose node in `column` has degree >= minDegree."""
    # Degree is the number of DISTINCT neighbours, so repeated edges count once.
    uniqueEdges = {tuple(edge) for edge in edges}
    degrees = Counter(edge[column] for edge in uniqueEdges)
    return np.array([degrees[edge[column]] >= minDegree for edge in edges], dtype=bool)


def filterNodes(bipartiteEdges, minRightDegree=1, minLeftDegree=1):
    """Drop edges that touch low-degree nodes of a bipartite edge list.

    The right side (column 1) is filtered first, then the left side
    (column 0) is filtered on what remains. This is a single pass: removing
    left nodes can push a right node back below `minRightDegree`, and that is
    not re-checked.

    Parameters
    ----------
    bipartiteEdges : sequence of (left, right) pairs
        Converted with ``np.array``; when the two sides have different types
        numpy coerces them to one common dtype (e.g. integers become strings).
    minRightDegree : int, default 1
        Minimum number of distinct left neighbours a right node needs.
        Values <= 1 disable the right-side filter.
    minLeftDegree : int, default 1
        Minimum number of distinct right neighbours a left node needs.
        Values <= 1 disable the left-side filter.

    Returns
    -------
    numpy.ndarray
        The surviving edges, duplicates and original order preserved. An
        empty input or a fully filtered result has shape ``(0, 2)``.
    """
    bipartiteEdges = np.array(bipartiteEdges)
    if bipartiteEdges.size == 0:
        # Nothing to filter; keep a 2-column shape so callers can still
        # index the left/right columns.
        return bipartiteEdges.reshape(0, 2)

    # goes from right to left, then from left to right
    for column, minDegree in ((1, minRightDegree), (0, minLeftDegree)):
        # Skip the stage when it is disabled or a previous stage already
        # removed every edge (indexing columns of an empty set would fail).
        if minDegree > 1 and len(bipartiteEdges) > 0:
            bipartiteEdges = bipartiteEdges[_edgeDegreeMask(bipartiteEdges, column, minDegree)]
    return bipartiteEdges

In [None]:
# Keep hashtags (right side) with at least 4 distinct users, then users
# (left side) with at least 4 distinct remaining hashtags.
bipartiteEdges = filterNodes(bipartiteEdges, minRightDegree=4, minLeftDegree=4)

In [None]:
# invert left and right for cooccurrence graph: after this cell each pair is
# (hashtag, user), so the "left" nodes projected below are the hashtags.
# NOTE: this cell is not idempotent -- running it twice swaps the pairs back.
bipartiteEdges = [(right,left) for left,right in bipartiteEdges]

In [None]:
# Replace node labels by contiguous integer ids, separately for each side.
bipartiteEdges = np.array(bipartiteEdges)

# np.unique yields the sorted distinct labels together with, for every edge
# endpoint, the position of its label in that sorted array.
leftLabels, leftPositions = np.unique(bipartiteEdges[:,0], return_inverse=True)
rightLabels, rightPositions = np.unique(bipartiteEdges[:,1], return_inverse=True)

# index -> label lists and the matching label -> index dictionaries
leftIndex2Label = list(leftLabels)
rightIndex2Label = list(rightLabels)
leftLabel2Index = {label: index for index, label in enumerate(leftIndex2Label)}
rightLabel2Index = {label: index for index, label in enumerate(rightIndex2Label)}

# edges expressed with the integer ids, one row per original edge
bipartiteIndexedEdges = np.zeros(bipartiteEdges.shape, dtype=int)
bipartiteIndexedEdges[:,0] = leftPositions
bipartiteIndexedEdges[:,1] = rightPositions

leftCount = len(leftIndex2Label)
rightCount = len(rightIndex2Label)

# number of edge rows (repeated pairs included) touching each node
leftIndexedDegree = np.bincount(bipartiteIndexedEdges[:,0])
rightIndexedDegree = np.bincount(bipartiteIndexedEdges[:,1])



In [None]:
# Sanity check: display the first five indexed (left, right) edges.
bipartiteIndexedEdges[0:5]

In [None]:
# Sparse left x right matrix with one unit of weight per edge row. Repeated
# (left, right) pairs are summed by the sparse constructor, so an entry holds
# how many times that pair occurs in the edge list.
weightsMatrix = csr_matrix((np.ones(len(bipartiteIndexedEdges)), (bipartiteIndexedEdges[:,0], bipartiteIndexedEdges[:,1])), shape=(leftCount, rightCount))
# NOTE(review): the transformer is created but TF-IDF weighting is currently
# disabled (the line below is commented out), so raw counts are used as-is.
vectorizer = TfidfTransformer()
# weightsMatrix = vectorizer.fit_transform(weightsMatrix)


In [None]:
# Pairwise cosine similarity between the rows of weightsMatrix (one row per
# left node), giving a leftCount x leftCount matrix.
# NOTE(review): with default arguments the result is presumably a dense
# array, so memory grows quadratically with leftCount -- confirm it fits.
similarities = cosine_similarity(weightsMatrix)

In [None]:
# Minimum cosine similarity for a pair of nodes to be kept
threshold = 0.4
# Zero the diagonal so a node is never compared with itself
np.fill_diagonal(similarities, 0)
# Discard (set to zero) every similarity below the threshold, in place
tooWeak = np.less(similarities, threshold)
similarities[tooWeak] = 0

In [None]:
# Build an undirected graph from the thresholded similarity matrix; the
# matrix values are stored in the edge attribute "weight".
g = ig.Graph.Weighted_Adjacency(similarities, mode="undirected", attr="weight")

In [None]:
# Summary of the full graph (isolated nodes included). The average degree is
# 2 * edges / nodes; this divides by zero if the graph has no vertices.
{
 "Number of nodes":g.vcount(),
 "Number of edges":g.ecount(),
 "Avg. degree": 2.0*g.ecount()/g.vcount()
 }

In [None]:
# Vertex i of the graph corresponds to left index i, so labels and degrees
# can be attached positionally.
g.vs["Label"] = leftIndex2Label
# original number of posts
# NOTE(review): leftIndexedDegree is a bincount over ALL edge rows, so a
# (left, right) pair that occurs several times is counted several times. The
# attribute name "UserCount" suggests distinct users instead -- confirm which
# meaning is intended.
g.vs["UserCount"] = leftIndexedDegree

In [None]:
# remove isolated nodes: keep only the vertices with degree > 0, i.e. those
# with at least one similarity at or above the threshold
gFiltered = g.subgraph(g.vs.select(_degree_gt=0))

In [None]:
# Same summary for the graph without isolated nodes. The average degree
# divides by zero if no vertex survived the filtering.
{
   "Number of nodes":gFiltered.vcount(),
   "Number of edges":gFiltered.ecount(),
   "Avg. degree": 2.0*gFiltered.ecount()/gFiltered.vcount()
}

In [None]:
# save to XNET: write the graph without isolated nodes into the networks
# directory, named after the dataset
xn.save(gFiltered, networksPath/f"{datasetName}_hashtag_cooccurence.xnet")

In [None]:
# save largest connected component: extract the giant component of the
# filtered graph and write it next to the full network
gLCC = gFiltered.components().giant()
xn.save(gLCC, networksPath/f"{datasetName}_hashtag_cooccurence_giant.xnet")