In [9]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
import igraph as ig
import xnetwork as xn
import re
import spacy
import nltk
from nltk.corpus import stopwords
tqdm.pandas()

In [3]:
datasetName = "debate2024_Jun_bluesky"
dataPath = Path("Data")
networksPath = dataPath/"Networks"
networksPath.mkdir(parents=True, exist_ok=True)
# Minimum number of activities for a user to be considered
minUserActivities = 5

In [4]:
with gzip.open(dataPath / f"{datasetName}.feather.gz", 'rb') as f:
    df = pd.read_feather(f)

In [5]:
userActivityCount = df["user_id"].value_counts()
usersWithMinActivities = set(userActivityCount[userActivityCount >= minUserActivities].index)
dfFiltered = df[df["user_id"].isin(usersWithMinActivities)]

In [29]:

try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    from spacy.cli import download
    download('en_core_web_md')
    nlp = spacy.load('en_core_web_md')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# Load English Stop Words
stopword_set = set(stopwords.words('english'))

In [30]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F700-\U0001F77F"  # alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                           u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [31]:
def tokenizePost(text, ngram_range=(1, 2)):

    # Cleaning text
    text = re.sub(r'https?://\S+|www\.\S+', " ", text)  # Remove URL
    # also filter urls that do not start with https:// or http://
    # anything that is recognized as a url
    

    text = re.sub(r'@\w+', ' ', text)  # Remove mentions
    text = re.sub(r'\d+', ' ', text)  # Remove digits
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = remove_emoji(text)  # Remove emoji
    text = re.sub(r'#\w+', ' ', text)  # Remove hashtags
    text = text.lstrip('RT')  # Remove RT word

    # Use spaCy to tokenize and lemmatize
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.lemma_.lower() not in stopword_set and not token.is_punct and not token.is_space]
    # if token < 4 characters, remove it
    tokens = [token for token in tokens if len(token) > 3]
    
    # Include n-grams of size defined by ngram_range
    ngrams = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngrams.extend([" ".join(tokens[i:i+n]).lower() for i in range(len(tokens) - n + 1)])
    return ngrams

In [32]:
def obtainBipartiteEdgesWords(df,removeReposts=True,removeReplies=False, ngramSize = 1):
    if "text" not in df or "post_type" not in df or "user_id" not in df:
        return []
    # drop all rows with missing text
    df = df.dropna(subset=["text"])
    if(removeReposts):
        df = df[df["post_type"] != "repost"]
    if(removeReplies):
        df = df[df["post_type"] != "reply"]

    # convert url strings that looks like lists to actual lists
    users = df["user_id"]
    textData = df["text"]

    tokens = df["text"].progress_apply(lambda x: tokenizePost(x,ngram_range=(1,ngramSize)))
    # keep only non-empty lists
    mask = tokens.apply(lambda x: len(x) > 0)
    tokens = tokens[mask]
    users = users[mask]
    # create edges list users -> hashtags
    edges = [(user,token) for user,token_list in zip(users,tokens) for token in token_list]
    return edges
  

In [33]:
bipartiteEdges = obtainBipartiteEdgesWords(dfFiltered)

  0%|          | 0/105747 [00:00<?, ?it/s]

In [34]:
def filterNodes(bipartiteEdges, minRightDegree=1, minLeftDegree=1):
    # goes from right to left
    bipartiteEdges = np.array(bipartiteEdges)
    mask = np.ones(len(bipartiteEdges),dtype=bool)
    if(minRightDegree>1):
        uniqueEdges = set(tuple(edge) for edge in bipartiteEdges)
        uniqueEdges = np.array(list(uniqueEdges))
        rightDegrees = Counter(uniqueEdges[:,1])
        mask &= np.array([rightDegrees[rightNode]>=minRightDegree for _,rightNode in bipartiteEdges])
    bipartiteEdges = bipartiteEdges[mask]
    
    # goes from left to right
    mask = np.ones(len(bipartiteEdges),dtype=bool)
    if(minLeftDegree>1):
        uniqueEdges = set(tuple(edge) for edge in bipartiteEdges)
        uniqueEdges = np.array(list(uniqueEdges))
        leftDegrees = Counter(uniqueEdges[:,0])
        mask &= np.array([leftDegrees[leftNode]>=minLeftDegree for leftNode,_ in bipartiteEdges])
    bipartiteEdges = bipartiteEdges[mask]
    return bipartiteEdges

In [35]:
bipartiteEdges = filterNodes(bipartiteEdges, minRightDegree=4, minLeftDegree=4)

In [36]:
bipartiteEdges = np.array(bipartiteEdges)
bipartiteIndexedEdges = np.zeros(bipartiteEdges.shape, dtype=int)
leftIndex2Label = [label for label in np.unique(bipartiteEdges[:,0])]
leftLabel2Index = {label: index for index, label in enumerate(leftIndex2Label)}
rightIndex2Label = [label for label in np.unique(bipartiteEdges[:,1])]
rightLabel2Index = {label: index for index, label in enumerate(rightIndex2Label)}

# create indexed edges in a numpy array integers
bipartiteIndexedEdges[:,0] = [leftLabel2Index[label] for label in bipartiteEdges[:,0]]
bipartiteIndexedEdges[:,1] = [rightLabel2Index[label] for label in bipartiteEdges[:,1]]

leftCount = len(leftIndex2Label)
rightCount = len(rightIndex2Label)

leftIndexedDegree = np.bincount(bipartiteIndexedEdges[:,0])
rightIndexedDegree = np.bincount(bipartiteIndexedEdges[:,1])


In [37]:
bipartiteIndexedEdges[0:5]

array([[8271, 5528],
       [8271, 1872],
       [8271, 6778],
       [8271,  172],
       [8271, 7185]])

In [38]:
weightsMatrix = csr_matrix((np.ones(len(bipartiteIndexedEdges)), (bipartiteIndexedEdges[:,0], bipartiteIndexedEdges[:,1])), shape=(leftCount, rightCount))
vectorizer = TfidfTransformer()
weightsMatrix = vectorizer.fit_transform(weightsMatrix)


In [39]:
similarities = cosine_similarity(weightsMatrix)

In [40]:
# Apply a threshold to the similarities
threshold = 0.3
similarities[similarities<threshold] = 0
# remove diagonal
np.fill_diagonal(similarities, 0)

In [41]:
g = ig.Graph.Weighted_Adjacency(similarities, mode="undirected", attr="weight")

In [42]:
{
 "Number of nodes":g.vcount(),
 "Number of edges":g.ecount(),
 "Avg. degree": 2.0*g.ecount()/g.vcount()
 }

{'Number of nodes': 8738,
 'Number of edges': 60077,
 'Avg. degree': 13.750743877317465}

In [43]:
g.vs["Label"] = leftIndex2Label
# original number of posts
g.vs["URLCount"] = leftIndexedDegree

In [44]:
# get top hashtags of each user according to tfidf
topWords= []
topWordsCount = 10
for row in weightsMatrix:
    topWords.append(", ".join([rightIndex2Label[index] for index in row.indices[np.argsort(row.data)][:-(topWordsCount+1):-1]]))
g.vs["TopWords"] = topWords


In [45]:
# remove isolated nodes
gFiltered = g.subgraph(g.vs.select(_degree_gt=0))

In [46]:
{
   "Number of nodes":gFiltered.vcount(),
   "Number of edges":gFiltered.ecount(),
   "Avg. degree": 2.0*gFiltered.ecount()/gFiltered.vcount()
}

{'Number of nodes': 2803,
 'Number of edges': 60077,
 'Avg. degree': 42.8662147698894}

In [47]:
# save to XNET
xn.save(gFiltered, networksPath/f"{datasetName}_coword.xnet")