In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the source CSV into df_en. Its schema is not visible in this cell;
# later cells read the "group_rank" and "tweet" columns from it.
EN_DUPLICATES_CSV = "data/en_duplicates_chunks_averagerep.csv"
df_en = pd.read_csv(EN_DUPLICATES_CSV)

In [None]:
import re
import emoji

# Requires df_en from the loading cell above.
# Step 1: keep only the rows whose group_rank equals 1
df_1 = df_en[df_en["group_rank"] == 1]

# Step 2: reproducible random sample of up to 100 tweets.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so n is capped at the rows available.
# With 100 or more rows the draw is identical to sample(n=100, random_state=42).
sample_df = df_1.sample(n=min(100, len(df_1)), random_state=42)

def clean_and_tokenize(text):
    """Normalize a raw tweet and split it into whitespace-delimited tokens.

    Steps, in order: remove emojis, lower-case, remove URLs, remove @mentions
    and #hashtags, split on whitespace. Punctuation and digits are kept, so
    tokens such as "iran." can occur.

    Parameters
    ----------
    text : object
        Tweet text. Non-string values are converted with ``str``. ``None``
        and float NaN are treated as "no text" instead of being turned into
        the literal tokens "none" / "nan".

    Returns
    -------
    list of str
        Tokens in their original order; empty when the input is missing or
        nothing is left after cleaning.
    """
    # Missing values would otherwise be stringified and show up as graph nodes.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Convert to string if not already
    text = str(text)

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove mentions and hashtags
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r'#\w+', '', text)
    # For str patterns `\w` is Unicode-aware, so the generic hashtag pattern
    # above already covers this tag; the explicit rule is kept as a safeguard.
    text = re.sub(r'#مهسا_امینی\b', '', text)

    # Removal of non-alphabetic characters is intentionally disabled;
    # re-enable and adjust if numbers/punctuation should be dropped:
    #text = re.sub(r"[^a-zA-Zà-ÿء-يآ\s]", '', text)

    # str.split() with no arguments splits on runs of any whitespace and
    # ignores leading/trailing whitespace, so no separate collapse/strip
    # pass is needed.
    return text.split()

# Tokenize every sampled tweet and store the token lists in a new column
sample_df = sample_df.assign(
    tweet_clean_graph=sample_df["tweet"].apply(clean_and_tokenize)
)

In [None]:
from pyvis.network import Network
import networkx as nx
from collections import defaultdict

# One token list per sampled tweet
tweets = sample_df["tweet_clean_graph"].tolist()

# Step 2: count ordered pairs of adjacent tokens (directed bigrams)
co_occur = defaultdict(int)
for tokens in tweets:
    # zip pairs each token with its immediate successor; a tweet with fewer
    # than two tokens contributes nothing.
    for bigram in zip(tokens, tokens[1:]):
        co_occur[bigram] += 1

# Step 3: directed graph G with one edge per distinct pair,
# the pair's count stored in the "weight" edge attribute
G = nx.DiGraph()
G.add_weighted_edges_from(
    (source, target, count) for (source, target), count in co_occur.items()
)

In [None]:
# Tokens treated as "core" vocabulary (all lower-case). The list contains
# spelling variants and the punctuated form "iran.".
core_words = ["22", "twenty", "two", "twenty-two", "twentytwo", "years", "old", "yearsold", "yearold", "year", "yo", "iranian", "girl", "is", "brutally", "killed", "by", "sharia", "police", "in", "iran", "iran."]

# Set view of core_words for constant-time membership tests.
_CORE_WORD_SET = frozenset(core_words)

# Punctuation that can cling to a token: clean_and_tokenize splits on
# whitespace only and does not remove punctuation.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"


def classify_word(word):
    """Return the category label of a token.

    Checks are applied in this order; the first match wins:

    1. "mention"  - token starts with ``@`` followed by a word character
    2. "hashtag"  - token starts with ``#`` followed by a word character
    3. "persian"  - token contains at least one letter from the Persian
       alphabet listed in the pattern below
    4. "core"     - token is in ``core_words``, either as-is or after
       removing leading/trailing punctuation (so "police." and "killed,"
       count, not only the hand-listed "iran.")
    5. "other"    - everything else

    NOTE(review): tokens produced by clean_and_tokenize have ``@\\w+`` and
    ``#\\w+`` removed, so the first two branches are only expected to fire
    for tokens from another source - confirm whether that is intended.

    Parameters
    ----------
    word : str
        A single token. Matching is case-sensitive; ``core_words`` is
        lower-case.

    Returns
    -------
    str
        One of "mention", "hashtag", "persian", "core", "other".
    """
    if re.match(r"@[\w_]+", word):
        return "mention"
    if re.match(r"#\w+", word):
        return "hashtag"
    if re.search(r"[اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", word):  # contains Persian chars
        return "persian"
    # Exact match first, then retry without surrounding punctuation.
    if word in _CORE_WORD_SET or word.strip(_EDGE_PUNCTUATION) in _CORE_WORD_SET:
        return "core"
    return "other"

# Hex color for each category label that classify_word can return
color_map = dict(
    core="#E74C3C",     # red
    mention="#3498DB",  # blue
    hashtag="#F1C40F",  # yellow
    persian="#9B59B6",  # purple
    other="#BDC3C7",    # light gray
)

In [None]:
# Step 4: attach the attributes Gephi will read to every node
for token in G:
    G.nodes[token].update(
        category=classify_word(token),  # category label, usable for coloring
        label=token,                    # explicit label so node names are shown
        degree=G.degree(token),         # degree of the node, usable for sizing
    )

# Step 5: write the annotated graph as a GEXF file for Gephi
nx.write_gexf(G, "figures/cooccurrence_graph_gephi_100.gexf")