# Dataset Network Analysis

This notebook analyzes the network of keyword co-occurrences in the collected Bluesky posts dataset. 
It loads the data, extracts keywords, builds a co-occurrence graph, calculates network metrics, and visualizes the structure.

In [None]:
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from itertools import combinations
from networkx.algorithms import community
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

# Display settings
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size; only affects figures created without an explicit figsize.
plt.rcParams['figure.figsize'] = (16, 12)

In [None]:
# --- CONFIGURATION ---
# Folder that holds the input CSV files; dataset filenames are joined onto it.
INPUT_DIR = "./data"
# Root folder for all results; each analyzed dataset gets its own subfolder here.
OUTPUT_BASE_DIR = "./analysis_results"
# A keyword pair becomes a graph edge only when the number of posts containing
# both keywords is at least this value.
MIN_CO_OCCURRENCES = 1  # Minimum co-occurrences to create an edge

# Ensure output directory exists
# (exist_ok=True makes re-running this cell safe when the folder is already there.)
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

In [None]:
def load_dataset(filename):
    """Read one CSV file from ``INPUT_DIR`` into a DataFrame.

    Args:
        filename: Name of the CSV file, relative to ``INPUT_DIR``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` (after printing a
        message) when the file does not exist.
    """
    filepath = os.path.join(INPUT_DIR, filename)

    if os.path.exists(filepath):
        print(f"Loading {filepath}...")
        df = pd.read_csv(filepath)
        print(f"Loaded {len(df)} posts.")
        return df

    # Missing file: report it and let the caller decide how to proceed.
    print(f"File not found: {filepath}")
    return None

In [None]:
def analyze_network(df, config_name):
    """Build, measure, save, and plot the keyword co-occurrence network of one dataset.

    Two keywords co-occur when both appear (case-insensitive, literal
    substring match) in the ``text`` of the same row. Every pair whose
    shared-row count reaches the threshold becomes an edge carrying two
    attributes: ``weight`` (the count) and ``distance`` (``1 / count``).

    Args:
        df: DataFrame with a ``keyword`` column (the vocabulary; NaN values
            are dropped and duplicates removed) and a ``text`` column that is
            searched for those keywords.
        config_name: Label used in console output, as the plot title, and as
            the name of the results subfolder under ``OUTPUT_BASE_DIR``.

    Returns:
        ``(G, metrics_df)`` - the ``networkx.Graph`` and a per-keyword metrics
        DataFrame sorted by descending degree - or ``None`` when a required
        column is missing or no keyword pair reaches the threshold.

    Side effects:
        Writes ``node_metrics.csv`` into the results subfolder and calls
        ``visualize_network`` with that folder as its output directory.
    """
    print(f"\n{'='*80}")
    print(f"ANALYZING: {config_name}")
    print(f"{'='*80}")

    # 1. Extract Keywords
    # Validate both required columns up front so a malformed file is reported
    # instead of raising KeyError half-way through the analysis.
    for required in ('keyword', 'text'):
        if required not in df.columns:
            print(f"Error: '{required}' column missing.")
            return None

    keywords = df['keyword'].dropna().unique().tolist()
    print(f"Unique Keywords found: {len(keywords)}")

    # 2. Compute Co-occurrences
    print("Computing co-occurrences...")

    # Index once: keyword -> set of row labels whose text contains it.
    # Pair counts then reduce to set intersections instead of re-scanning text.
    texts = df['text']
    keyword_indices = {}
    for kw in tqdm(keywords, desc="Indexing keywords"):
        # Case-insensitive literal match (regex=False: keywords are not patterns)
        mask = texts.str.contains(kw, case=False, na=False, regex=False)
        keyword_indices[kw] = set(df.index[mask])

    # A pair that never co-occurs is not a relationship, and a zero count has
    # no finite distance, so the effective threshold is never below 1.
    min_count = max(MIN_CO_OCCURRENCES, 1)
    n_pairs = len(keywords) * (len(keywords) - 1) // 2

    weighted_edges = []
    for kw1, kw2 in tqdm(combinations(keywords, 2), total=n_pairs, desc="Calculating Pairs"):
        count = len(keyword_indices[kw1] & keyword_indices[kw2])
        if count >= min_count:
            weighted_edges.append((kw1, kw2, count))

    print(f"Found {len(weighted_edges)} pairs with co-occurrences >= {min_count}")

    if not weighted_edges:
        print("No co-occurrences found. Skipping graph build.")
        return None

    # 3. Build Graph
    # Every keyword is a node, even if it ends up isolated.
    G = nx.Graph()
    G.add_nodes_from(keywords)
    for kw1, kw2, count in weighted_edges:
        # 'weight' is tie strength; 'distance' is its inverse, for the
        # shortest-path based metrics below.
        G.add_edge(kw1, kw2, weight=count, distance=1.0 / count)

    print(f"Graph constructed: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")

    # 4. Calculate Metrics
    degrees = dict(G.degree())
    strength = dict(G.degree(weight='weight'))

    # Betweenness and closeness are shortest-path metrics: NetworkX reads the
    # given edge attribute as a path length. Passing raw co-occurrence counts
    # would make strongly linked keywords look far apart, so use 'distance'.
    betweenness = nx.betweenness_centrality(G, weight='distance')

    # closeness_centrality copes with disconnected graphs itself (it scales
    # each node by the fraction of nodes it can reach), which keeps scores
    # comparable across components; no per-component loop is needed.
    closeness = nx.closeness_centrality(G, distance='distance')

    clustering = nx.clustering(G)

    # Community Detection (the graph is guaranteed to have edges at this point)
    communities = list(community.greedy_modularity_communities(G, weight='weight'))
    modularity = community.modularity(G, communities, weight='weight')
    print(f"Detected {len(communities)} communities (modularity = {modularity:.3f})")

    # Save Metrics
    metrics_df = pd.DataFrame({
        'keyword': keywords,
        'degree': [degrees.get(k, 0) for k in keywords],
        'strength': [strength.get(k, 0) for k in keywords],
        'betweenness': [betweenness.get(k, 0) for k in keywords],
        'closeness': [closeness.get(k, 0) for k in keywords],
        'clustering': [clustering.get(k, 0) for k in keywords],
    }).sort_values('degree', ascending=False)

    # Create the results folder only now, so skipped datasets leave no empty
    # directories behind.
    out_dir = os.path.join(OUTPUT_BASE_DIR, config_name)
    os.makedirs(out_dir, exist_ok=True)

    metrics_df.to_csv(os.path.join(out_dir, "node_metrics.csv"), index=False)
    print("Metrics saved to node_metrics.csv")

    # 5. Visualizations
    # Edge weights are listed in G.edges() order, the order used when drawing.
    edge_weights = [data['weight'] for _, _, data in G.edges(data=True)]
    visualize_network(G, degrees, edge_weights=edge_weights,
                      communities=communities, out_dir=out_dir, title=config_name)

    return G, metrics_df

In [None]:
def visualize_network(G, degrees, edge_weights, communities, out_dir, title):
    """Render and save the three standard plots for a keyword graph.

    Produces, in order: the node-link diagram (``network_graph.png``), the
    degree histogram (``degree_dist.png``), and the adjacency heatmap
    (``heatmap.png``). Each figure is saved into ``out_dir`` and then shown.

    Args:
        G: The keyword graph to draw.
        degrees: Mapping of node -> degree; drives node size and the histogram.
        edge_weights: Edge weights in ``G.edges()`` order; drives edge width.
        communities: Iterable of node collections used for node colouring;
            when empty, all nodes share one colour.
        out_dir: Existing directory that receives the PNG files.
        title: Label appended to the network plot title.
    """
    _plot_keyword_graph(G, degrees, edge_weights, communities, out_dir, title)
    _plot_degree_histogram(degrees, out_dir)
    _plot_adjacency_heatmap(G, out_dir)


def _plot_keyword_graph(G, degrees, edge_weights, communities, out_dir, title):
    """Draw the node-link diagram and save it as network_graph.png."""
    # Node area grows linearly with degree; the offset keeps isolated nodes visible.
    sizes = [100 + 100 * degrees[node] for node in G.nodes()]

    # Fixed seed so repeated runs give the same layout.
    layout = nx.spring_layout(G, k=0.5, seed=42)

    plt.figure(figsize=(16, 12))

    if communities:
        # One tab20 colour per community, cycling after 20 communities.
        palette = plt.get_cmap('tab20')
        node_to_color = {
            member: palette(idx % 20)
            for idx, group in enumerate(communities)
            for member in group
        }
        fill = [node_to_color.get(node, 'lightgray') for node in G.nodes()]
    else:
        fill = 'skyblue'

    nx.draw_networkx_nodes(G, layout, node_size=sizes, node_color=fill,
                           alpha=0.9, edgecolors='white')

    if edge_weights:
        # Scale widths into the range (1, 6] relative to the heaviest edge.
        heaviest = max(edge_weights)
        line_widths = [1 + (weight / heaviest) * 5 for weight in edge_weights]
        nx.draw_networkx_edges(G, layout, width=line_widths, alpha=0.4, edge_color='gray')

    nx.draw_networkx_labels(G, layout, font_size=10, font_weight='bold')

    plt.title(f"Keyword Network: {title}", fontsize=16)
    plt.axis('off')
    plt.savefig(os.path.join(out_dir, "network_graph.png"), bbox_inches='tight')
    plt.show()


def _plot_degree_histogram(degrees, out_dir):
    """Plot the distribution of node degrees and save it as degree_dist.png."""
    plt.figure(figsize=(10, 6))
    degree_counts = list(degrees.values())
    plt.hist(degree_counts, bins=20, color='steelblue', edgecolor='black')
    plt.title("Degree Distribution")
    plt.xlabel("Degree")
    plt.ylabel("Frequency")
    plt.grid(axis='y', alpha=0.3)
    plt.savefig(os.path.join(out_dir, "degree_dist.png"))
    plt.show()


def _plot_adjacency_heatmap(G, out_dir):
    """Plot the weighted adjacency matrix and save it as heatmap.png."""
    plt.figure(figsize=(12, 10))
    adjacency = nx.to_pandas_adjacency(G, weight='weight')
    plt.imshow(adjacency, cmap='Blues', interpolation='nearest')
    plt.colorbar(label='Co-occurrence Count')
    # The matrix is square, so one set of tick positions serves both axes.
    tick_positions = range(len(adjacency))
    plt.xticks(tick_positions, adjacency.columns, rotation=90)
    plt.yticks(tick_positions, adjacency.index)
    plt.title("Adjacency Heatmap")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "heatmap.png"))
    plt.show()

In [None]:
# Execute Analysis for all CSV files in data directory
# `count` tracks datasets that were analyzed successfully; `G` and `metrics`
# keep the results of the most recent successful run.
count = 0
if os.path.exists(INPUT_DIR):
    # Sorted so the processing order is reproducible across runs and platforms.
    for filename in sorted(os.listdir(INPUT_DIR)):
        if not filename.endswith(".csv") or "metrics" in filename:
            continue

        # Strip only the trailing extension; str.replace would also remove any
        # ".csv" occurring in the middle of the name.
        config_name = os.path.splitext(filename)[0]

        df = load_dataset(filename)
        if df is None:
            continue

        # analyze_network returns None when it skips a dataset (missing column
        # or no co-occurrences); unpacking that directly would raise TypeError.
        result = analyze_network(df, config_name)
        if result is None:
            print(f"Skipped {filename}: no network produced.")
            continue

        G, metrics = result
        count += 1
else:
    print(f"Directory {INPUT_DIR} does not exist.")

if count == 0:
    print("No CSV files found to analyze.")