# Graph Analysis

## Description
Analyse:
- Topic detection metrics (Modularity, Ncut, NMI) across methods (BERTopic, Louvain, Leiden)
- Sentiment features ('politic', 'mental', 'health', 'emotion', 'tone', 'agency') across topics

## Setup

### Libraries (Python)

In [1]:
import igraph as ig
import json
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Directories

In [2]:
# Resolve the project root: the folder containing this file when run as a
# script, or the current working directory when __file__ is undefined
# (e.g. inside a notebook kernel).
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

# Make modules under src/ importable without installing the project.
SRC_DIR = BASE_DIR / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Input data locations.
DATA_DIR = BASE_DIR / 'data'
# parents=True so a missing ancestor directory does not raise FileNotFoundError.
DATA_DIR.mkdir(parents=True, exist_ok=True)
POSTS_DIR = DATA_DIR / 'posts'
POSTS_ALL_DIR = POSTS_DIR / 'all'
POSTS_FILTERED_DIR = POSTS_DIR / 'filtered'
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR / 'clean'
COMMENTS_DIR = DATA_DIR / 'comments'
COMMENTS_CLEAN_DIR = COMMENTS_DIR / 'clean'

# Output locations.
RESULTS_DIR = BASE_DIR / 'results'
RESULTS_GRAPHS_DIR = RESULTS_DIR / 'graphs'
RESULTS_MODELS_DIR = RESULTS_DIR / 'models'
# results/ itself may not exist yet on a fresh checkout, so create the
# whole chain rather than only the leaf directory.
RESULTS_MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_PLOT_DIR = RESULTS_DIR / 'plots'
RESULTS_MODELS_FILE = RESULTS_MODELS_DIR / 'model_results.csv'

### Libraries (Custom)

## Import Data

In [3]:
# Graph
# Read the GraphML export with networkx, then hand it over to igraph.
filename = RESULTS_GRAPHS_DIR / 'topics_graph.graphml'
g_nx = nx.read_graphml(os.fspath(filename))
g = ig.Graph.from_networkx(g_nx)

# Print igraph's summary of the converted graph.
print(g.summary())

IGRAPH U-W- 16 101 -- 
+ attr: edge_default (g), node_default (g), _nx_name (v), agency (v), emotion (v), health (v), leiden_topic (v), mental (v), node_strength (v), pagerank_unweighted (v), pagerank_weighted (v), politic (v), tone (v), topic_label (v), weight (e)


In [37]:
# Graph df
# Mapping from raw CSV column names to the display names used in the plots.
col_to_rename = {
    'leiden_topic': 'Leiden Topic',
    'politic': 'LIWC-Politic',
    'mental': 'LIWC-Mental',
    'health': 'LIWC-Health',
    'emotion': 'Emotion',
    'tone': 'Tone',
    'agency': 'Agency',
    'pagerank_weighted': 'PageRank Score',
}

# Load the per-topic table, relabel its columns, and discard every row whose
# topic_label mentions "Outliers" (rows with a missing label are kept).
filename_topic_df = DATA_DIR / 'topic_df.csv'
topic_df = (
    pd.read_csv(filename_topic_df)
    .rename(columns=col_to_rename)
    .loc[lambda df: ~df['topic_label'].str.contains("Outliers", na=False)]
)

In [5]:
# Metrics df
# The CSV's unnamed first column is relabelled 'Metric'.
metrics_df = pd.read_csv(RESULTS_MODELS_FILE).rename(
    columns={'Unnamed: 0': 'Metric'}
)

## Analyses

In [40]:
# Append a 'Weighted Avg' summary row: every numeric feature column is
# averaged over the topics, weighted by each topic's PageRank Score.

# Drop a summary row left over from a previous execution of this cell so
# that re-running it neither averages the average nor appends duplicates.
topic_df = topic_df[topic_df['topic_label'] != 'Weighted Avg']

# Columns to average (excluding identifiers and the weighting column itself).
# Restricting to numeric dtypes keeps np.average from failing on text columns.
excluded_cols = ['Leiden Topic', 'topic_label', 'PageRank Score']
cols_to_avg = [
    col
    for col in topic_df.select_dtypes(include='number').columns
    if col not in excluded_cols
]

# Calculate weighted average for each column using PageRank Score as weights
weights = topic_df['PageRank Score']
weighted_avg_values = {
    col: np.average(topic_df[col], weights=weights) for col in cols_to_avg
}

# Construct new row dictionary; the summary row uses topic id 16 and takes
# the plain mean of the weights as its own PageRank Score.
new_row = {
    'Leiden Topic': 16,
    'topic_label': 'Weighted Avg',
    'PageRank Score': weights.mean(),
    **weighted_avg_values,
}

# Concat new row to DataFrame
new_row_df = pd.DataFrame([new_row])
topic_df = pd.concat([topic_df, new_row_df], ignore_index=True)

### Metrics

In [None]:
# Plot results: one subplot per metric, one bar per model.
models = metrics_df.columns[1:]
metrics = metrics_df['Metric'].tolist()

# Define a distinct color for each model
colors = {
    'BERTopic': '#d55a04',
    'BERTopic_reduced': '#e69f00',
    'Louvain': '#0071b2',
    'Leiden': '#56b3e9'
}

# One subplot column per metric present in the results table, so adding or
# removing a metric row does not break the layout.
fig = make_subplots(rows=1, cols=len(metrics), subplot_titles=metrics)

for i, metric in enumerate(metrics, start=1):
    row = metrics_df[metrics_df['Metric'] == metric]

    for model in models:
        score = row[model].values[0]
        fig.add_trace(
            go.Bar(
                x=[model],
                y=[score],
                name=model,
                # Group the per-subplot traces of a model so a legend click
                # toggles that model in every subplot, not only the first.
                legendgroup=model,
                # .get() leaves the color unset for a model that has no
                # entry in `colors` instead of raising KeyError.
                marker_color=colors.get(model),
                text=[f"{score:.3f}"],
                textposition='outside',
                # Show each model once in the legend (first subplot only).
                showlegend=(i == 1)
            ),
            row=1, col=i
        )

    # Set y-axis range per subplot
    fig.update_yaxes(range=[0, 1], row=1, col=i)

fig.update_layout(
    height=400,
    width=900,
    title_text="Model Scores per Metric",
    barmode='group'
)

fig.show()

### Topic Sentiment

In [46]:
# Per-topic horizontal bar charts of the sentiment features, saved as PNGs
# under RESULTS_PLOT_DIR / 'sentiment' and displayed inline.
cols_to_plot = ['LIWC-Politic', 'LIWC-Mental', 'LIWC-Health', 'Emotion', 'Agency']

# Shared value-axis range so all topic plots are drawn on the same scale.
# (The bars are horizontal, so this range is applied to the x-axis.)
max_val = topic_df[cols_to_plot].max().max()
y_range = [-1.25, max(max_val, 2) * 1.1]

blue, vermillion = '#0071b2', '#d55a04'

# write_image does not create missing folders, so make sure the target exists.
sentiment_plot_dir = RESULTS_PLOT_DIR / 'sentiment'
sentiment_plot_dir.mkdir(parents=True, exist_ok=True)


def _build_sentiment_figure(values, title):
    """Return a horizontal bar figure for one topic.

    Args:
        values: Series indexed by feature name holding that topic's scores.
        title: Text shown centred above the plot.

    Returns:
        The plotly figure, styled with white text on a transparent background.
    """
    features = list(values.index)
    df_plot = values.reset_index()
    df_plot.columns = ['feature', 'value']

    # Blue for values >= 0, vermillion otherwise.
    df_plot['color'] = np.where(df_plot['value'] >= 0, blue, vermillion)
    df_plot['text_label'] = df_plot['value'].round(2).astype(str)

    figure = px.bar(
        df_plot, y='feature', x='value',
        labels={'value': 'Value'},
        range_x=y_range,
        color='color',
        # 'identity' uses the hex codes in the column as the bar colors.
        # A discrete color *sequence* is assigned by order of appearance,
        # which swaps the two colors whenever the first bar is negative.
        color_discrete_map='identity',
        text='text_label'
    )

    figure.update_traces(orientation='h', showlegend=False,
                         textposition='outside', textfont_color='white')

    # Keep the features in the listed order regardless of how the bars were
    # split into traces by color.
    figure.update_yaxes(
        categoryorder='array',
        categoryarray=features,
        tickvals=features,
        ticktext=features
    )

    figure.update_layout(
        font=dict(color='white'),
        xaxis=dict(dtick=0.5, color='white'),
        yaxis=dict(color='white'),
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(color='white')
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        yaxis_title='',
        xaxis_title='',
        height=300,
        width=600,
        margin=dict(l=120, r=20, t=50, b=40)
    )
    return figure


for topic in topic_df['Leiden Topic']:
    topic_rows = topic_df.loc[topic_df['Leiden Topic'] == topic]

    # Topic id 16 additionally shows 'Tone'. Build a new list rather than
    # appending to cols_to_plot, which would mutate the shared list while
    # the loop is still running.
    features = (cols_to_plot + ['Tone']) if topic == 16 else cols_to_plot

    fig = _build_sentiment_figure(
        topic_rows[features].iloc[0],
        f'{topic_rows["topic_label"].values[0]}'
    )

    filename = sentiment_plot_dir / f"topic_{topic}.png"
    fig.write_image(filename)
    fig.show()

In [67]:
# Melt to long format for plotting: one row per (topic row, variable) pair,
# keeping LIWC-Politic and PageRank Score as identifier columns.
# NOTE(review): topic_df may still contain the appended 'Weighted Avg' row at
# this point, in which case it is included in the scatter and the trendlines
# - confirm that this is intended.
melted_df = topic_df.melt(
    id_vars=["LIWC-Politic", "PageRank Score"],
    value_vars=["LIWC-Mental", "LIWC-Health", "Emotion", "Agency", "Tone"],
    var_name="Variable",
    value_name="Value"
)

# Scatter plot of every variable against LIWC-Politic (linear axes), with
# marker size given by PageRank Score and an OLS trendline per variable.
fig = px.scatter(
    melted_df,
    x="LIWC-Politic",
    y="Value",
    color="Variable",
    size="PageRank Score",
    trendline="ols",
    labels={
        "LIWC-Politic": "LIWC-Politic",
        "Value": "Score",
        "Variable": "LIWC Variable",
        "PageRank Score": "PageRank Score"
    },
    title="Sentiment Analysis Variables as Function of LIWC-Politic"
)

# Final figure size; the legend title set here replaces the "LIWC Variable" label.
fig.update_layout(legend_title="Variable", height=500, width=750)

filename_trend = RESULTS_PLOT_DIR / 'sentiment' / "trend_by_politic.png"
# write_image does not create missing folders.
filename_trend.parent.mkdir(parents=True, exist_ok=True)
# Write to the trend file itself; the stale `filename` from the per-topic
# cell would overwrite the last topic plot.
fig.write_image(filename_trend)
fig.show()