In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
import warnings
import os

# Set style
# Start from matplotlib's 'default' style before applying the overrides below.
plt.style.use('default')
sns.set_palette("husl")
# Serif font and explicit font sizes applied through rcParams for the figures drawn afterwards.
plt.rcParams.update({
    'font.family': 'serif',
    'font.size': 11,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'figure.titlesize': 14
})
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and numerical warnings from pandas / scikit-learn — consider narrowing it.
warnings.filterwarnings('ignore')

# Output directory
# NOTE(review): absolute, user-specific Windows path; os.makedirs below will create this
# literal directory on whatever machine runs the notebook. A repository-relative path
# would make the notebook portable.
output_dir = r"C:\Users\pavle\OneDrive\Desktop\my github\master-thesis\figures"
os.makedirs(output_dir, exist_ok=True)

# PART 1: Topic Classification Evaluation

Evaluating our topic classification against ParlaCAP predictions and human labels.

In [None]:
# Root folder that holds one sub-folder per country code.
BASE_DATA_DIR = r"data folder"

# Read "<code>/<code>_final.pkl" for each country, in the order GB, AT, HR.
# NOTE(review): unpickling executes code stored in the file — only load pickles you trust.
GB, AT, HR = (
    pd.read_pickle(os.path.join(BASE_DATA_DIR, f"{code}/{code}_final.pkl"))
    for code in ("GB", "AT", "HR")
)

print(f"   AT={AT.shape}, HR={HR.shape}, GB={GB.shape}")

In [None]:
# Ad-hoc check: frequency of each value in HR's 'CAP_Category_HR_croatian' column.
# (Disabled alternative: HR['topic_consensus'].value_counts())

HR['CAP_Category_HR_croatian'].value_counts()

## Confusion Matrix Analysis

In [None]:
def analyze_labels(df, evaluation_col, true_label_col, sample_size='all', min_word_count=None,
                   exclude_roles=None, exclude_topics=None, text_column='Text_English',
                   plot_size=(16, 12), dataset_name="Dataset", random_state=None):
    """Score predicted labels against reference labels and save a confusion-matrix figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `evaluation_col` and `true_label_col`; 'Speaker_role' and
        `text_column` are only read when the corresponding filter is requested.
    evaluation_col : str
        Column with the predicted label.
    true_label_col : str
        Column with the reference label. Rows where it equals '-' and rows where either
        label column is missing are dropped.
    sample_size : 'all' or int
        'all' keeps every row; an int caps the number of rows drawn per reference label.
    min_word_count : int, optional
        If truthy, keep only rows whose `text_column` has at least this many
        whitespace-separated tokens.
    exclude_roles : list, optional
        Values of 'Speaker_role' to drop.
    exclude_topics : list, optional
        Labels to drop when they occur in either label column.
    text_column : str
        Column used for the word-count filter.
    plot_size : tuple
        Figure size passed to matplotlib.
    dataset_name : str
        Shown in the title; its lower-cased form is part of the output file name.
    random_state : optional
        Forwarded to `DataFrame.sample` so that a sampled evaluation can be repeated.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        {'f1_macro', 'f1_micro', 'total'}. When no rows survive the filters, a warning is
        printed, no figure is drawn and both F1 values are NaN with total 0.
    """
    exclude_roles = exclude_roles or []
    exclude_topics = exclude_topics or []

    # Filter
    df = df[~df[true_label_col].isin(['-'])].dropna(subset=[evaluation_col, true_label_col])

    if exclude_roles:
        df = df[~df['Speaker_role'].isin(exclude_roles)]
    if exclude_topics:
        keep = ~df[evaluation_col].isin(exclude_topics) & ~df[true_label_col].isin(exclude_topics)
        df = df[keep]
    if min_word_count:
        word_counts = df[text_column].apply(lambda text: len(str(text).split()))
        df = df[word_counts >= min_word_count]

    # Nothing left to score: pd.concat / sklearn would raise on empty input.
    if df.empty:
        print(f"⚠️ Warning: {dataset_name} has no rows left to evaluate after filtering")
        return {'f1_macro': float('nan'), 'f1_micro': float('nan'), 'total': 0}

    # Sample: at most `sample_size` rows per reference label.
    if sample_size != 'all':
        sampled = []
        for cat in df[true_label_col].unique():
            cat_df = df[df[true_label_col] == cat]
            sampled.append(cat_df.sample(n=min(sample_size, len(cat_df)), random_state=random_state))
        df = pd.concat(sampled, ignore_index=True)

    # Metrics
    y_true = df[true_label_col]
    y_pred = df[evaluation_col]
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)

    # Axis order: reference labels by frequency, followed by labels that only occur among
    # the predictions. Without the latter, those predictions are silently left out of the
    # matrix although they are part of the F1 scores shown in the title.
    categories = y_true.value_counts().index.tolist()
    categories += [label for label in y_pred.value_counts().index if label not in categories]
    conf_matrix = confusion_matrix(y_true, y_pred, labels=categories)

    # Plot
    fig, ax = plt.subplots(figsize=plot_size)
    sns.heatmap(conf_matrix, xticklabels=categories, yticklabels=categories, annot=True,
                fmt='d', cmap='Blues', square=True, annot_kws={'size': 8}, ax=ax)
    ax.set_title(f'{dataset_name}: F1 macro={f1_macro:.2f}, micro={f1_micro:.2f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True Label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, f'confusion_{dataset_name.lower()}.png'), dpi=300, bbox_inches='tight')
    plt.show()

    return {'f1_macro': f1_macro, 'f1_micro': f1_micro, 'total': len(df)}

# Run analysis
# One row per corpus: (name, frame, reference-label column, extra keyword arguments).
evaluation_runs = [
    ("GB", GB, 'True_label', {}),
    ("HR", HR, 'True_label', {}),
    ("AT", AT, 'Topic', {'min_word_count': 70}),
]

results = {}
for corpus_name, corpus_frame, reference_col, extra_kwargs in evaluation_runs:
    results[corpus_name] = analyze_labels(corpus_frame, 'topic_consensus', reference_col,
                                          exclude_topics=['Mix'], dataset_name=corpus_name,
                                          **extra_kwargs)

## Topic Distribution Comparison

In [None]:
def compare_topic_distributions(df, our_col, comparison_col, dataset_name, exclude_topics=None):
    """Plot two topic distributions side by side together with their difference.

    Rows with a missing value in either column, or with an excluded topic in either
    column, are dropped; both distributions are therefore computed on the same rows.

    Parameters
    ----------
    df : pandas.DataFrame
    our_col : str
        Column with our topic label (plotted as 'Our').
    comparison_col : str
        Column with the label to compare against (plotted as 'Reference').
    dataset_name : str
        Shown in the titles; its lower-cased form is part of the output file name.
    exclude_topics : iterable, optional
        Topics to drop. Defaults to ['Other', 'Mix'].

    Returns
    -------
    None
        The figure is saved as 'dist_<dataset_name>.png' in `output_dir` and shown.
        If no rows remain after filtering, a warning is printed and nothing is drawn.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    exclude_topics = ['Other', 'Mix'] if exclude_topics is None else list(exclude_topics)

    clean = df.dropna(subset=[our_col, comparison_col])
    excluded = clean[our_col].isin(exclude_topics) | clean[comparison_col].isin(exclude_topics)
    clean = clean[~excluded]

    # With at least one row both distributions are non-empty, so this is the only
    # emptiness check that can ever trigger.
    if clean.empty:
        print(f"⚠️ Warning: {dataset_name} has no overlapping topics after filtering {exclude_topics}")
        return

    # Percentage share of each topic; topics seen in only one column get 0 in the other.
    our_dist = clean[our_col].value_counts(normalize=True) * 100
    comp_dist = clean[comparison_col].value_counts(normalize=True) * 100
    comparison = pd.DataFrame({'Our': our_dist, 'Reference': comp_dist}).fillna(0)

    # Signed difference in percentage points, ordered by increasing absolute size.
    signed_diff = comparison['Our'] - comparison['Reference']
    diff = signed_diff.reindex(signed_diff.abs().sort_values(ascending=True).index)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    comparison.plot(kind='barh', ax=ax1, width=0.8, alpha=0.8)
    ax1.set_title(f'{dataset_name}: Topic Distribution Comparison')
    ax1.set_xlabel('Percentage (%)')
    ax1.grid(axis='x', alpha=0.3)

    colors = ['red' if value < 0 else 'green' for value in diff]
    diff.plot(kind='barh', ax=ax2, color=colors, alpha=0.7)
    ax2.set_title(f'{dataset_name}: Difference (Our - Reference)')
    ax2.set_xlabel('Percentage Point Difference')
    ax2.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax2.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'dist_{dataset_name.lower()}.png'), dpi=300, bbox_inches='tight')
    plt.show()

# Same comparison for every corpus; only the reference-label column differs.
for corpus_label, corpus_frame, reference_col in (
    ('GB', GB, 'True_label'),
    ('HR', HR, 'True_label'),
    ('AT', AT, 'Topic'),
):
    compare_topic_distributions(corpus_frame, 'topic_consensus', reference_col, corpus_label)

print("✅ Topic classification evaluation complete")

---
# PART 2: LIWC Linguistic Analysis

Analyzing political discourse using LIWC-22 dimensions across countries, topics, and covariates.

In [None]:
# Load LIWC benchmarks
# Built with os.path.join from BASE_DATA_DIR, like the corpus pickle paths, so the path
# separator is not hard-coded to a Windows backslash.
LIWC_statistics_path = os.path.join(BASE_DATA_DIR, "LIWC-22.Descriptive.Statistics-Test.Kitchen.xlsx")

def load_liwc_benchmarks(file_path):
    """Load LIWC-22 population norms from the first sheet of an Excel workbook.

    Layout read by this function:
      * row 0 contains a cell whose stripped text is exactly 'Total'; that column holds
        the means and the column immediately to its right the standard deviations;
      * data start at row index 2, with the dimension name in column 0.

    Rows without a dimension name are skipped, and each dimension is paired with the
    mean / SD found on its own row, so blank separator rows cannot shift the values.

    Parameters
    ----------
    file_path : str
        Path handed to `pandas.read_excel`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'; rows whose mean or SD is not numeric
        are dropped.

    Raises
    ------
    ValueError
        If row 0 has no 'Total' cell, or no column follows it.
    """
    raw = pd.read_excel(file_path, sheet_name=0, header=None)

    total_positions = [pos for pos, cell in enumerate(raw.iloc[0]) if str(cell).strip() == 'Total']
    if not total_positions:
        raise ValueError(f"No 'Total' header found in the first row of {file_path!r}")
    mean_pos = total_positions[0]
    std_pos = mean_pos + 1
    if std_pos >= raw.shape[1]:
        raise ValueError(f"No standard-deviation column follows 'Total' in {file_path!r}")

    # Keep only the rows that actually name a dimension, then read every value from
    # those same rows.
    body = raw.iloc[2:]
    body = body[body.iloc[:, 0].notna()]

    norms = pd.DataFrame({
        'Dimension': body.iloc[:, 0].to_numpy(),
        'Mean': pd.to_numeric(body.iloc[:, mean_pos], errors='coerce').to_numpy(),
        'Std': pd.to_numeric(body.iloc[:, std_pos], errors='coerce').to_numpy(),
    })
    return norms.dropna()

LIWC_benchmarks = load_liwc_benchmarks(LIWC_statistics_path)

# Stack the three corpora, give the topic columns their analysis names, then keep only
# policy speeches: no 'Mix' / 'Other' topics and no chairperson turns.
LIWC_ALL = (
    pd.concat([AT, HR, GB], ignore_index=True)
    .rename(columns={'topic_consensus': 'Our_Topic', 'Topic': 'ParlaCAP'})
    .loc[lambda frame: ~frame['Our_Topic'].isin(['Mix', 'Other'])]
    .loc[lambda frame: frame['Speaker_role'] != 'Chairperson']
)

# LIWC columns used throughout Part 2, one thematic group per line.
KEY_LIWC_DIMENSIONS = [
    # summary variables
    'Analytic', 'Clout', 'Authentic', 'Tone',
    # pronouns
    'i', 'we', 'you', 'they', 'ipron', 'ppron',
    # time orientation
    'focuspast', 'focuspresent', 'focusfuture',
    # cognitive processes
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certitude',
    # emotion
    'Affect', 'tone_pos', 'tone_neg',
    # social
    'Social', 'conflict', 'moral',
    # power & politics
    'power', 'politic', 'money', 'work',
]

n_speeches = len(LIWC_ALL)
country_names = LIWC_ALL['Country'].unique().tolist()
print(f"✅ LIWC analysis ready: {n_speeches:,} speeches")
print(f"   Countries: {country_names}")
print(f"   Key dimensions: {len(KEY_LIWC_DIMENSIONS)}")

In [None]:
def create_country_heatmap(data, benchmarks, dimensions):
    """Heatmap of per-country LIWC z-scores (relative to the benchmark norms), grouped by theme.

    Rows are LIWC dimensions arranged in thematic groups, columns are the countries plus
    an 'All Countries' column computed over every row of `data`. Each cell is
    (mean of the dimension - benchmark mean) / benchmark SD.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a 'Country' column and one column per entry of `dimensions`.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'; every requested dimension must be listed.
    dimensions : list of str
        Dimensions to show. Only group members that are requested are drawn; requested
        dimensions that belong to no predefined group are appended under 'Other'
        instead of being dropped.

    The figure is saved as 'liwc_country_zscores.png' in `output_dir` and shown.
    """
    dimensions = list(dimensions)

    country_means = data.groupby('Country')[dimensions].mean()
    country_means.loc['All Countries'] = data[dimensions].mean()

    benchmark_lookup = benchmarks.set_index('Dimension')
    norm_mean = benchmark_lookup.loc[dimensions, 'Mean']
    norm_std = benchmark_lookup.loc[dimensions, 'Std']
    # DataFrame-minus-Series aligns on the dimension names (the columns).
    z_scores = (country_means - norm_mean) / norm_std

    # (label drawn left of the heatmap, candidate members in display order)
    group_spec = [
        ('Summary\nVariables', ['Analytic', 'Clout', 'Authentic', 'Tone']),
        ('Personal\nPronouns', ['i', 'we', 'you', 'shehe', 'they', 'ipron', 'ppron']),
        ('Time\nOrientation', ['focuspast', 'focuspresent', 'focusfuture']),
        ('Cognitive\nProcesses', ['cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certitude']),
        ('Emotion', ['Affect', 'tone_pos', 'tone_neg']),
        ('Social', ['Social', 'conflict', 'moral']),
        ('Power\n&\nPolitics', ['power', 'politic', 'money', 'work']),
    ]
    grouped = {member for _, members in group_spec for member in members}
    ungrouped = [dim for dim in dimensions if dim not in grouped]
    if ungrouped:
        group_spec.append(('Other', ungrouped))

    # Build the row order plus, per group, its label span and the separator below it.
    ordered_dims = []
    category_labels = []
    separator_positions = []
    for label_text, members in group_spec:
        present = [member for member in members if member in dimensions]
        if not present:
            continue
        start_pos = len(ordered_dims)
        ordered_dims.extend(present)
        category_labels.append((label_text, start_pos, len(ordered_dims)))
        separator_positions.append(len(ordered_dims))

    z_scores_ordered = z_scores[ordered_dims].T

    fig, ax = plt.subplots(figsize=(10, 20))
    sns.heatmap(z_scores_ordered, annot=True, fmt=".2f", cmap="RdBu_r", center=0,
                cbar_kws={'label': 'Z-Score (from population norm)'}, vmin=-2, vmax=2, ax=ax, square=False)

    # Separator lines between groups (none after the last one).
    for pos in separator_positions[:-1]:
        ax.axhline(y=pos, color='black', linewidth=2)

    # Group labels to the left of the heatmap, centred on their rows.
    for label_text, start, end in category_labels:
        y_pos = (start + end) / 2
        ax.text(-0.5, y_pos, label_text,
                fontsize=11, fontweight='bold', color='darkblue',
                verticalalignment='center', horizontalalignment='right')

    ax.set_title("Political Discourse: LIWC Z-Scores vs. Population Norms\n(All KEY_LIWC_DIMENSIONS)",
                 fontweight='bold', fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'liwc_country_zscores.png'), dpi=300, bbox_inches='tight')
    plt.show()

create_country_heatmap(LIWC_ALL, LIWC_benchmarks, KEY_LIWC_DIMENSIONS)

## Topic-LIWC Interaction Analysis

In [None]:
def create_topic_liwc_heatmap(data, benchmarks, topic_col='Our_Topic',
                               top_n_topics=8, dimensions=None, countries=None):
    """One heatmap per country: mean LIWC z-score of each dimension within each topic.

    Every panel shows the fixed dimensions first, followed by the five other dimensions
    with the largest mean absolute z-score across that country's topics (so the second
    half of the rows can differ between panels).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs `topic_col`, 'Country' and the LIWC dimension columns.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'. Dimensions without a norm are skipped.
    topic_col : str
        Topic column; only the `top_n_topics` most frequent topics (over all rows of
        `data`) are kept.
    top_n_topics : int
    dimensions : list of str, optional
        Candidate dimensions; defaults to the notebook-level KEY_LIWC_DIMENSIONS.
    countries : list of str, optional
        Panels to draw; defaults to ['Austria', 'Croatia', 'Great Britain'].
        A country without rows gets a 'No data' panel instead of aborting the figure.

    The figure is saved as 'topic_liwc_interaction.png' in `output_dir` and shown.
    """
    if dimensions is None:
        dimensions = KEY_LIWC_DIMENSIONS
    if countries is None:
        countries = ['Austria', 'Croatia', 'Great Britain']

    # Fixed dimensions (always shown first)
    fixed_dims = ['Analytic', 'Clout', 'Authentic', 'Tone', 'power']
    n_ranked = 5

    # Get top topics by frequency
    top_topics = data[topic_col].value_counts().head(top_n_topics).index.tolist()
    filtered_data = data[data[topic_col].isin(top_topics)].copy()

    benchmark_lookup = benchmarks.set_index('Dimension')
    candidates = list(dict.fromkeys(fixed_dims + list(dimensions)))
    scored_dims = [dim for dim in candidates if dim in benchmark_lookup.index]
    fixed_present = [dim for dim in fixed_dims if dim in scored_dims]
    ranked_pool = [dim for dim in scored_dims if dim not in fixed_dims]

    # Explicit name mapping: the plain name is restored by assignment later, not by
    # stripping '_z' with str.replace (which would also hit '_z' inside a name).
    z_col = {dim: f'{dim}_z' for dim in scored_dims}
    for dim in scored_dims:
        mean = benchmark_lookup.loc[dim, 'Mean']
        std = benchmark_lookup.loc[dim, 'Std']
        filtered_data[z_col[dim]] = (filtered_data[dim] - mean) / std

    n_panels = len(countries)
    fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 12), squeeze=False)
    axes = axes[0]

    for idx, country in enumerate(countries):
        ax = axes[idx]
        country_data = filtered_data[filtered_data['Country'] == country]

        if country_data.empty:
            ax.set_title(country, fontweight='bold', fontsize=14)
            ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
            ax.set_axis_off()
            continue

        # Mean z-score per topic, columns renamed back to the plain dimension names.
        topic_means = country_data.groupby(topic_col)[[z_col[dim] for dim in scored_dims]].mean()
        topic_means.columns = scored_dims

        # Rank the non-fixed dimensions by mean absolute z-score across topics.
        avg_abs_z = topic_means[ranked_pool].abs().mean(axis=0).sort_values(ascending=False)
        top_other = avg_abs_z.head(n_ranked).index.tolist()

        topic_means_selected = topic_means[fixed_present + top_other]

        # Order topics by frequency within this country
        topic_order = country_data[topic_col].value_counts().index
        topic_means_selected = topic_means_selected.reindex(topic_order)

        # Dimensions as rows
        sns.heatmap(topic_means_selected.T, annot=True, fmt='.2f', cmap='RdBu_r',
                    center=0, vmin=-1.5, vmax=1.5, ax=ax,
                    cbar_kws={'label': 'Z-Score'}, square=False)

        # Separator between the fixed block and the ranked block
        ax.axhline(y=len(fixed_present), color='black', linewidth=2)

        ax.set_title(country, fontweight='bold', fontsize=14)
        ax.set_xlabel('Policy Topic', fontweight='bold')
        ax.set_ylabel('LIWC Dimension' if idx == 0 else '', fontweight='bold')

        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    fig.suptitle('Topic-LIWC Interaction: Linguistic Style by Policy Domain and Country',
                 fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'topic_liwc_interaction.png'),
                dpi=300, bbox_inches='tight')
    plt.show()

create_topic_liwc_heatmap(LIWC_ALL, LIWC_benchmarks)

In [None]:
def create_focal_topic_analysis(data, benchmarks, focal_topics, dimensions=None):
    """Heatmaps of the z-score gap between each focal topic and all other topics.

    For every focal topic and every country (plus 'All Countries'), each cell is
    z(mean within the focal topic) - z(mean within all other topics), where z uses the
    benchmark mean and SD of that dimension. Rows are the fixed dimensions followed by
    the five other dimensions with the largest absolute gap, averaged over all
    topic/country combinations; all panels share the same rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'Country', 'Our_Topic' and the LIWC dimension columns.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'. Dimensions without a norm are skipped.
    focal_topics : list of str
        One panel per topic.
    dimensions : list of str, optional
        Candidates for the ranked rows; defaults to the notebook-level
        KEY_LIWC_DIMENSIONS.

    Raises
    ------
    ValueError
        If `focal_topics` is empty.

    The figure is saved as 'focal_topics_analysis.png' in `output_dir` and shown.
    """
    if dimensions is None:
        dimensions = KEY_LIWC_DIMENSIONS
    focal_topics = list(focal_topics)
    if not focal_topics:
        raise ValueError("focal_topics must contain at least one topic")

    # Fixed dimensions (always shown first)
    fixed_dims = ['Analytic', 'Authentic', 'Clout', 'Tone', 'power']
    n_ranked = 5

    countries = ['Austria', 'Croatia', 'Great Britain', 'All Countries']

    # Built once; it does not depend on topic or country.
    benchmark_lookup = benchmarks.set_index('Dimension')
    fixed_present = [dim for dim in fixed_dims if dim in benchmark_lookup.index]
    ranked_pool = [dim for dim in dimensions
                   if dim in benchmark_lookup.index and dim not in fixed_dims]
    scored_dims = fixed_present + ranked_pool
    norm_mean = benchmark_lookup.loc[scored_dims, 'Mean']
    norm_std = benchmark_lookup.loc[scored_dims, 'Std']

    def _focal_minus_other(country, focal_topic):
        """Signed z-score gap (focal topic minus every other topic) for all scored dimensions."""
        subset = data if country == 'All Countries' else data[data['Country'] == country]
        is_focal = subset['Our_Topic'] == focal_topic
        focal_z = (subset.loc[is_focal, scored_dims].mean() - norm_mean) / norm_std
        other_z = (subset.loc[~is_focal, scored_dims].mean() - norm_mean) / norm_std
        return (focal_z - other_z).reindex(scored_dims)

    # Single pass: one (dimension x country) table of signed gaps per focal topic,
    # reused for both the ranking and the plot.
    gaps = {
        focal_topic: pd.DataFrame({country: _focal_minus_other(country, focal_topic)
                                   for country in countries})
        for focal_topic in focal_topics
    }

    # Rank the non-fixed dimensions by absolute gap averaged over every topic/country
    # cell. Sorting by name first makes ties resolve alphabetically.
    pooled_abs = pd.concat([table.loc[ranked_pool].abs() for table in gaps.values()], axis=1)
    top_other = pooled_abs.mean(axis=1).sort_index().nlargest(n_ranked).index.tolist()
    selected_dims = fixed_present + top_other

    results = {focal_topic: table.loc[selected_dims] for focal_topic, table in gaps.items()}

    # Create subplots
    n_topics = len(focal_topics)
    fig, axes = plt.subplots(1, n_topics, figsize=(12 * n_topics, 8), squeeze=False)
    axes = axes[0]

    for idx, focal_topic in enumerate(focal_topics):
        ax = axes[idx]

        sns.heatmap(results[focal_topic], annot=True, fmt='.2f', cmap='RdBu_r',
                    center=0, vmin=-1, vmax=1, ax=ax, square=False,
                    cbar_kws={'label': 'Z-Score Difference (Focal Topic - Other)'})

        # Separator between the fixed block and the ranked block
        ax.axhline(y=len(fixed_present), color='black', linewidth=2)

        ax.set_title(f'{focal_topic} vs Other Topics', fontweight='bold', fontsize=13)
        ax.set_xlabel('')
        ax.set_ylabel('LIWC Dimension' if idx == 0 else '', fontweight='bold')

    fig.suptitle('Focal Topics Analysis: Linguistic Differences',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'focal_topics_analysis.png'),
                dpi=300, bbox_inches='tight')
    plt.show()

create_focal_topic_analysis(LIWC_ALL, LIWC_benchmarks,
                            focal_topics=['Macroeconomics', 'Health'])

## Political Covariates Analysis

In [None]:
def create_covariate_heatmap_with_counts(data, benchmarks, covariate_col, categories,
                                         dimensions, title, filename, order_by_effect=False):
    """Heatmap of mean LIWC z-scores per covariate category, with sample sizes in the labels.

    Each cell is the category mean of (value - benchmark mean) / benchmark SD, i.e. a
    per-group z-score, not a difference between groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs `covariate_col` and the LIWC dimension columns.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'. Dimensions without a norm are skipped.
    covariate_col : str
        Grouping column.
    categories : list
        Values of `covariate_col` to include.
    dimensions : list of str
        LIWC dimensions to show (rows).
    title : str
        Figure title.
    filename : str
        Output file name inside `output_dir`.
    order_by_effect : bool
        With exactly two categories present, order the rows by the absolute difference
        between the two group means (largest first).

    If no row matches `categories`, a warning is printed and nothing is drawn.
    """
    filtered = data[data[covariate_col].isin(categories)].copy()

    # Get sample sizes
    sample_sizes = filtered[covariate_col].value_counts()

    # Z-score normalize
    benchmark_lookup = benchmarks.set_index('Dimension')
    scored_dims = [dim for dim in dimensions if dim in benchmark_lookup.index]
    for dim in scored_dims:
        mean = benchmark_lookup.loc[dim, 'Mean']
        std = benchmark_lookup.loc[dim, 'Std']
        filtered[f'{dim}_z'] = (filtered[dim] - mean) / std

    means = filtered.groupby(covariate_col)[[f'{dim}_z' for dim in scored_dims]].mean()
    means.columns = scored_dims

    if means.empty:
        print(f"⚠️ Warning: no rows with {covariate_col} in {categories}")
        return

    # Order by effect size if requested; needs both groups to be present.
    if order_by_effect and len(categories) == 2 and len(means) == 2:
        diff = (means.iloc[0] - means.iloc[1]).abs().sort_values(ascending=False)
        means = means[diff.index]

    # A horizontal line is drawn directly below the 'power' row when that row exists.
    separator_position = None
    if 'power' in means.columns:
        separator_position = list(means.columns).index('power') + 1

    fig, ax = plt.subplots(figsize=(10, 8))
    # The cells are per-group means, so the colour bar is labelled as a plain z-score.
    sns.heatmap(means.T, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-1, vmax=1, ax=ax, square=False,
                cbar_kws={'label': 'Z-Score'})

    # Add separator line
    if separator_position:
        ax.axhline(y=separator_position, color='black', linewidth=2)

    ax.set_title(title, fontweight='bold')

    # Update column labels with sample sizes
    col_labels = [f"{cat}\n(N={sample_sizes[cat]:,})" for cat in means.index]
    ax.set_xticklabels(col_labels, rotation=0)

    # Sample-size footnote for the two known covariates
    coalition_n = sample_sizes.get('Coalition', 0)
    opp_n = sample_sizes.get('Opposition', 0)

    if coalition_n and opp_n:
        bottom_text = f"(Coal={coalition_n:,}, Opp={opp_n:,})"
        fig.text(0.5, -0.02, bottom_text, ha='center', fontsize=10)
    elif 'F' in sample_sizes.index and 'M' in sample_sizes.index:
        f_n = sample_sizes['F']
        m_n = sample_sizes['M']
        bottom_text = f"(F={f_n:,}, M={m_n:,})"
        fig.text(0.5, -0.02, bottom_text, ha='center', fontsize=10)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()

def create_country_split_heatmap(data, benchmarks, covariate_col, categories,
                                 dimensions, title, filename):
    """Heatmap of the z-score difference between two covariate groups, one column per country.

    Rows are five fixed dimensions followed by the five other dimensions with the largest
    absolute difference averaged over the columns; columns are the three countries plus
    'All Countries'.

    The difference is always "alphabetically first category minus the second" (e.g.
    Coalition - Opposition, F - M), independent of the order in `categories`. This is
    the order a groupby on a plain string column produced implicitly before; it is now
    explicit and matches the colour-bar label.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'Country', `covariate_col` and the LIWC dimension columns.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'. Dimensions without a norm are skipped.
    covariate_col : str
        Column holding the two groups.
    categories : list
        Exactly two values of `covariate_col`.
    dimensions : list of str
        Candidate LIWC dimensions.
    title : str
        Figure title.
    filename : str
        Output file name inside `output_dir`.

    Raises
    ------
    ValueError
        If `categories` does not contain exactly two values.

    A country in which one of the groups is absent yields a NaN column instead of an
    IndexError.
    """
    if len(categories) != 2:
        raise ValueError(f"categories must contain exactly two values, got {list(categories)!r}")

    countries = ['Austria', 'Croatia', 'Great Britain', 'All Countries']

    # Fixed dimensions (always shown first)
    fixed_dims = ['Analytic', 'Clout', 'Authentic', 'Tone', 'power']

    benchmark_lookup = benchmarks.set_index('Dimension')
    scored_dims = [dim for dim in dimensions if dim in benchmark_lookup.index]
    z_cols = [f'{dim}_z' for dim in scored_dims]

    minuend, subtrahend = sorted(categories)

    results = {}
    for country in countries:
        country_data = data if country == 'All Countries' else data[data['Country'] == country]
        filtered = country_data[country_data[covariate_col].isin(categories)].copy()

        # Z-score normalize
        for dim, z_col in zip(scored_dims, z_cols):
            mean = benchmark_lookup.loc[dim, 'Mean']
            std = benchmark_lookup.loc[dim, 'Std']
            filtered[z_col] = (filtered[dim] - mean) / std

        means = filtered.groupby(covariate_col)[z_cols].mean()
        means.columns = scored_dims
        # reindex guarantees both rows exist (NaN when a group has no speeches here).
        means = means.reindex([minuend, subtrahend])

        results[country] = means.loc[minuend] - means.loc[subtrahend]

    # Dimensions as rows, countries as columns
    all_results = pd.DataFrame(results).reindex(scored_dims)

    # Order by absolute difference and select top 5 non-fixed dimensions
    non_fixed_dims = [dim for dim in all_results.index if dim not in fixed_dims]
    avg_abs = all_results.loc[non_fixed_dims].abs().mean(axis=1).sort_values(ascending=False)
    top_5_other = avg_abs.head(5).index.tolist()

    # Combine fixed + top 5
    fixed_present = [dim for dim in fixed_dims if dim in all_results.index]
    all_results = all_results.loc[fixed_present + top_5_other]

    # Label built from the groups actually subtracted; F / M are spelled out.
    display_name = {'F': 'Female', 'M': 'Male'}
    cbar_label = (f"Z-Score Diff ({display_name.get(minuend, minuend)} - "
                  f"{display_name.get(subtrahend, subtrahend)})")

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(all_results, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-1, vmax=1, ax=ax, square=False,
                cbar_kws={'label': cbar_label})

    # Separator between the fixed block and the ranked block
    ax.axhline(y=len(fixed_present), color='black', linewidth=2)

    ax.set_title(title, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()

# Coalition vs Opposition and Gender Differences - 10 dimensions with separator.
# Each entry: (covariate column, category pair, figure title, output file name).
split_specs = [
    ('Party_status', ['Coalition', 'Opposition'],
     'Coalition vs Opposition: Top 10 Linguistic Differences', 'liwc_party_status.png'),
    ('Speaker_gender', ['M', 'F'],
     'Gender Differences: Top 10 Linguistic Dimensions', 'liwc_gender.png'),
]

for covariate, category_pair, heading, out_name in split_specs:
    create_country_split_heatmap(LIWC_ALL, LIWC_benchmarks, covariate,
                                 category_pair, KEY_LIWC_DIMENSIONS,
                                 heading, out_name)

# Political Orientation (per-party heatmap)
def create_political_orientation_heatmap(data, benchmarks, dimensions):
    """Heatmap of mean LIWC z-scores per party, parties ordered by number of speeches.

    NOTE(review): a mapping from parties to left/right orientation is not implemented;
    the previous body only held an unused placeholder dictionary. The columns are
    individual parties, which the subtitle and x-axis label now say.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'Party_name' and the LIWC dimension columns; rows without a party name
        are dropped.
    benchmarks : pandas.DataFrame
        Columns 'Dimension', 'Mean', 'Std'.
    dimensions : list of str
        Dimensions available for display. Of the ten display dimensions, only those
        that are listed here and have a benchmark norm are drawn.

    The figure is saved as 'political_orientation.png' in `output_dir` and shown.
    """
    filtered = data.dropna(subset=['Party_name']).copy()

    benchmark_lookup = benchmarks.set_index('Dimension')

    # Key dimensions for display, restricted to what can actually be z-scored.
    display_dims = ['Analytic', 'Authentic', 'Clout', 'Tone', 'power',
                    'moral', 'politic', 'ipron', 'cogproc', 'you']
    shown_dims = [dim for dim in display_dims
                  if dim in dimensions and dim in benchmark_lookup.index]

    # Z-score normalize
    for dim in shown_dims:
        mean = benchmark_lookup.loc[dim, 'Mean']
        std = benchmark_lookup.loc[dim, 'Std']
        filtered[f'{dim}_z'] = (filtered[dim] - mean) / std

    # Group by party and calculate means
    party_means = filtered.groupby('Party_name')[[f'{dim}_z' for dim in shown_dims]].mean()
    party_means.columns = shown_dims

    # Sample sizes; value_counts is sorted by frequency, which also fixes the column order.
    party_counts = filtered['Party_name'].value_counts()
    party_means = party_means.loc[party_counts.index]

    # Create labels with sample sizes
    party_labels = [f"{party}\n(N={party_counts[party]:,})"
                    for party in party_means.index]

    fig, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(party_means.T, annot=True, fmt='.2f', cmap='RdBu_r',
                center=0, vmin=-2, vmax=2, ax=ax, square=False,
                cbar_kws={'label': 'Z-Score'})

    ax.set_xticklabels(party_labels, rotation=45, ha='right')
    ax.set_title('Political Orientation Differences: Linguistic Dimensions\n(Parties ordered by number of speeches)',
                 fontweight='bold', fontsize=14)
    ax.set_xlabel('Party', fontweight='bold')
    ax.set_ylabel('LIWC Dimension', fontweight='bold')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'political_orientation.png'),
                dpi=300, bbox_inches='tight')
    plt.show()

# Age Groups
if 'Speaker_age' in LIWC_ALL.columns:
    age_data = LIWC_ALL.dropna(subset=['Speaker_age']).copy()
    # pd.cut uses right-closed bins by default: (0, 35], (35, 50], (50, 65], (65, 100].
    # The labels follow those edges, so age 35 belongs to the first group.
    # NOTE(review): labels assume whole-year ages; ages above 100 fall outside every bin.
    age_data['Age_Group'] = pd.cut(age_data['Speaker_age'],
                                     bins=[0, 35, 50, 65, 100],
                                     labels=['≤35', '36-50', '51-65', '66+'])

    age_dims = ['Analytic', 'Authentic', 'Clout', 'Tone', 'power',
                'ipron', 'politic', 'certitude', 'money', 'Social']

    # Get sample sizes
    age_counts = age_data['Age_Group'].value_counts()

    # Z-score and group (dimensions without a benchmark norm are skipped)
    benchmark_lookup = LIWC_benchmarks.set_index('Dimension')
    scored_age_dims = [dim for dim in age_dims if dim in benchmark_lookup.index]
    for dim in scored_age_dims:
        mean = benchmark_lookup.loc[dim, 'Mean']
        std = benchmark_lookup.loc[dim, 'Std']
        age_data[f'{dim}_z'] = (age_data[dim] - mean) / std

    # observed=False keeps a column for every age group, including empty ones.
    means = age_data.groupby('Age_Group', observed=False)[[f'{dim}_z' for dim in scored_age_dims]].mean()
    means.columns = scored_age_dims

    # Create labels with sample sizes
    age_labels = [f"{age}\n(N={age_counts[age]:,})" for age in means.index]

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(means.T, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-2, vmax=2, ax=ax, square=False,
                cbar_kws={'label': 'Z-Score'})

    # Add separator line after first 5 dimensions
    ax.axhline(y=5, color='black', linewidth=2)

    ax.set_xticklabels(age_labels, rotation=0)
    ax.set_title('Age Group Differences: Linguistic Dimensions', fontweight='bold')
    ax.set_xlabel('Age Group', fontweight='bold')
    ax.set_ylabel('LIWC Dimension', fontweight='bold')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'liwc_age_groups.png'),
                dpi=300, bbox_inches='tight')
    plt.show()

print("✅ All covariate analyses complete")

## Temporal Evolution Analysis

In [None]:
def create_temporal_plot(data, benchmarks, dimensions, countries, title, filename, add_events=True):
    """Plot monthly benchmark-standardised LIWC scores over time, one panel per country.

    Each dimension is z-scored against its benchmark mean/std, averaged per
    calendar month within a country, smoothed with a centred 3-row rolling mean
    and drawn as one line. All panels share the x-axis and span the overall
    date range found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (datetime-like; the ``.dt`` accessor is used),
        'Country', and one column per entry in ``dimensions``.
    benchmarks : pandas.DataFrame
        Must contain 'Dimension', 'Mean' and 'Std' columns.
    dimensions : list of str
        Dimension names to plot; each must appear in ``benchmarks['Dimension']``.
    countries : list of str
        Values of ``data['Country']`` to plot, one stacked panel each.
    title : str
        Figure-level title.
    filename : str
        File name of the saved figure, joined onto the module-level ``output_dir``.
    add_events : bool, default True
        When True, draw both the background shading for political periods and
        the dashed event markers for countries that have entries in the
        hard-coded tables below.

    Raises
    ------
    KeyError
        If any requested dimension is missing from ``benchmarks``.
    ValueError
        If ``countries`` is empty.
    """
    if len(countries) == 0:
        raise ValueError("countries must contain at least one country")

    benchmark_lookup = benchmarks.set_index('Dimension')
    # Fail early with a readable message instead of an opaque .loc KeyError
    missing_dims = [d for d in dimensions if d not in benchmark_lookup.index]
    if missing_dims:
        raise KeyError(f"Dimensions missing from benchmarks: {missing_dims}")

    temporal_data = data[data['Date'].notna()].copy()

    for dim in dimensions:
        mean = benchmark_lookup.loc[dim, 'Mean']
        std = benchmark_lookup.loc[dim, 'Std']
        temporal_data[f'{dim}_z'] = (temporal_data[dim] - mean) / std

    # Determine overall date range from all countries (used as the shared x-limits)
    min_date = temporal_data['Date'].min()
    max_date = temporal_data['Date'].max()

    # Key events to annotate: (ISO date, label) per country
    events = {
        'Austria': [
            ('2008-09-15', 'Financial\nCrisis'),
            ('2015-09-01', 'Refugee\nCrisis'),
            ('2020-03-16', 'COVID-19'),
            ('2022-02-24', 'Ukraine\nWar')
        ],
        'Croatia': [
            ('2008-09-15', 'Financial\nCrisis'),
            ('2013-07-01', 'EU\nAccession'),
            ('2015-09-01', 'Refugee\nCrisis'),
            ('2020-03-16', 'COVID-19'),
            ('2022-02-24', 'Ukraine\nWar')
        ],
        'Great Britain': [
            ('2016-06-23', 'Brexit\nVote'),
            ('2020-01-31', 'Brexit'),
            ('2020-03-23', 'COVID-19'),
            ('2022-02-24', 'Ukraine\nWar')
        ]
    }

    # Political periods: (start, end, colour), two colours alternating per period.
    # NOTE(review): the boundaries look like government-change dates rather than
    # election days — confirm before describing them as elections in captions.
    periods = {
        'Austria': [
            ('1996-01-01', '2000-02-04', '#8DB4E2'),
            ('2000-02-04', '2007-01-11', '#C4D79B'),
            ('2007-01-11', '2017-12-18', '#8DB4E2'),
            ('2017-12-18', '2019-06-03', '#C4D79B'),
            ('2019-06-03', '2020-01-07', '#8DB4E2'),
            ('2020-01-07', '2022-12-31', '#C4D79B')
        ],
        'Croatia': [
            ('2004-01-01', '2011-12-23', '#8DB4E2'),
            ('2011-12-23', '2016-01-22', '#C4D79B'),
            ('2016-01-22', '2022-12-31', '#8DB4E2')
        ],
        'Great Britain': [
            ('2015-01-01', '2016-07-13', '#8DB4E2'),
            ('2016-07-13', '2019-07-24', '#C4D79B'),
            ('2019-07-24', '2022-12-31', '#8DB4E2')
        ]
    }

    # One row per requested country (previously fixed at 3 rows regardless of
    # len(countries)). Height scales so three countries still yield 20x14 inches;
    # squeeze=False keeps `axes` indexable even for a single country.
    n_rows = len(countries)
    fig, axes = plt.subplots(n_rows, 1, figsize=(20, 14 * n_rows / 3),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]
    fig.suptitle(title, fontsize=16, fontweight='bold', y=0.995)

    z_columns = [f'{d}_z' for d in dimensions]

    for idx, country in enumerate(countries):
        ax = axes[idx]

        # Add background shading for political periods
        if add_events and country in periods:
            for start, end, color in periods[country]:
                ax.axvspan(pd.to_datetime(start), pd.to_datetime(end),
                           alpha=0.15, color=color, zorder=0)

        # Plot data: monthly means of the z-scores for this country
        country_data = temporal_data[temporal_data['Country'] == country].copy()
        country_data['YearMonth'] = country_data['Date'].dt.to_period('M')

        monthly = country_data.groupby('YearMonth')[z_columns].mean()
        monthly.columns = list(dimensions)
        monthly.index = monthly.index.to_timestamp()
        # rolling() works on rows, so the window covers three consecutive months
        # that have data, which need not be three consecutive calendar months.
        monthly_smooth = monthly.rolling(window=3, center=True).mean()

        for dim in monthly_smooth.columns:
            ax.plot(monthly_smooth.index, monthly_smooth[dim], linewidth=2.5,
                    label=dim, alpha=0.9, zorder=2)

        # Add event annotations
        if add_events and country in events:
            for event_date, event_label in events[country]:
                event_dt = pd.to_datetime(event_date)
                if min_date <= event_dt <= max_date:
                    # Draw vertical line
                    ax.axvline(x=event_dt, color='red', linestyle='--',
                               linewidth=1.5, alpha=0.6, zorder=1)

                    # Label near the top of the panel. The blended transform
                    # takes x in data coordinates and y in axes coordinates, so
                    # the label stays at 95% of the panel height whatever the
                    # final y-limits are (ylim[1] * 0.95 misplaced it whenever
                    # the upper limit was zero or negative).
                    ax.text(event_dt, 0.95, event_label,
                            transform=ax.get_xaxis_transform(),
                            rotation=90, verticalalignment='top', horizontalalignment='right',
                            fontsize=8, color='red', fontweight='bold', alpha=0.8)

        ax.axhline(y=0, color='gray', linestyle='-', alpha=0.5, linewidth=1.5, zorder=1)
        ax.set_title(country, fontweight='bold', loc='left', fontsize=13)
        ax.set_ylabel('Z-Score (3-month avg)', fontweight='bold')
        # The x-axis is shared, so only the bottom panel carries the axis label
        if idx == n_rows - 1:
            ax.set_xlabel('Year', fontweight='bold', fontsize=12)
        ax.grid(True, alpha=0.2, linestyle=':', zorder=0)
        ax.set_xlim(min_date, max_date)

        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)

        # Legend on the first panel only, restricted to the dimension lines
        if idx == 0:
            handles, labels = ax.get_legend_handles_labels()
            dim_entries = [(h, l) for h, l in zip(handles, labels) if l in dimensions]
            ax.legend([h for h, _ in dim_entries], [l for _, l in dim_entries],
                      loc='upper left', ncol=len(dimensions), fontsize=9, framealpha=0.9)

    plt.tight_layout(rect=[0, 0.01, 1, 0.99])
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()

# Every temporal figure shows the same three countries, one panel each
temporal_countries = ['Austria', 'Croatia', 'Great Britain']

# One entry per figure, in plotting order: (dimensions, figure title, output file)
temporal_figures = [
    # Summary Variables
    (['Analytic', 'Authentic', 'Clout', 'Tone'],
     'Temporal Evolution: Summary Variables', 'liwc_temporal_summary.png'),
    # Political Language
    (['politic', 'power', 'moral', 'money'],
     'Temporal Evolution: Political Language', 'liwc_temporal_political.png'),
    # Pronouns
    (['i', 'we', 'you', 'they'],
     'Temporal Evolution: Pronoun Usage', 'liwc_temporal_pronouns.png'),
    # Cognitive Processes
    (['cogproc', 'insight', 'cause', 'certitude'],
     'Temporal Evolution: Cognitive Processes', 'liwc_temporal_cognitive.png'),
    # Temporal Focus
    (['focuspast', 'focuspresent', 'focusfuture'],
     'Temporal Evolution: Time Orientation', 'liwc_temporal_time.png'),
    # Affect and Tone
    (['Affect', 'tone_pos', 'tone_neg'],
     'Temporal Evolution: Emotional Language', 'liwc_temporal_affect.png'),
]

for figure_dims, figure_title, figure_file in temporal_figures:
    create_temporal_plot(LIWC_ALL, LIWC_benchmarks, figure_dims,
                         list(temporal_countries),
                         figure_title, figure_file)

print("✅ All temporal plots created with events and political periods")