In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ttest_ind, pearsonr, f_oneway
import warnings
import os

# Output directory: every figure produced by this notebook is written here
output_dir = r"C:\Users\pavle\OneDrive\Desktop\my github\master-thesis\figures"
os.makedirs(output_dir, exist_ok=True)

# Plot appearance: default matplotlib style, seaborn "husl" palette, serif fonts
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update([
    ('font.family', 'serif'),
    ('font.size', 11),
    ('axes.titlesize', 12),
    ('axes.labelsize', 11),
    ('figure.titlesize', 14),
])

# DataFrame display: never truncate columns or cell contents
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# PART 1: Topic Classification Evaluation

Evaluating our topic classification against ParlaCAP predictions and human labels.

In [None]:
# Load datasets with topic modeling results
# NOTE(review): pickles can execute code when loaded; only read files produced by this project.
GB = pd.read_pickle(r"data folder\GB\GB_final_with_topics.pkl")
AT = pd.read_pickle(r"data folder\AT\AT_final_with_topics_combined.pkl")
HR = pd.read_pickle(r"data folder\HR\HR_final_with_topics_combined.pkl")

# Load HDBSCAN results
GB_hdbscan = pd.read_pickle(r"data folder\GB\GB_with_topics_hdbscan.pkl")
AT_hdbscan = pd.read_pickle(r"data folder\AT\AT_with_topics_hdbscan.pkl")
HR_hdbscan = pd.read_pickle(r"data folder\HR\HR_with_topics_hdbscan.pkl")

# Append HDBSCAN topics BEFORE de-duplicating and merging.
# Column assignment aligns on the row index, and DataFrame.merge returns a frame
# with a fresh 0..n-1 index. Attaching these columns after the merge (as was done
# previously for HR and GB) pairs rows with the wrong HDBSCAN topic as soon as any
# duplicate ID has been dropped.
# NOTE(review): assumes each *_hdbscan pickle shares its row index with the
# corresponding main pickle as loaded - confirm against the files.
GB['my_topic_hdbscan'] = GB_hdbscan['my_topic']
HR['my_topic_en_hdbscan'] = HR_hdbscan['my_topic_en']
HR['my_topic_hr_hdbscan'] = HR_hdbscan['my_topic_native_language']
AT['my_topic_en_hdbscan'] = AT_hdbscan['my_topic_en']
AT['my_topic_de_hdbscan'] = AT_hdbscan['my_topic_native_language']

# Load human labels
hr_labels = pd.read_json("data folder/HR/ParlaCAP-test-hr.jsonl", lines=True)
gb_labels = pd.read_json("data folder/GB/ParlaCAP-test-en.jsonl", lines=True)

# Drop duplicates (keeps the first row per ID)
for df in [HR, GB, AT]:
    df.drop_duplicates(subset=['ID'], inplace=True)


def _attach_true_labels(frame, labels):
    """Left-join the human labels onto `frame` by ID and expose them as 'True_label'."""
    merged = frame.merge(labels[['id', 'labels']], left_on='ID', right_on='id', how='left')
    return merged.rename(columns={'labels': 'True_label'}).drop(columns=['id'])


# Merge human labels (AT has no human-label file loaded here, so it is left as is)
HR = _attach_true_labels(HR, hr_labels)
GB = _attach_true_labels(GB, gb_labels)

# Rename GMM columns
HR = HR.rename(columns={'Segment_Category_HR_english': 'my_topic_en_gmm',
                        'Segment_Category_HR_croatian': 'my_topic_hr_gmm'})
AT = AT.rename(columns={'Segment_Category_AT_english': 'my_topic_en_gmm',
                        'Segment_Category_AT_german': 'my_topic_de_gmm'})
GB = GB.rename(columns={'Segment_Category_GB_english': 'my_topic_gmm'})

print(f"✅ Loaded: AT={AT.shape}, HR={HR.shape}, GB={GB.shape}")

## Determine Topic Consensus

Create consensus topics by combining GMM and HDBSCAN predictions.

In [None]:
def determine_consensus(row, topic_cols, is_chairperson_col='Speaker_role'):
    """Determine a single consensus topic from several model predictions.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row[col]`` for every entry of `topic_cols` and ``row.get``.
    topic_cols : list of str
        Columns holding the individual topic predictions.
    is_chairperson_col : str
        Column whose value ``'Chairperson'`` switches on the strict rule below.

    Returns
    -------
    The consensus topic, ``'Mix'`` or ``'Other'``.

    Rules
    -----
    * Predictions equal to ``'Other'`` and missing predictions (NaN/None) cast no vote.
    * Chairperson rows: a topic is returned only when every column voted and all
      votes agree; otherwise ``'Other'``.
    * All other rows: ``'Other'`` when nobody voted; a topic predicted more than
      once wins (if several topics share the top count above one, the first entry
      in ``value_counts()`` order is returned); several topics with one vote each
      give ``'Mix'``; a single lone vote is returned as is.
    """
    # Missing values are dropped up front. Previously an all-NaN row produced an
    # empty value_counts() and crashed on .iloc[0], and a chairperson row could
    # return NaN as its "topic".
    topics = [
        t for t in (row[col] for col in topic_cols)
        if not pd.isna(t) and t != 'Other'
    ]

    if row.get(is_chairperson_col) == 'Chairperson':
        unanimous = len(topics) == len(topic_cols) and len(set(topics)) == 1
        return topics[0] if unanimous else 'Other'

    if not topics:
        return 'Other'

    topic_counts = pd.Series(topics).value_counts()
    if topic_counts.iloc[0] > 1:
        return topic_counts.idxmax()
    if len(topic_counts) > 1 and topic_counts.iloc[0] == topic_counts.iloc[1]:
        return 'Mix'
    return topic_counts.idxmax()

# Build 'topic_consensus' per country from that country's prediction columns
hr_topic_cols = ['my_topic_en_gmm', 'my_topic_hr_gmm', 'my_topic_en_hdbscan', 'my_topic_hr_hdbscan']
at_topic_cols = ['my_topic_en_gmm', 'my_topic_de_gmm', 'my_topic_en_hdbscan', 'my_topic_de_hdbscan']
gb_topic_cols = ['my_topic_gmm', 'my_topic_hdbscan']

HR['topic_consensus'] = HR.apply(lambda row: determine_consensus(row, hr_topic_cols), axis=1)
AT['topic_consensus'] = AT.apply(lambda row: determine_consensus(row, at_topic_cols), axis=1)
GB['topic_consensus'] = GB.apply(lambda row: determine_consensus(row, gb_topic_cols), axis=1)

print("✅ Consensus topics determined")

## Confusion Matrix Analysis

In [None]:
def analyze_labels(df, evaluation_col, true_label_col, sample_size='all', min_word_count=None,
                   exclude_roles=None, exclude_topics=None, text_column='Text', 
                   plot_size=(16, 12), dataset_name="Dataset", random_state=None):
    """Plot a confusion matrix and compute F1 scores for predicted vs. reference topics.

    Parameters
    ----------
    df : DataFrame
        Input rows; not modified.
    evaluation_col, true_label_col : str
        Columns with the predicted and the reference label.
    sample_size : 'all' or int
        When an int, at most this many rows are sampled per reference label.
    min_word_count : int or None
        Keep only rows whose `text_column` has at least this many whitespace-separated tokens.
    exclude_roles : list or None
        Values of 'Speaker_role' to drop.
    exclude_topics : list or None
        Labels to drop from both the predicted and the reference column.
    text_column : str
        Column used for the word-count filter.
    plot_size : tuple
        Figure size passed to matplotlib.
    dataset_name : str
        Used in the plot title and in the output file name.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` so per-label sampling is reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    dict with keys 'f1_macro', 'f1_micro' and 'total' (number of evaluated rows).

    Raises
    ------
    ValueError
        If no rows are left after filtering.

    Side effects
    ------------
    Saves ``confusion_<dataset_name lowercased>.png`` into `output_dir` and shows the figure.
    """
    exclude_roles = exclude_roles or []
    exclude_topics = exclude_topics or []

    # Filter: drop '-' placeholders and rows missing either label
    df = df[~df[true_label_col].isin(['-'])].dropna(subset=[evaluation_col, true_label_col])

    if exclude_roles:
        df = df[~df['Speaker_role'].isin(exclude_roles)]
    if exclude_topics:
        df = df[~df[evaluation_col].isin(exclude_topics)]
        df = df[~df[true_label_col].isin(exclude_topics)]
    if min_word_count:
        df = df[df[text_column].apply(lambda x: len(str(x).split())) >= min_word_count]

    # Fail early with a readable message instead of an opaque error further down
    if df.empty:
        raise ValueError(f"{dataset_name}: no rows left to evaluate after filtering")

    # Sample up to `sample_size` rows per reference label
    if sample_size != 'all':
        sampled = []
        for cat in df[true_label_col].unique():
            cat_df = df[df[true_label_col] == cat]
            sampled.append(cat_df.sample(n=min(sample_size, len(cat_df)), random_state=random_state))
        df = pd.concat(sampled, ignore_index=True)

    # Metrics
    y_true = df[true_label_col]
    y_pred = df[evaluation_col]
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)

    # Axis labels: reference labels by descending frequency, followed by labels that
    # only occur among the predictions. Restricting the axes to reference labels
    # would silently drop those predictions from the matrix although they still
    # count in the F1 scores shown in the title.
    categories = y_true.value_counts().index.tolist()
    known = set(categories)
    categories += [label for label in pd.unique(y_pred) if label not in known]
    conf_matrix = confusion_matrix(y_true, y_pred, labels=categories)

    # Plot
    fig, ax = plt.subplots(figsize=plot_size)
    sns.heatmap(conf_matrix, xticklabels=categories, yticklabels=categories, annot=True, 
                fmt='d', cmap='Blues', square=True, annot_kws={'size': 8}, ax=ax)
    ax.set_title(f'{dataset_name}: F1 macro={f1_macro:.2f}, micro={f1_micro:.2f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True Label')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    fig.savefig(os.path.join(output_dir, f'confusion_{dataset_name.lower()}.png'), dpi=300, bbox_inches='tight')
    plt.show()

    return {'f1_macro': f1_macro, 'f1_micro': f1_micro, 'total': len(df)}

# Run analysis: GB and HR are scored against 'True_label', AT against 'Topic'
# (AT additionally restricted to texts of at least 70 words)
results = {}
for _name, _frame, _reference_col, _extra in (
    ('GB', GB, 'True_label', {}),
    ('HR', HR, 'True_label', {}),
    ('AT', AT, 'Topic', {'min_word_count': 70}),
):
    results[_name] = analyze_labels(_frame, 'topic_consensus', _reference_col,
                                    exclude_topics=['Mix'], dataset_name=_name, **_extra)

## Topic Distribution Comparison

In [None]:
def compare_topic_distributions(df, our_col, comparison_col, dataset_name, exclude_topics=('Other', 'Mix')):
    """Plot our topic distribution next to a reference distribution.

    Parameters
    ----------
    df : DataFrame
        Input rows; not modified.
    our_col, comparison_col : str
        Columns holding our topic and the reference topic.
    dataset_name : str
        Used in the plot titles and in the output file name.
    exclude_topics : iterable of str or None
        Rows are dropped when either column holds one of these labels.
        The default is an immutable tuple so it cannot be altered between calls.

    Raises
    ------
    ValueError
        If no rows are left after filtering.

    Side effects
    ------------
    Saves ``dist_<dataset_name lowercased>.png`` into `output_dir` and shows the figure.
    The left panel shows both distributions in percent; the right panel shows the
    percentage-point difference (ours minus reference), ordered by absolute size.
    """
    excluded = list(exclude_topics or ())

    clean = df.dropna(subset=[our_col, comparison_col])
    clean = clean[~clean[our_col].isin(excluded) & ~clean[comparison_col].isin(excluded)]
    if clean.empty:
        raise ValueError(f"{dataset_name}: no rows left to compare after filtering")

    our_dist = clean[our_col].value_counts(normalize=True) * 100
    comp_dist = clean[comparison_col].value_counts(normalize=True) * 100

    # Topics present on only one side get 0 % on the other
    comparison = pd.DataFrame({'Our': our_dist, 'Reference': comp_dist}).fillna(0)
    diff = comparison['Our'] - comparison['Reference']
    diff = diff.reindex(diff.abs().sort_values(ascending=True).index)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    comparison.plot(kind='barh', ax=ax1, width=0.8, alpha=0.8)
    ax1.set_title(f'{dataset_name}: Topic Distribution Comparison')
    ax1.set_xlabel('Percentage (%)')
    ax1.grid(axis='x', alpha=0.3)

    # Red = we assign the topic less often than the reference, green = at least as often
    colors = ['red' if x < 0 else 'green' for x in diff]
    diff.plot(kind='barh', ax=ax2, color=colors, alpha=0.7)
    ax2.set_title(f'{dataset_name}: Difference (Our - Reference)')
    ax2.set_xlabel('Percentage Point Difference')
    ax2.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax2.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'dist_{dataset_name.lower()}.png'), dpi=300, bbox_inches='tight')
    plt.show()

# Same comparison for every country: consensus topic vs. the 'Topic' column
for _name, _frame in (('GB', GB), ('HR', HR), ('AT', AT)):
    compare_topic_distributions(_frame, 'topic_consensus', 'Topic', _name)

print("✅ Topic classification evaluation complete")

---
# PART 2: LIWC Linguistic Analysis

Analyzing political discourse using LIWC-22 dimensions across countries, topics, and covariates.

In [None]:
# Load LIWC data
AT_LIWC = pd.read_csv(r"data folder\AT\AT_LIWC_results.csv")
HR_LIWC = pd.read_csv(r"data folder\HR\HR_LIWC_results.csv")
GB_LIWC = pd.read_csv(r"data folder\GB\GB_LIWC_results.csv")

# Merge with existing data (inner join: rows without LIWC results are dropped)
# NOTE(review): re-running this cell merges the LIWC columns a second time; run it once per kernel.
AT = AT.merge(AT_LIWC, on='ID', how='inner')
HR = HR.merge(HR_LIWC, on='ID', how='inner')
GB = GB.merge(GB_LIWC, on='ID', how='inner')

# Add country identifier
AT['Country'] = 'Austria'
HR['Country'] = 'Croatia'
GB['Country'] = 'Great Britain'

# Combine
LIWC_ALL = pd.concat([AT, HR, GB], ignore_index=True)

# Process dates (unparseable dates become NaT)
LIWC_ALL['Date'] = pd.to_datetime(LIWC_ALL['Date'], errors='coerce')
LIWC_ALL['Year'] = LIWC_ALL['Date'].dt.year

# Rename and filter
LIWC_ALL = LIWC_ALL.rename(columns={'topic_consensus': 'Our_Topic', 'Topic': 'ParlaCAP'})
_keep = ~LIWC_ALL['Our_Topic'].isin(['Mix', 'Other']) & (LIWC_ALL['Speaker_role'] != 'Chairperson')
# Explicit copy: the columns added below must be written to an independent frame,
# not to a slice of the unfiltered one. The SettingWithCopyWarning this used to
# trigger was hidden by the global warnings filter.
LIWC_ALL = LIWC_ALL[_keep].copy()

# Calculate age (non-numeric birth years become NaN)
LIWC_ALL['Speaker_birth'] = pd.to_numeric(LIWC_ALL['Speaker_birth'], errors='coerce')
LIWC_ALL['Speaker_age'] = LIWC_ALL['Year'] - LIWC_ALL['Speaker_birth']

# Load LIWC benchmarks
LIWC_statistics_path = r"data folder\LIWC-22.Descriptive.Statistics-Test.Kitchen.xlsx"

def load_liwc_benchmarks(file_path):
    """Load LIWC-22 population norms from the descriptive-statistics workbook.

    The first sheet is read without a header. Row 0 is searched for the cell
    'Total': that column is read as the mean and the column right after it as the
    standard deviation. Data rows start at row 2, with the dimension name in
    column 0.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook.

    Returns
    -------
    DataFrame with columns 'Dimension', 'Mean' and 'Std'; rows whose mean or
    standard deviation is not numeric are dropped.

    Raises
    ------
    ValueError
        If row 0 has no 'Total' cell, or no column follows it.
    """
    raw = pd.read_excel(file_path, sheet_name=0, header=None)

    total_positions = [i for i, v in enumerate(raw.iloc[0]) if str(v).strip() == 'Total']
    if not total_positions:
        raise ValueError(f"No 'Total' column found in the first row of {file_path!r}")
    total_col_start = total_positions[0]
    if total_col_start + 1 >= raw.shape[1]:
        raise ValueError(f"No standard-deviation column after 'Total' in {file_path!r}")

    # Select the named rows once and read name, mean and std from those same rows.
    # Compacting only the names and then taking the first N statistic rows by
    # position shifts every value below a blank row onto the wrong dimension.
    body = raw.iloc[2:]
    body = body[body.iloc[:, 0].notna()]

    return pd.DataFrame({
        'Dimension': body.iloc[:, 0].values,
        'Mean': pd.to_numeric(body.iloc[:, total_col_start], errors='coerce').values,
        'Std': pd.to_numeric(body.iloc[:, total_col_start + 1], errors='coerce').values,
    }).dropna()

# Population norms used as the z-score baseline in the plots below
LIWC_benchmarks = load_liwc_benchmarks(LIWC_statistics_path)

# Key dimensions: the LIWC columns analysed in this notebook, one group per line
KEY_LIWC_DIMENSIONS = (
    ['Analytic', 'Clout', 'Authentic', 'Tone']
    + ['i', 'we', 'you', 'they', 'ipron', 'ppron']
    + ['focuspast', 'focuspresent', 'focusfuture']
    + ['cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certitude']
    + ['Affect', 'tone_pos', 'tone_neg']
    + ['Social', 'conflict', 'moral']
    + ['power', 'politic', 'money', 'work']
)

print(f"✅ LIWC data loaded: {len(LIWC_ALL):,} speeches")

## Cross-Country LIWC Profiles

In [None]:
def create_country_heatmap(data, benchmarks, dimensions):
    """Draw a heatmap of per-country LIWC means expressed as z-scores.

    Means are computed per 'Country' plus one extra 'All Countries' row over the
    whole of `data`, then standardised with the 'Mean' and 'Std' that `benchmarks`
    lists for each entry of `dimensions`. The figure is saved as
    'liwc_country_zscores.png' in `output_dir` and shown.
    """
    # Per-country means, with the pooled mean appended as an extra row
    per_country = data.groupby('Country')[dimensions].mean()
    per_country.loc['All Countries'] = data[dimensions].mean()

    # Standardise against the benchmark rows of the requested dimensions
    norms = benchmarks.set_index('Dimension').loc[dimensions]
    z_scores = (per_country - norms['Mean']) / norms['Std']

    # Dimensions on the rows, countries on the columns
    fig, ax = plt.subplots(figsize=(10, 14))
    sns.heatmap(
        z_scores.T,
        annot=True,
        fmt=".2f",
        cmap="RdBu_r",
        center=0,
        vmin=-2,
        vmax=2,
        square=False,
        cbar_kws={'label': 'Z-Score'},
        ax=ax,
    )
    ax.set_title("Political Discourse: LIWC Z-Scores vs Population Norms", fontweight='bold')
    plt.tight_layout()
    target = os.path.join(output_dir, 'liwc_country_zscores.png')
    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.show()

create_country_heatmap(LIWC_ALL, LIWC_benchmarks, KEY_LIWC_DIMENSIONS)

## Political Covariates Analysis

In [None]:
def create_covariate_heatmap(data, benchmarks, covariate_col, categories, dimensions, title, filename):
    """Draw a heatmap of mean LIWC z-scores per category of a covariate.

    Parameters
    ----------
    data : DataFrame
        Rows with the covariate column and the LIWC dimension columns; not modified.
    benchmarks : DataFrame
        Must have the columns 'Dimension', 'Mean' and 'Std'.
    covariate_col : str
        Column to group by.
    categories : list
        Values of `covariate_col` to keep; all other rows are ignored.
    dimensions : list of str
        LIWC columns to standardise and plot.
    title : str
        Figure title.
    filename : str
        File name of the saved figure inside `output_dir`.

    Side effects
    ------------
    Saves the figure to `output_dir`/`filename` and shows it.
    """
    filtered = data[data[covariate_col].isin(categories)]

    # The benchmark lookup is built once, and the z-scores live in their own frame
    # under the original dimension names. This avoids temporary '<dim>_z' columns
    # (which could clash with existing ones) and the str.replace('_z', '') clean-up,
    # which would also alter a dimension name containing '_z' elsewhere.
    norms = benchmarks.set_index('Dimension').loc[dimensions]
    z_scores = (filtered[dimensions] - norms['Mean']) / norms['Std']

    means = z_scores.groupby(filtered[covariate_col]).mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(means.T, annot=True, fmt='.2f', cmap='RdBu_r', center=0, 
                vmin=-1, vmax=1, ax=ax, square=False)
    ax.set_title(title, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()

# Dimensions shown in every covariate plot; each plot appends its own extras
FIXED_DIMS = ['Analytic', 'Authentic', 'Clout', 'Tone', 'power']

# Party status: coalition vs. opposition speakers
create_covariate_heatmap(
    data=LIWC_ALL,
    benchmarks=LIWC_benchmarks,
    covariate_col='Party_status',
    categories=['Coalition', 'Opposition'],
    dimensions=[*FIXED_DIMS, 'we', 'i', 'cogproc', 'moral', 'politic'],
    title='Coalition vs Opposition',
    filename='liwc_party_status.png',
)

# Gender: 'M' vs. 'F' in Speaker_gender
create_covariate_heatmap(
    data=LIWC_ALL,
    benchmarks=LIWC_benchmarks,
    covariate_col='Speaker_gender',
    categories=['M', 'F'],
    dimensions=[*FIXED_DIMS, 'we', 'i', 'Affect', 'certitude', 'tentat'],
    title='Gender Differences',
    filename='liwc_gender.png',
)

## Temporal Evolution Analysis

In [None]:
def create_temporal_plot(data, benchmarks, dimensions, countries, title, filename):
    """Plot the monthly evolution of LIWC z-scores, one panel per country.

    Parameters
    ----------
    data : DataFrame
        Must contain 'Date' (datetime), 'Country' and the dimension columns; not modified.
    benchmarks : DataFrame
        Must have the columns 'Dimension', 'Mean' and 'Std'.
    dimensions : list of str
        LIWC columns to standardise and plot as separate lines.
    countries : sequence of str
        Values of 'Country'; one stacked panel is drawn per entry.
    title : str
        Figure title.
    filename : str
        File name of the saved figure inside `output_dir`.

    Raises
    ------
    ValueError
        If `countries` is empty.

    Side effects
    ------------
    Saves the figure to `output_dir`/`filename` and shows it. Each line is the
    monthly mean z-score smoothed with a centred 3-month rolling mean.
    """
    countries = list(countries)
    if not countries:
        raise ValueError("countries must contain at least one entry")

    # Standardise once, under the original dimension names (no '<dim>_z' helper
    # columns and no str.replace clean-up that could alter a name containing '_z').
    norms = benchmarks.set_index('Dimension').loc[dimensions]
    dated = data[data['Date'].notna()]
    z_scores = (dated[dimensions] - norms['Mean']) / norms['Std']
    year_month = dated['Date'].dt.to_period('M')

    # One row per requested country. A grid fixed at three rows failed for longer
    # lists and left empty panels for shorter ones. The height of 4 per panel
    # reproduces the former 18x12 figure for three countries.
    n_panels = len(countries)
    fig, axes = plt.subplots(n_panels, 1, figsize=(18, 4 * n_panels), squeeze=False)
    fig.suptitle(title, fontsize=16, fontweight='bold')

    for idx, country in enumerate(countries):
        ax = axes[idx, 0]
        in_country = dated['Country'] == country

        monthly = z_scores[in_country].groupby(year_month[in_country]).mean()
        monthly.index = monthly.index.to_timestamp()
        monthly_smooth = monthly.rolling(window=3, center=True).mean()

        for dim in monthly_smooth.columns:
            ax.plot(monthly_smooth.index, monthly_smooth[dim], linewidth=2.5, label=dim)

        # Zero line = benchmark mean
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.6)
        ax.set_title(country, fontweight='bold', loc='left')
        ax.set_xlabel('Year', fontweight='bold')
        ax.set_ylabel('Z-Score (3-month avg)', fontweight='bold')
        ax.grid(True, alpha=0.3)
        if idx == 0:
            # All panels share the same lines, so one legend is enough
            ax.legend(loc='upper left', ncol=2)

    plt.tight_layout(rect=[0, 0, 1, 0.99])
    plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    plt.show()

# Two temporal figures over the same three countries, differing only in the plotted dimensions
for _dims, _title, _filename in (
    (['Analytic', 'Authentic', 'Clout', 'Tone'],
     'Temporal Evolution: Summary Variables', 'liwc_temporal_summary.png'),
    (['politic', 'we', 'i', 'you'],
     'Temporal Evolution: Political Language', 'liwc_temporal_political.png'),
):
    create_temporal_plot(LIWC_ALL, LIWC_benchmarks, _dims,
                         ['Austria', 'Croatia', 'Great Britain'],
                         _title, _filename)

## Final Save

Save processed datasets with all analysis columns.

In [None]:
# Persist the three per-country frames in their final state
for _frame, _target in (
    (GB, r"data folder\GB\GB_final.pkl"),
    (AT, r"data folder\AT\AT_final.pkl"),
    (HR, r"data folder\HR\HR_final.pkl"),
):
    _frame.to_pickle(_target)

# Closing summary
print("✅ All analysis complete!")
print("   - Topic classification evaluation")
print("   - LIWC linguistic analysis")
print("   - Final datasets saved")