In [None]:
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import jensenshannon, squareform
from sklearn.manifold import MDS

# Load the speech-classification table. Later cells read its `party`,
# `top_{i}_topic` and `top_{i}_prob` (i = 1..3) columns.
# NOTE(review): the path is relative to the kernel's working directory.
df = pd.read_csv('party_speeches_classification_cleaned.csv')

In [None]:
# Drop rows with no party: the aggregation below is keyed on `party`, so a
# speech without one cannot be attributed. This rebinds `df` to the filtered frame.
df = df.dropna(subset=['party'])

# Step 1: Build a topic distribution per speech
def build_topic_dist(row, n_top=3):
    """Collect a speech's top-ranked topics into a ``{topic_id: probability}`` dict.

    For each rank ``i`` in ``1..n_top`` the columns ``top_{i}_topic`` and
    ``top_{i}_prob`` are read. A topic label is only used when it is a string
    containing ``' - '``; the text before the first ``' - '`` (stripped) is the
    topic id. Probabilities of repeated ids are summed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must support ``row[key]`` for the columns named above.
    n_top : int, default 3
        Number of ranked topic/probability column pairs to read.

    Returns
    -------
    dict
        Topic id -> accumulated probability. Empty when no rank is usable.
    """
    dist = {}
    for rank in range(1, n_top + 1):
        topic = row[f'top_{rank}_topic']
        prob = row[f'top_{rank}_prob']
        if not isinstance(topic, str) or ' - ' not in topic:
            continue
        # A missing probability would turn the sum (and every aggregate built
        # from it) into NaN, so such a rank is skipped instead.
        if pd.isna(prob):
            continue
        topic_id = topic.split(' - ', 1)[0].strip()
        dist[topic_id] = dist.get(topic_id, 0) + prob
    return dist

# One {topic_id: probability} dict per speech.
df['topic_dist'] = df.apply(build_topic_dist, axis=1)

# Step 2: Aggregate distributions per party
# party -> topic_id -> summed probability over all of that party's speeches
party_topic_counts = defaultdict(lambda: defaultdict(float))

for party, topic_dist in zip(df['party'], df['topic_dist']):
    for topic, prob in topic_dist.items():
        party_topic_counts[party][topic] += prob

# Get all unique topics (sorted so every party vector uses the same column order)
all_topics = sorted({topic for counts in party_topic_counts.values() for topic in counts})

# Step 3: Normalize to probability distributions
party_dists = {}
for party, topic_count in party_topic_counts.items():
    vec = np.array([topic_count.get(topic, 0) for topic in all_topics], dtype=float)
    total = vec.sum()
    # A party whose mass is zero (or not finite) cannot be normalised; dividing
    # would yield NaNs that propagate into every pairwise distance. Skip it,
    # matching the zero-total guard used by the per-issue analysis.
    if not np.isfinite(total) or total <= 0:
        continue
    party_dists[party] = vec / total

# Step 4: Compute pairwise Jensen-Shannon divergence
# scipy's `jensenshannon` returns the JS *distance* (square root of the
# divergence); with base=2 the values lie in [0, 1].
party_names = list(party_dists.keys())
n_parties = len(party_names)

# Fill a plain NumPy array and wrap it once. Writing through
# `DataFrame.values` (as np.fill_diagonal would) fails under pandas
# Copy-on-Write, where that array is a read-only view.
jsd_values = np.zeros((n_parties, n_parties))  # diagonal stays 0.0

for (i, p1), (j, p2) in combinations(enumerate(party_names), 2):
    jsd = jensenshannon(party_dists[p1], party_dists[p2], base=2)
    jsd_values[i, j] = jsd
    jsd_values[j, i] = jsd

jsd_matrix = pd.DataFrame(jsd_values, index=party_names, columns=party_names)

# Optional: annotated heatmap of the party-by-party matrix computed above
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(jsd_matrix.astype(float), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Jensen-Shannon Divergence Between Parties")
plt.show()


In [None]:
# `linkage` interprets a 2-D input as observation vectors (one row per
# observation) and would re-measure Euclidean distances between the rows of
# the matrix. To cluster on the precomputed distances themselves, pass the
# condensed (upper-triangle) form instead.
condensed_jsd = squareform(jsd_matrix.to_numpy(dtype=float))
linked = linkage(condensed_jsd, method='average')

plt.figure(figsize=(10, 7))
# A plain list of labels avoids handing a pandas Index to scipy.
dendrogram(linked, labels=list(jsd_matrix.index), orientation='top')
plt.title("Party Clustering by Topic Distribution (JSD)")
plt.show()


In [None]:
# Maps an issue label to the set of topic-id strings that count towards it.
# Ids are compared with the text before ' - ' in the `top_{i}_topic` columns
# (see get_issues_for_speech), so they must be strings, not ints.
# NOTE(review): the ids look like codes from an external coding scheme —
# confirm them against the full topic list.
issue_groups = {
    'Economy': {'412', '413', '415'},
    'Constitution': {'203', '204'},
    'Environment': {'601', '602'},
    'Immigration': {'701', '702'},
    'Social Policy': {'501', '502'},
    # Add more based on your full topic list
}


def get_issues_for_speech(row, issue_groups):
    """Return the issues touched by a speech's top-3 topics.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Read through ``row.get('top_{i}_topic', '')`` for i = 1..3. A label is
        used only when it is a string containing ``' - '``; the stripped text
        before the first ``' - '`` is the topic id.
    issue_groups : dict
        Issue label -> collection of topic-id strings.

    Returns
    -------
    list
        Issue labels with at least one matching topic id, each listed once,
        in sorted order. Sorting keeps the result identical between runs;
        a list built straight from a set has no stable order.
    """
    issues = set()
    for rank in range(1, 4):
        topic = row.get(f'top_{rank}_topic', '')
        if not isinstance(topic, str) or ' - ' not in topic:
            continue
        topic_id = topic.split(' - ', 1)[0].strip()
        issues.update(
            issue for issue, topic_ids in issue_groups.items() if topic_id in topic_ids
        )
    return sorted(issues)

# Tag every speech with the issues its top topics belong to.
df['issues'] = df.apply(get_issues_for_speech, axis=1, issue_groups=issue_groups)


# issue label -> party-by-party DataFrame of Jensen-Shannon distances (base 2),
# computed only from speeches tagged with that issue.
issue_jsd_results = {}

# All intermediates inside the loop use issue-scoped names. Rebinding the
# whole-corpus names here (`jsd_matrix`, `party_dists`, ...) would silently
# replace them with the last issue's values, and the MDS cell further down
# reads `jsd_matrix` expecting the full matrix.
for issue in issue_groups.keys():
    subset = df[df['issues'].apply(lambda x: issue in x)]
    if subset.empty:
        continue

    # Aggregate topic distributions per party
    issue_topic_counts = defaultdict(lambda: defaultdict(float))

    for _, row in subset.iterrows():
        party = row['party']
        if pd.isna(party):
            continue
        for topic, prob in row['topic_dist'].items():
            issue_topic_counts[party][topic] += prob

    issue_topics = sorted({topic for counts in issue_topic_counts.values() for topic in counts})

    issue_party_dists = {}
    for party, topic_count in issue_topic_counts.items():
        vec = np.array([topic_count.get(topic, 0) for topic in issue_topics])
        if vec.sum() == 0:
            continue  # nothing to normalise for this party
        issue_party_dists[party] = vec / vec.sum()

    issue_party_names = list(issue_party_dists.keys())
    n_issue_parties = len(issue_party_names)
    issue_values = np.zeros((n_issue_parties, n_issue_parties))  # diagonal stays 0.0

    # The distance is symmetric, so each unordered pair is computed once.
    for (i, p1), (j, p2) in combinations(enumerate(issue_party_names), 2):
        jsd = jensenshannon(issue_party_dists[p1], issue_party_dists[p2], base=2)
        issue_values[i, j] = jsd
        issue_values[j, i] = jsd

    issue_jsd_results[issue] = pd.DataFrame(
        issue_values, index=issue_party_names, columns=issue_party_names
    )



import seaborn as sns
import matplotlib.pyplot as plt

# One annotated heatmap per issue-specific distance matrix.
for issue, matrix in issue_jsd_results.items():
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix.astype(float), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title(f"JSD Between Parties on {issue}")
    fig.tight_layout()
    plt.show()



In [None]:
# Project the parties onto a single axis with metric MDS over the precomputed
# party-by-party distances.
# NOTE(review): this reads whatever `jsd_matrix` currently holds; confirm it is
# the all-topics matrix and not a per-issue one before trusting the ordering.
mds = MDS(n_components=1, dissimilarity='precomputed', random_state=42)
ideology_scores = mds.fit_transform(jsd_matrix.values)

# Attach the 1-D coordinate to each party and order parties along the axis
ideology_df = (
    pd.DataFrame({'party': jsd_matrix.index, 'ideology_score': ideology_scores.ravel()})
    .sort_values(by='ideology_score')
    .reset_index(drop=True)
)
print(ideology_df)


In [None]:
# Direction assigned to each issue; used below as a signed multiplier.
issue_leaning = {
    'Economy': 1,            # right-leaning
    'Constitution': 0,       # neutral
    'Environment': -1,       # left
    'Immigration': 1,        # right
    'Social Policy': -1,     # left
}


# party -> list of signed per-issue contributions
party_scores = defaultdict(list)

for issue_name, issue_matrix in issue_jsd_results.items():
    if issue_name in issue_leaning:
        direction = issue_leaning[issue_name]
        # Row mean = a party's average distance to every party in the matrix
        # (its own zero diagonal entry included).
        mean_distance = issue_matrix.mean(axis=1)
        for party_name, distance in mean_distance.items():
            # closer = more aligned
            party_scores[party_name].append(direction * (1 - distance))

# Final ideological score per party: mean of its per-issue contributions
ideology_scores = {}
for party_name, contributions in party_scores.items():
    ideology_scores[party_name] = np.mean(contributions)

ideology_df = pd.DataFrame.from_dict(
    ideology_scores, orient='index', columns=['ideology_score']
)
ideology_df = ideology_df.sort_values(by='ideology_score')
print(ideology_df)


In [None]:
# Signed direction per issue: +1 right, -1 left, 0 neutral. The score loop in
# this cell multiplies each party's normalised score by this value, so an
# issue with leaning 0 (or one missing from the dict) contributes 0.
# Same values as the `issue_leaning` defined in the previous cell.
issue_leaning = {
    'Economy': +1,           # traditionally right
    'Constitution': 0,       # neutral
    'Environment': -1,       # traditionally left
    'Immigration': +1,       # traditionally right
    'Social Policy': -1      # traditionally left
}


from collections import defaultdict
import pandas as pd
import numpy as np

# issue label -> {party: signed, max-normalised emphasis on that issue}
issue_party_scores = {}

for issue, topic_ids in issue_groups.items():
    subset = df[df['issues'].apply(lambda x: issue in x)]
    if subset.empty:
        continue

    # Sets have no fixed order; sort so the vectors below are reproducible.
    ordered_topic_ids = sorted(topic_ids)

    # Aggregate, per party, the probability mass on this issue's topics only
    party_topic_counts = defaultdict(lambda: defaultdict(float))

    for _, row in subset.iterrows():
        party = row['party']
        if pd.isna(party):
            continue
        for topic, prob in row['topic_dist'].items():
            if topic in topic_ids:
                party_topic_counts[party][topic] += prob

    # party_dists: within-issue topic mix per party (sums to 1).
    # issue_scores: the party's *raw* total mass on the issue. The score must
    # come from the raw total; summing the normalised vector is always 1.0,
    # which would give every party exactly the issue's leaning.
    party_dists = {}
    issue_scores = {}
    for party, topic_count in party_topic_counts.items():
        vec = np.array([topic_count.get(t, 0) for t in ordered_topic_ids], dtype=float)
        total = vec.sum()
        if not total > 0:  # also skips NaN totals
            continue
        party_dists[party] = vec / total
        issue_scores[party] = total

    # Scale by the strongest party on this issue, then apply the ideological
    # direction. These are raw sums, so parties with more tagged speeches
    # score higher.
    max_val = max(issue_scores.values(), default=0)
    leaning = issue_leaning.get(issue, 0)
    if max_val > 0:
        issue_scores = {
            party: (score / max_val) * leaning for party, score in issue_scores.items()
        }

    issue_party_scores[issue] = issue_scores



# Combine into a DataFrame: rows = parties, columns = issues.
# Parties absent from an issue get 0.
issue_score_df = pd.DataFrame(issue_party_scores).fillna(0).round(2)
issue_score_df = issue_score_df.sort_index()
print(issue_score_df)


In [None]:
# Define which topics belong to which ideological issue.
# Keys are issue labels; values are sets of topic-id strings, matched against
# the keys of each speech's `topic_dist` dict in Step 1 below.
# Same entries as the `issue_groups` defined in an earlier cell.
issue_groups = {
    'Economy': {'412', '413', '415'},
    'Constitution': {'203', '204'},
    'Environment': {'601', '602'},
    'Immigration': {'701', '702'},
    'Social Policy': {'501', '502'},
}

# Define the leaning of each issue: +1 right, -1 left, 0 neutral.
# Step 2 below multiplies each normalised score by this value.
issue_leaning = {
    'Economy': +1,           # traditionally right
    'Constitution': 0,       # neutral
    'Environment': -1,       # traditionally left
    'Immigration': +1,       # traditionally right
    'Social Policy': -1      # traditionally left
}

# Step 1: Sum, for every party, the topic probability that falls on each issue
party_issue_strengths = defaultdict(lambda: defaultdict(float))

for party, topic_dist in zip(df['party'], df['topic_dist']):
    if pd.isna(party):
        continue
    for topic_id, prob in topic_dist.items():
        for issue, issue_topic_ids in issue_groups.items():
            if topic_id in issue_topic_ids:
                party_issue_strengths[party][issue] += prob

# Step 2: Scale each issue by its strongest party (per issue, not per party)
# and sign the result with the issue's leaning
issue_party_scores = defaultdict(dict)

for issue in issue_groups:
    # Raw strength of every known party on this issue (0.0 if it never appears)
    raw_scores = {}
    for party in party_issue_strengths:
        raw_scores[party] = party_issue_strengths[party].get(issue, 0.0)

    if raw_scores:
        max_val = max(raw_scores.values())
    else:
        max_val = 1.0

    direction = issue_leaning.get(issue, 0)
    for party, strength in raw_scores.items():
        if max_val > 0:
            share = strength / max_val
        else:
            share = 0
        issue_party_scores[party][issue] = share * direction

# Step 3: Tabulate with one row per party and one column per issue
issue_score_df = pd.DataFrame(issue_party_scores).T.fillna(0).round(2)
issue_score_df = issue_score_df.sort_index()
print(issue_score_df)


In [None]:
def jensen_shannon_divergence(p, q, base=None):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Both inputs are converted to float arrays and rescaled to sum to 1 before
    the comparison. scipy's ``jensenshannon`` returns the JS *distance* (the
    square root of the divergence), so the result is squared.

    Parameters
    ----------
    p, q : array-like
        Weight vectors of equal length. If either sums to zero the
        normalisation divides by zero and the result is NaN.
    base : float, optional
        Logarithm base, forwarded to ``jensenshannon``. The default ``None``
        keeps scipy's default (natural log, upper bound ``ln 2``). Pass
        ``base=2`` for values in [0, 1], the base used elsewhere in this
        notebook.

    Returns
    -------
    float
        The divergence (squared distance).
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p = p / p.sum()
    q = q / q.sum()
    return jensenshannon(p, q, base=base) ** 2


In [None]:
# Group every speech's topic distribution under its party and record each
# topic id encountered along the way.
# Assuming df['topic_dist'] is dict {topic_id_str: score}, scores sum to 1 per row or close
party_topic_vecs = defaultdict(list)
all_topics = set()

for _, speech in df.iterrows():
    party = speech.get('party')
    topic_dist = speech.get('topic_dist')
    if not pd.isna(party) and isinstance(topic_dist, dict):
        party_topic_vecs[party].append(topic_dist)
        all_topics.update(topic_dist.keys())

all_topics = sorted(all_topics)  # fixed column order for vectorization
topic_column = {topic: col for col, topic in enumerate(all_topics)}

# Average topic distribution per party
avg_party_dist = {}
for party, dist_list in party_topic_vecs.items():
    # rows = that party's speeches, columns = topics; topics a speech lacks stay 0
    mat = np.zeros((len(dist_list), len(all_topics)))
    for doc_idx, dist in enumerate(dist_list):
        for topic, weight in dist.items():
            mat[doc_idx, topic_column[topic]] = weight
    avg = mat.mean(axis=0)
    avg = avg / avg.sum()  # rescale so the averaged vector sums to 1
    avg_party_dist[party] = avg


In [None]:
# Symmetric party-by-party divergence matrix, as a plain NumPy array.
# Rows and columns follow the alphabetical order in `parties`.
parties = sorted(avg_party_dist)
n = len(parties)
jsd_matrix = np.zeros((n, n))  # diagonal is already 0

# Visit each unordered pair once and mirror the value across the diagonal
for i, j in combinations(range(n), 2):
    jsd = jensen_shannon_divergence(avg_party_dist[parties[i]], avg_party_dist[parties[j]])
    jsd_matrix[i, j] = jsd_matrix[j, i] = jsd


In [None]:
from scipy.spatial.distance import jensenshannon
from itertools import combinations

def mean_jsd_for_coalition(coalition, topic_vectors):
    """
    Average pairwise Jensen-Shannon distance (base 2) within a coalition.

    The value for each pair is what scipy's ``jensenshannon`` returns, i.e.
    the square root of the divergence. Lower means the parties' topic
    vectors are closer.

    coalition: sequence of party keys.
    topic_vectors: mapping party key -> topic vector, looked up with ``.get``.

    Pairs in which either party has no vector are left out of the average.
    Returns 0.0 for fewer than two parties or when no pair could be compared.
    """
    if len(coalition) < 2:
        return 0.0  # nothing to compare

    looked_up = (
        (topic_vectors.get(first), topic_vectors.get(second))
        for first, second in combinations(coalition, 2)
    )
    distances = [
        jensenshannon(vec_a, vec_b, base=2)
        for vec_a, vec_b in looked_up
        if vec_a is not None and vec_b is not None
    ]
    if not distances:
        return 0.0
    return np.mean(distances)