In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the speeches CSV into `df`.
# NOTE: the scoring cell further down reads this same file again and rebinds `df`.
df = pd.read_csv('eu_speeches_all_2025-10-16.csv')

## Final: LLM scoring of a random sample of speeches

In [None]:
import os
import json
import textwrap
import time
import pandas as pd
from openai import AzureOpenAI

# ==========================================================
# 0. AZURE SETUP  (use the SETTINGS THAT ALREADY WORK FOR YOU)
# ==========================================================
endpoint = "https://paul-miptq7ew-swedencentral.cognitiveservices.azure.com/openai/deployments/gpt-5-nano-try2/chat/completions?api-version=2025-01-01-preview"
# Read the key from the environment so the real secret never has to be pasted
# into (and saved with) the notebook. The fallback is the original placeholder.
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "API KEY HERE")
deployment = "gpt-5-nano"      # this is what you used in the working test

# NOTE(review): the endpoint URL above embeds deployment "gpt-5-nano-try2" and
# api-version "2025-01-01-preview", while `deployment` and `api_version` below
# use different values. Left untouched because these settings are reported to
# work; confirm which values the service actually honours.
client = AzureOpenAI(
    api_version="2024-10-01-preview",   # keep the version that worked
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# ==========================================================
# 1. LOAD DATA AND SAMPLE 1000 SPEECHES
# ==========================================================
csv_path = "eu_speeches_all_2025-10-16.csv"
df = pd.read_csv(csv_path)

print("DataFrame columns:", df.columns.tolist())
print("Total rows:", len(df))

N_SAMPLE = 1000
# DataFrame.sample (without replacement) raises ValueError when asked for more
# rows than the frame holds, so cap the request at the number of rows available.
n_to_sample = min(N_SAMPLE, len(df))
# Fixed random_state keeps the drawn sample identical across re-runs.
sample = df.sample(n_to_sample, random_state=42).copy()
print(f"Sampling {n_to_sample} speeches for LLM scoring...")

# add empty columns for scores (filled in row by row by the scoring loop)
for new_col in ("conflict_score", "outgroup_tone_score", "extremity_score", "llm_notes"):
    sample[new_col] = None

# ==========================================================
# 2. LLM INSTRUCTIONS + PROMPT BUILDER
# ==========================================================
# System prompt sent as the "system" message of every scoring request made by
# call_model. It fixes the JSON schema of the reply (three 0-10 integer scores
# plus a short note) and defines what each score means.
CLASSIFIER_SYS = """
You are a political text coder.

Given a parliamentary speech, you must output ONLY a JSON object in the following format:

{
  "conflict_score": <0-10 integer>,
  "outgroup_tone_score": <0-10 integer>,
  "extremity_score": <0-10 integer>,
  "notes": "<brief English note>"
}

Definitions:
- conflict_score: 0 = fully consensual or technical; 10 = highly confrontational, us-vs-them, constant clash.
- outgroup_tone_score: 0 = respectful or no clear outgroup; 10 = openly hostile to specific parties, groups, or institutions.
- extremity_score: 0 = cautious, moderate language; 10 = extreme, alarmist, maximalist language or demands.

Do NOT output anything except valid JSON.
"""

def build_user_prompt(text, language, group, macro_topic, max_chars=1200):
    """Build the user message that asks the model to score one speech.

    Args:
        text: Speech text. It is converted with ``str()``, then passed through
            ``textwrap.shorten``, which collapses runs of whitespace and
            truncates on a word boundary, appending " [...]" when text is cut.
        language: Language label, interpolated verbatim into the metadata.
        group: Political group label, interpolated verbatim.
        macro_topic: Macro-topic label, interpolated verbatim.
        max_chars: Maximum length of the speech excerpt, placeholder included.
            Defaults to 1200, the value that was previously hard-coded.

    Returns:
        The prompt string: a metadata header, the fenced excerpt, and a
        reminder to return only the JSON object.

    Raises:
        ValueError: from ``textwrap.shorten`` if ``max_chars`` is too small to
            hold the " [...]" placeholder.
    """
    text_snip = textwrap.shorten(str(text), width=max_chars, placeholder=" [...]")
    return f"""
Metadata:
- Language: {language}
- Group: {group}
- Macro-topic: {macro_topic}

Speech:
```{text_snip}```

Return ONLY the JSON object.
"""

# ==========================================================
# 3. SAFE REQUEST FUNCTION WITH RETRIES
# ==========================================================
def call_model(prompt, retries=3, sleep_between=1.0):
    """Send one scoring request to the chat model, retrying on failure.

    The request uses the module-level ``client`` and ``deployment`` and sends
    ``CLASSIFIER_SYS`` as the system message and ``prompt`` as the user message.

    Args:
        prompt: User message text.
        retries: Maximum number of attempts.
        sleep_between: Seconds to wait between two consecutive attempts.

    Returns:
        The raw message content of the first non-blank reply, or "" when every
        attempt raised an exception or returned empty/whitespace-only content.
    """
    last_raw = ""
    for attempt in range(1, retries + 1):
        has_retry_left = attempt < retries
        try:
            resp = client.chat.completions.create(
                model=deployment,
                messages=[
                    {"role": "system", "content": CLASSIFIER_SYS},
                    {"role": "user", "content": prompt},
                ],
                max_completion_tokens=30000,
                temperature=1,                # same as in the working test
            )
            raw = resp.choices[0].message.content
            last_raw = raw
            if raw and raw.strip():
                return raw
            if has_retry_left:
                print(f"[Attempt {attempt}] Empty response — retrying in {sleep_between}s...")
            else:
                print(f"[Attempt {attempt}] Empty response — no retries left.")
        except Exception as e:
            # Deliberately broad: any client/network error counts as a failed
            # attempt so that one bad request cannot abort a long scoring run.
            print(f"[Attempt {attempt}] ERROR from model: {e}")
        # Only pause when another attempt will follow; sleeping after the final
        # failure would just delay the caller.
        if has_retry_left:
            time.sleep(sleep_between)
    # if we get here, all attempts failed
    print("[call_model] Failed after retries. Last raw content repr:", repr(last_raw))
    return ""


DataFrame columns: ['id', 'sitting_id', 'date', 'speech_order', 'speaker_name', 'political_group', 'title', 'speech_content', 'language', 'topic', 'macro_topic', 'specific_focus']
Total rows: 163079
Sampling 1000 speeches for LLM scoring...


In [63]:

# ==========================================================
# 4. RUN OVER 1000 SPEECHES AND STORE SCORES
# ==========================================================
def _parse_llm_json(raw):
    """Decode the model reply into a dict.

    The reply is first parsed as-is. If that fails, the span from the first
    "{" to the last "}" is parsed instead, so replies wrapped in Markdown code
    fences or surrounded by extra prose are still usable.

    Raises:
        ValueError: if no JSON can be decoded, or if the decoded value is not
            a JSON object (``json.JSONDecodeError`` is a ``ValueError``).
    """
    try:
        parsed = json.loads(raw)
    except ValueError:
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(raw[start:end + 1])
    if not isinstance(parsed, dict):
        # A list/number/string reply has no .get(); reject it here so the
        # caller can skip the row instead of crashing mid-run.
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


total = len(sample)
for i, (idx, row) in enumerate(sample.iterrows(), start=1):
    print("=" * 80)
    print(f"Processing {i}/{total}  |  ID {row['id']}  |  Group {row['political_group']}  |  Lang {row['language']}")
    print("Excerpt:")
    print(str(row["speech_content"])[:300], "\n")

    prompt = build_user_prompt(
        row["speech_content"],
        row["language"],
        row["political_group"],
        row["macro_topic"],
    )

    raw = call_model(prompt)
    if not raw:
        print("⚠️  No usable response, skipping row.")
        continue

    try:
        parsed = _parse_llm_json(raw)
    except ValueError as e:
        print("⚠️  JSON parse error:", e)
        print("Raw was:", repr(raw))
        continue

    # store in dataframe (rows that were skipped keep their initial None values)
    sample.loc[idx, "conflict_score"] = parsed.get("conflict_score")
    sample.loc[idx, "outgroup_tone_score"] = parsed.get("outgroup_tone_score")
    sample.loc[idx, "extremity_score"] = parsed.get("extremity_score")
    sample.loc[idx, "llm_notes"] = parsed.get("notes")

    # quick progress print
    print("Scores:",
          "conflict =", parsed.get("conflict_score"),
          "| outgroup_tone =", parsed.get("outgroup_tone_score"),
          "| extremity =", parsed.get("extremity_score"))

    # optional: tiny sleep to be nice to the rate limit
    # time.sleep(0.05)

# convert score columns to numeric (in case some are None/strings);
# anything that cannot be converted becomes NaN
for col in ["conflict_score", "outgroup_tone_score", "extremity_score"]:
    sample[col] = pd.to_numeric(sample[col], errors="coerce")

# ==========================================================
# 5. SAVE RESULTS
# ==========================================================
out_csv = "eu_speeches_sample1000_scored.csv"
sample.to_csv(out_csv, index=False)
print(f"\nSaved scored sample to {out_csv}")

# ==========================================================
# 6. FIND MOST EXTREME SPEECHES PER DIMENSION
# ==========================================================
def print_top_extreme(df_in, column, top_n=10):
    """Print a report of the rows with the highest values in ``column``.

    Rows where ``column`` is missing are dropped, the rest are sorted in
    descending order and the first ``top_n`` are printed. Each entry shows the
    speech id, date, political group and language, all three scores, the LLM
    note, and the first 400 characters of the speech.

    Args:
        df_in: DataFrame containing the metadata, score, note and speech columns.
        column: Name of the score column to rank by.
        top_n: Number of rows to print.
    """
    banner = "#" * 80
    print("\n" + banner)
    print(f"TOP {top_n} SPEECHES BY {column.upper()} (highest first)")
    print(banner)

    ranked = df_in.dropna(subset=[column])
    ranked = ranked.sort_values(column, ascending=False)
    ranked = ranked.head(top_n)

    rank = 0
    for _, speech in ranked.iterrows():
        rank += 1
        header = f"\n[{rank}] ID {speech['id']} | date {speech['date']} | group {speech['political_group']} | lang {speech['language']}"
        all_scores = (
            f"conflict={speech['conflict_score']}, "
            f"outgroup={speech['outgroup_tone_score']}, "
            f"extremity={speech['extremity_score']}"
        )
        print(header)
        print(f"{column} = {speech[column]}  |  other scores: " + all_scores)
        print("Notes:", speech["llm_notes"])
        print("Excerpt:")
        print(str(speech["speech_content"])[:400])
        print("-" * 80)

# One report per score dimension, in the same order as before.
for score_column in ("conflict_score", "outgroup_tone_score", "extremity_score"):
    print_top_extreme(sample, score_column, top_n=10)

print("\nAll done.")


Processing 1/1000  |  ID 646674  |  Group PPE  |  Lang CS
Excerpt:
Paní předsedající, vážená paní komisařko, dámy a pánové, všichni víme, že ochrana vnějších hranic patří mezi naše základní úkoly. Pokud má existovat bezpečný volný pohyb uvnitř schengenského prostoru, nemáme jinou možnost než stoprocentně zajistit bezpečnost vnější hranice. Myslím, že o to se země n 

Scores: conflict = 3 | outgroup_tone = 2 | extremity = 3
Processing 2/1000  |  ID 643987  |  Group S&D  |  Lang PL
Excerpt:
Pani Przewodnicząca! Zegar tyka. Za 7 lat i 47 dni wzrost globalnej temperatury przekroczy punkt krytyczny. Książki, z których jeszcze kilka lat temu uczyliśmy się w szkole geografii, dzisiaj są kompletnie nieaktualne. Do takiej katastrofy niestety doprowadził człowiek. Dlatego Europejski Zielony Ła 

Scores: conflict = 4 | outgroup_tone = 2 | extremity = 6
Processing 3/1000  |  ID 710232  |  Group nan  |  Lang FR
Excerpt:
UKIP voted against this measure because this was a proposal for legislation. UK

In [64]:
# Re-print the top-10 report for each score dimension from the scored sample.
for score_column in ("conflict_score", "outgroup_tone_score", "extremity_score"):
    print_top_extreme(sample, score_column, top_n=10)


################################################################################
TOP 10 SPEECHES BY CONFLICT_SCORE (highest first)
################################################################################

[1] ID 633991 | date 2021-03-10 | group NI | lang HR
conflict_score = 9.0  |  other scores: conflict=9.0, outgroup=9.0, extremity=8.0
Notes: Croatian speech attacking EU leadership and a 'campaign of fear'; strong us-vs-them rhetoric, calls for drastic change, with alarmist claims about deaths and unemployment.
Excerpt:
Poštovana predsjedavajuća, poštovani kolege, poštovani građani, vodstvo Europske unije je svojim neodgovornim ponašanjem, neodgovornim odlukama i ponajviše nečinjenjem dovelo do toga da su izgubljeni milijuni radnih mjesta. Izgubljeni su milijuni života koji su posljedica neobavljanja hitnih kirurških operacija, sve pod famoznom kampanjom straha kojom je uništeno gospodarstvo. Socijalne države i s
---------------------------------------------------------------