# Few-Shot Prompting LLM Measurement of European Party Ideologies - Text Scaling on Ideological Dimensions

In [15]:
import os
import pandas as pd
from enum import Enum
from typing import List
from pydantic import BaseModel, Field
from openai import OpenAI
import instructor
from dotenv import load_dotenv
from typing import Optional
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import time
import tiktoken
from tqdm import tqdm
from typing import Tuple




In [2]:
#DATA

# Load CSV
eu_batch_1 = pd.read_csv("eu_batch_1.csv", encoding='latin1')
eu_batch_1.head()

print(len(eu_batch_1))

321


In [24]:
#Full sample size: 3778 as in Burton et al. 2020

eu_batch_1_sample = eu_batch_1.sample(n=30, random_state=42)
eu_batch_1_sample.head()

Unnamed: 0,partyname,party,year,countryname,nationalist,nationalist_type,regionalist,regionalist_type,ideology,electoral_success,confidence,key_information,text,manifesto_id,txt
173,Northern League,32720,1994,Italy,nationalist,ethnic_nationalism,regionalist,secessionism,right,high,0.9,"Northern League, Italy, 1994, national soverei...",SINTESI PROGRAMMA ECONOMIA DEBITO PUBBLICO 1...,32720_1994,
132,Soldiers of Destiny,53620,1992,Ireland,nationalist,,,,right,marginal,0.8,"Soldiers of Destiny, Ireland, 1992",FIANNA FAIL â¢ THE REPUBLICAN PARTY THE SlX-...,53620_1992,
197,Basque Country Unite,33095,2016,Spain,nationalist,ethnic_nationalism,regionalist,autonomy,center_left,moderate,0.85,"Basque Country, national identity, regional au...",Oportunidades para: posibilitar un cambio real...,33095_2016,
9,Flemish Interest,21917,2014,Belgium,nationalist,ethnic_nationalism,regionalist,secessionism,far_right,high,0.9,"Flemish nationalism, Focus on the interests of...","1. Vlaanderen, staat in Europa en de wereld Si...",21917_2014,
104,Family of the Irish,53520,1977,Ireland,nationalist,,,,center,low,0.9,"Family of the Irish, Ireland, 1977",THE NATIONAL COALITION GOVERNMENT. ... TO ACH...,53520_1977,


# LLM Query

In [5]:
# For replication: insert API key here

# Load environment variables from the .env file
load_dotenv()

# Get the API key from the environment
api_key = os.getenv("OPENAI_API_KEY")

# Patch OpenAI client with instructor
client = instructor.patch(OpenAI(api_key=api_key))

completion = client.chat.completions.create(
  model="gpt-4o-mini",
  store=True,
  messages=[
    {"role": "user", "content": "write a haiku about ai"}
  ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Silent circuits hum,  \nAwakening thoughts anew—  \nDreams shaped by the code.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)


In [None]:
# 1. Enum Definitions
class EthnicSentiment(str, Enum):
    absent = "absent"
    weak = "weakly_present"
    moderate = "moderately_present"
    strong = "strongly_present"
    very_strong = "very_strongly_present"
    unclear = "unclear"

class CivicSentiment(str, Enum):
    absent = "absent"
    weak = "weakly_present"
    moderate = "moderately_present"
    strong = "strongly_present"
    very_strong = "very_strongly_present"
    unclear = "unclear"

# 2. Classification Output Schema
class TextClassification(BaseModel):
    ethnic_category: EthnicSentiment
    civic_category: CivicSentiment
    confidence: float = Field(ge=0, le=1, description="Confidence score for the classification")
    explanation: Optional[str] = Field(description="Brief explanation of why the categories were chosen")

# 3. Updated System Prompt
SYSTEM_PROMPT_TEXT = """
You are a political scientist with expertise in nationalism studies. Your task is to classify excerpts from European nationalist party manifestos based on the presence and intensity of **ethnic nationalism** and **civic nationalism**.

---

**Definitions**:

- **Ethnic Nationalism**: Reflects exclusionary or anti-immigrant views, especially those emphasizing ancestry, bloodline, or cultural purity.
- **Civic Nationalism**: Reflects inclusive views toward immigrants, emphasizing legal equality, shared civic values, and integration.

---

**Classification Instructions**:

For each text:
1. Classify the intensity of both **ethnic** and **civic** nationalist sentiment:
   - `absent`
   - `weakly_present`
   - `moderately_present`
   - `strongly_present`
   - `very_strongly_present`
   - `unclear`
   
2. Include:
   - A `confidence` score (0.0 to 1.0)
   - A short explanation (1–2 sentences) justifying your classification.

---

**Examples**:

Example 1 (Ethnic Nationalism):
> "We must preserve the purity of our national blood and protect our heritage from foreign dilution."

Output:
```json
{
  "ethnic_category": "very_strongly_present",
  "civic_category": "absent",
  "confidence": 0.95,
  "explanation": "The text explicitly emphasizes racial purity and exclusion of foreigners."
}

Example 2 (Civic Nationalism):

"Everyone who shares our values and contributes to our society should be considered one of us, regardless of origin."

Output:
```json
{
  "ethnic_category": "absent",
  "civic_category": "strongly_present",
  "confidence": 0.9,
  "explanation": "The statement clearly supports inclusion based on civic criteria."
}

 Example 3 (Unclear):

"Our nation must remain strong and united."
{
  "ethnic_category": "unclear",
  "civic_category": "unclear",
  "confidence": 0.4,
  "explanation": "The language is too vague to confidently identify ethnic or civic nationalism."
}
"""
# Classification function
def classify_text(text: str) -> TextClassification:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=TextClassification,
        temperature=0,
        max_retries=2,
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT_TEXT,
            },
            {"role": "user", "content": text}
        ]
    )
    return response

In [22]:
tqdm.pandas()  # progress bar 

# --- Adjust to your OpenAI API quota ---
TOKENS_PER_MINUTE = 2000000 
REQUESTS_PER_MINUTE = 5000  # OpenAI's request limit per model
MODEL = "gpt-4o-mini"

def estimate_tokens(text: str, model: str = MODEL) -> int:
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(text)) + 100  # 100-token buffer for response + metadata

last_request_time = time.time()
tokens_used_last_minute = []
requests_last_minute = []

def classify_row_with_throttle(text: str) -> Tuple[str, str, float, str]:
    global last_request_time, tokens_used_last_minute, requests_last_minute

    # Estimate token usage
    token_estimate = estimate_tokens(text)

    now = time.time()

    # Clean out entries older than 60s
    tokens_used_last_minute = [
        (t, tokens) for (t, tokens) in tokens_used_last_minute if now - t < 60
    ]
    requests_last_minute = [t for t in requests_last_minute if now - t < 60]

    total_tokens_used = sum(tokens for _, tokens in tokens_used_last_minute)

    # Wait if over token or request limit
    while total_tokens_used + token_estimate > TOKENS_PER_MINUTE or len(requests_last_minute) >= REQUESTS_PER_MINUTE:
        time.sleep(1)
        now = time.time()
        tokens_used_last_minute = [
            (t, tokens) for (t, tokens) in tokens_used_last_minute if now - t < 60
        ]
        requests_last_minute = [t for t in requests_last_minute if now - t < 60]
        total_tokens_used = sum(tokens for _, tokens in tokens_used_last_minute)

    # Make API call
    try:
        result = classify_text(text)
        tokens_used_last_minute.append((now, token_estimate))
        requests_last_minute.append(now)

        return (
            result.ethnic_category.value,
            result.civic_category.value,
            result.confidence,
            result.explanation,
        )
    except Exception as e:
        return "error", "error", 0.0, str(e)

In [27]:
eu_batch_1[['ethnic_llm', 'civic_llm', 'llm_confidence', 'llm_explanation']] = (
    eu_batch_1['text']
    .progress_apply(lambda x: pd.Series(classify_row_with_throttle(x)))
)

eu_batch_1.head()

100%|██████████| 321/321 [17:33<00:00,  3.28s/it]


Unnamed: 0,partyname,party,year,countryname,nationalist,nationalist_type,regionalist,regionalist_type,ideology,electoral_success,confidence,key_information,text,manifesto_id,txt,ethnic_llm,civic_llm,llm_confidence,llm_explanation
0,Flemish Interest,21917,1989,Belgium,,,,,,,,,| VLAAMS G_ VB. 24 | PROGRAMMA _ VERKIEZINGEN ...,21914_1989,2.0,very_strongly_present,absent,0.95,The manifesto strongly emphasizes the preserva...
1,Flemish Interest,21914,1991,Belgium,nationalist,ethnic_nationalism,regionalist,secessionism,far_right,high,0.9,"Flemish Bloc, Belgium, 1991, emphasis on Flemi...",Uit zelfverdediging. Vlaams Blok. Eigen volk e...,21914_1991,,very_strongly_present,absent,0.95,The manifesto strongly emphasizes the superior...
2,Flemish Interest,21917,1994,Belgium,,,,,,,,,Programmabrochure âa â ee ee il il La a Wi...,21914_1994,2.0,very_strongly_present,absent,0.95,The manifesto strongly emphasizes ethnic ident...
3,Flemish Interest,21914,1995,Belgium,nationalist,ethnic_nationalism,regionalist,secessionism,far_right,high,0.9,"Flemish Bloc, Belgium, 1995, national sovereig...",Nu afrekenen! Vlaams Blok Eigen volk eerst ...,21914_1995,,very_strongly_present,absent,0.95,The manifesto strongly emphasizes ethnic natio...
4,Flemish Interest,21917,1999,Belgium,,,,,,,,,##21914\r\n\r\n\r\nOnze visie op Europa en het...,21914_1999,1.0,strongly_present,weakly_present,0.85,The text emphasizes the importance of Flemish ...


In [28]:
eu_batch_1.to_csv("eu_batch_1_llm.csv", index=False)

In [30]:
print(eu_batch_1['ethnic_llm'].value_counts(dropna=False))

print(eu_batch_1['civic_llm'].value_counts(dropna=False))

ethnic_llm
strongly_present         113
absent                   105
weakly_present            62
very_strongly_present     21
moderately_present        17
error                      3
Name: count, dtype: int64
civic_llm
strongly_present         169
moderately_present        80
weakly_present            54
absent                    14
error                      3
very_strongly_present      1
Name: count, dtype: int64


In [None]:
# Filter out rows with 'unclear'
eu_batch_1_sample = eu_batch_1_sample[eu_batch_1_sample['llm_prediction'] != 'unclear'].copy()

# Step 2: Map the remaining labels
eu_batch_1_sample['llm_prediction'] = eu_batch_1_sample['llm_prediction'].map({
    'moral_sentiment': 'moral',
    'not_moral_sentiment': 'non-moral'
})
eu_batch_1_sample.head()

Unnamed: 0,corpus,clean_text,diffusion,retweet_count,followers_count,verified,media_type,urls_url,hashtags,ME,moral,emo,XYZcount,n_char,llm_prediction,llm_confidence
36567,covid,herd immunity first there is no vaccine second...,4,4,489,0,,,0,0,0,0,5,252,non-moral,0.8
20423,covid,my aunt just sent me this from peru people are...,2852,2852,599,0,,,1,0,1,0,1,120,non-moral,0.8
45069,covid,officials are warning we need to be especially...,10,10,2394,0,,,1,0,0,0,2,192,non-moral,0.9
74238,covid,a century later trash collection is as reliabl...,74,74,931,0,,,0,0,0,0,5,202,non-moral,0.8
6201,covid,boeing chief executive dave calhoun said tuesd...,38,38,2086,0,,,0,0,1,0,8,255,non-moral,0.8


In [None]:
eu_batch_1_sample.to_csv("eu_batch_1_llm.csv", index=False)

In [None]:
print(eu_batch_1_sample['llm_prediction'].value_counts(dropna=False))

llm_prediction
non-moral    818
moral        176
Name: count, dtype: int64


In [None]:
# dimensions of the sample
print(f"Sample size: {len(eu_batch_1_sample)}")

Sample size: 994


In [None]:
eu_batch_1_sample.to_csv("eu_batch_1_llm.csv", index=False)