## LLM-Based Classification of Political Parties based on structured meta-data from the Manifesto Project Database (MPDS)


uses environment: Python 3.12.1

In [1]:
import os
import pandas as pd
from enum import Enum
from typing import List
from pydantic import BaseModel, Field
from openai import OpenAI
import instructor
from dotenv import load_dotenv
from typing import Optional
from tqdm import tqdm

In [2]:
# Load environment variables from the .env file
load_dotenv()

# Get the API key from the environment
api_key = os.getenv("OPENAI_API_KEY")

# Patch OpenAI client with instructor
client = instructor.patch(OpenAI(api_key=api_key))

completion = client.chat.completions.create(
  model="gpt-4o",
  store=True,
  messages=[
    {"role": "user", "content": "write a haiku about ai"}
  ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Silent circuits hum,  \nData dreams woven in code—  \nMachine minds awaken.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)


In [3]:
# Load CSV
brics = pd.read_csv("brics_articles.csv")

In [None]:
brics.head()

Unnamed: 0,text,source,date
0,"The BRICS bloc—Brazil, Russia, India, China, a...",Worldview Journal,2023-04-26
1,BRICS has made headlines for its ambitious pla...,Equinox Daily,2023-02-25
2,The BRICS alliance claims to champion multipol...,The Globe Today,2023-02-13
3,The BRICS alliance occupies a curious place in...,New Dawn Post,2023-12-22
4,"While BRICS garners headlines, the real story ...",The Continental Observer,2023-01-19


In [13]:
data = {
    "partyname": [
        "Scottish National Party",
        "UK Independence Party",
        "Conservative Party",
        "Basque Nationalist Party",
        "South Tyrolean People's Party",
        "Reform UK"
    ],
    "party": [
        11111,
        22222,
        33333,
        44444,
        55555,
        66666
    ],
    "partyabbrev": [
        "SNP",
        "UKIP",
        "Cons",
        "PNV",
        "SVP",
        "Reform UK"
    ],
    "year": [
        2012,
        2024,
        1992,
        1982,
        1972,
        2024
    ],
    "countryname": [
        "United Kingdom",
        "United Kingdom",
        "United Kingdom",
        "Spain",
        "Italy",
        "United Kingdom"
    ]
}

df = pd.DataFrame(data)

df.head(10)

Unnamed: 0,partyname,party,partyabbrev,year,countryname
0,Scottish National Party,11111,SNP,2012,United Kingdom
1,UK Independence Party,22222,UKIP,2024,United Kingdom
2,Conservative Party,33333,Cons,1992,United Kingdom
3,Basque Nationalist Party,44444,PNV,1982,Spain
4,South Tyrolean People's Party,55555,SVP,1972,Italy
5,Reform UK,66666,Reform UK,2024,United Kingdom


# REAL DATA: #



In [3]:
# Load CSV
parties = pd.read_csv("merged_corpus.csv")
parties.head()

Unnamed: 0,text,manifesto_id,language,translation_en,year,country,countryname,party,partyname,partyabbrev
0,VALMANIFEST Huvuduppgifterna i höstens val är...,11220_1960,swedish,False,1960,11,Sweden,11220,Communist Party of Sweden,SKP
1,Utdrag ur valtidningen AVGÖRANDET INFÖR 60-TA...,11320_1960,swedish,False,1960,11,Sweden,11320,Social Democratic Labour Party,SAP
2,MÖJLIGHETERNAS ÅRTIONDE Sextiotalet är möjli...,11420_1960,swedish,False,1960,11,Sweden,11420,People’s Party,FP
3,Utdrag ur valbroschyren: HÖGERALTERNATIVET - ...,11620_1960,swedish,False,1960,11,Sweden,11620,Right Party,
4,CENTERPARTIETS Valprogram inför valet den 18 ...,11810_1960,swedish,False,1960,11,Sweden,11810,Centre Party,CP


# QUERY #

In [10]:
from typing import Optional
from pydantic import BaseModel, Field

class Nationalist(str, Enum):
    NATIONALIST = "nationalist"
    NOT_NATIONALIST = "not_nationalist"

class TextClassification(BaseModel):
    nationalist: Nationalist
    region_name: Optional[str] = Field(
        default=None,
        description="Name of the substate region the party represents, if applicable. Otherwise 'none' or null."
    )
    confidence: float = Field(ge=0, le=1)
    explanation: Optional[str] = Field(
        description="Brief reasoning (1–2 sentences) supporting the classification."
    )
  

SYSTEM_PROMPT_TEXT = """
You are a political science expert specializing in European political parties and nationalism since 1945.

Your task is to classify political parties based on structured metadata (party name, abbreviation, country, and year). For each party, provide a structured classification in the following format:

---

Output schema:
{
  "nationalist": "nationalist" | "not_nationalist",
  "region_name": "<name of substate region>" | "none",
  "confidence": <float from 0 to 1>,
  "explanation": "<1–2 sentence justification>"
}
---

Classification Instructions:

1. **Is the party nationalist?**
   - Return `"nationalist"` if the party emphasizes national sovereignty, prioritizes the interests of the national in-group or frames its politics around protecting or restoring the nation’s identity.
   - A party can be nationalist even if it does **not** seek independence or represent a substate region.

2. **If the party is nationalist, does it represent a specific substate region?**
   - If yes, return the **name of the region** (e.g., "Basque Country", "Catalonia", "South Tyrol", "Scotland", etc.).
   - If not, return `"none"`.

3. **Provide a confidence score** (between 0.0 and 1.0) reflecting how certain you are.

4. **Provide a short explanation** (1–2 sentences) explaining your classification, focusing on key ideological cues or known political positions from the given year.

Do **not** infer information beyond the party-year. If you are unsure, reflect that uncertainty in a lower confidence score.

---

Few-Shot Examples:

Example 1:
Party name: Basque Nationalist Party  
Abbreviation: PNV  
Country: Spain  
Year: 2018  

Response:
{
  "nationalist": "nationalist",
  "region_name": "Basque Country",
  "confidence": 0.95,
  "explanation": "The PNV is a Basque nationalist party seeking greater autonomy for the Basque Country within Spain."
}

Example 2:
Party name: Scottish National Party  
Abbreviation: SNP  
Country: United Kingdom  
Year: 2020  

Response:
{
  "nationalist": "nationalist",
  "region_name": "Scotland",
  "confidence": 0.97,
  "explanation": "The SNP is a civic nationalist party advocating Scottish independence through democratic means."
}

Example 3:
Party name: Socialist Party  
Abbreviation: PS  
Country: France  
Year: 1997  

Response:
{
  "nationalist": "not_nationalist",
  "region_name": "none",
  "confidence": 0.90,
  "explanation": "The PS is a mainstream social-democratic party focused on national governance, not nationalism."
}

Example 4:
Party name: Südtiroler Volkspartei  
Abbreviation: SVP  
Country: Italy  
Year: 1995  

Response:
{
  "nationalist": "nationalist",
  "region_name": "South Tyrol",
  "confidence": 0.93,
  "explanation": "The SVP represents the German-speaking population of South Tyrol and advocates regional autonomy."
}

Example 5:
Party name: National Front  
Abbreviation: FN  
Country: France  
Year: 2012  

Response:
{
  "nationalist": "nationalist",
  "region_name": "none",
  "confidence": 0.92,
  "explanation": "The FN is a far-right nationalist party focused on ethnic identity and national sovereignty, not regional autonomy."
}

"""

def classify_party(party_name: str, abbrev: str, year: int, country: str) -> TextClassification:
    user_input = f"""Party name: {party_name}\nAbbreviation: {abbrev}\nCountry: {country}\nYear: {year}"""
    response = client.chat.completions.create(
        model="gpt-4o",
        response_model=TextClassification,
        temperature=0,
        max_retries=3,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_TEXT},
            {"role": "user", "content": user_input}
        ]
    )
    return response


In [11]:
# Classify each row

from typing import Tuple

def classify_party_row(row) -> Tuple[str, str, float, str]:
    try:
        result = classify_party(
            party_name=row['partyname'],
            abbrev=row['partyabbrev'],
            year=int(row['year']),
            country=row['countryname']
        )
        return (
            result.nationalist.value,
            result.region_name if result.region_name else "none",
            result.confidence,
            result.explanation if result.explanation else ""
        )
    except Exception as e:
        # Log or return fallback values if there's an error
        return "error", "error", 0.0, str(e)

In [33]:
# Load CSV
parties = pd.read_csv("merged_corpus.csv")
parties.head()
print(len(parties))

2315


In [25]:
parties = parties.sample(n=250, random_state=42)
print(len(parties))

250


In [34]:
from tqdm import tqdm
tqdm.pandas()

# Apply row-wise and unpack results into new columns
parties[['nationalist', 'region_name', 'confidence', 'explanation']] = (
    parties.progress_apply(lambda row: pd.Series(classify_party_row(row)), axis=1)
)

parties.head()

100%|██████████| 2315/2315 [1:13:25<00:00,  1.90s/it]


Unnamed: 0,text,manifesto_id,language,translation_en,year,country,countryname,party,partyname,partyabbrev,nationalist,region_name,confidence,explanation
0,VALMANIFEST Huvuduppgifterna i höstens val är...,11220_1960,swedish,False,1960,11,Sweden,11220,Communist Party of Sweden,SKP,not_nationalist,none,0.85,The Communist Party of Sweden in 1960 was focu...
1,Utdrag ur valtidningen AVGÖRANDET INFÖR 60-TA...,11320_1960,swedish,False,1960,11,Sweden,11320,Social Democratic Labour Party,SAP,not_nationalist,none,0.85,The SAP is a mainstream social-democratic part...
2,MÖJLIGHETERNAS ÅRTIONDE Sextiotalet är möjli...,11420_1960,swedish,False,1960,11,Sweden,11420,People’s Party,FP,not_nationalist,none,0.85,"The People's Party in Sweden, known as the Lib..."
3,Utdrag ur valbroschyren: HÖGERALTERNATIVET - ...,11620_1960,swedish,False,1960,11,Sweden,11620,Right Party,,not_nationalist,none,0.75,The Right Party in Sweden during 1960 was prim...
4,CENTERPARTIETS Valprogram inför valet den 18 ...,11810_1960,swedish,False,1960,11,Sweden,11810,Centre Party,CP,not_nationalist,none,0.85,The Centre Party in Sweden is a centrist polit...


In [36]:
print(parties['region_name'].value_counts(dropna=False))

region_name
none                    2198
Flanders                  21
Catalonia                 17
Basque Country            12
Canary Islands             8
Northern Ireland           7
Wallonia                   7
Republika Srpska           7
Vojvodina                  5
Scotland                   5
Galicia                    5
Slavonia and Baranja       3
Istria                     3
Wales                      3
South Tyrol                2
Sandžak                    2
Aragon                     2
Ticino                     2
Geneva                     2
Aosta Valley               1
Andalusia                  1
Cantabria                  1
Sandzak                    1
Name: count, dtype: int64


In [4]:
print(parties['nationalist'].value_counts(dropna=False))

nationalist
not_nationalist    1907
nationalist         408
Name: count, dtype: int64


In [None]:
#parties.to_csv("parties_regions_classification.csv", index=False)

In [2]:
parties = pd.read_csv("parties_regions_classification.csv")

In [3]:
print(len(parties))

2315


In [3]:
# Filter for nationalist and regionally-identified parties
regionalist_nationalist = parties[
    (parties['nationalist'] == 'nationalist') &
    (parties['region_name'].notnull()) &
    (parties['region_name'].str.lower() != 'none')
]

# Count number of party-years per region
region_counts = (
    regionalist_nationalist
    .groupby(['countryname', 'region_name'])
    .size()
    .reset_index(name='n_party_years')
    .sort_values(by='n_party_years', ascending=False)
)

# Display the table
print(region_counts)

           countryname           region_name  n_party_years
0              Belgium              Flanders             21
15               Spain             Catalonia             17
12               Spain        Basque Country             12
13               Spain        Canary Islands              8
2   Bosnia-Herzegovina      Republika Srpska              7
1              Belgium              Wallonia              7
19      United Kingdom      Northern Ireland              7
16               Spain               Galicia              5
9               Serbia             Vojvodina              5
20      United Kingdom              Scotland              5
3              Croatia                Istria              3
4              Croatia  Slavonia and Baranja              3
21      United Kingdom                 Wales              3
6                Italy           South Tyrol              2
18         Switzerland                Ticino              2
17         Switzerland                Ge

Token Count

In [None]:
import tiktoken


In [16]:
#Input tokens

encoding = tiktoken.encoding_for_model("gpt-4o")  # or "gpt-4o"

system_prompt = SYSTEM_PROMPT_TEXT  # your full SYSTEM_PROMPT_TEXT
user_prompt = f"""
Party: 12345
Party Name: Basque Nationalist Party
Abbreviation: PNV
Country: Spain
Year: 2004
"""

tokens = encoding.encode(system_prompt)
system_tokens = len(tokens)

tokens = encoding.encode(user_prompt)
user_tokens = len(tokens)

total_tokens = system_tokens + user_tokens
print(f"Total tokens: {total_tokens}")

Total tokens: 655


In [19]:
# Output tokens

encoding = tiktoken.encoding_for_model("gpt-4o")

def estimate_output_tokens(row):
    output_text = f"""
    category: {row['nationalist']}
    category_type: {row['nationalist_type']}
    subcategory: {row['regionalist']}
    subcategory_type: {row['regionalist_type']}
    ideology: {row['ideology']}
    electoral_success: {row['electoral_success']}
    confidence: {row['confidence']}
    key_information: {row['key_information']}
    """.strip()
    return len(encoding.encode(output_text))

# Apply to all rows
results_df['estimated_output_tokens'] = results_df.apply(estimate_output_tokens, axis=1)

# Average output token count
avg_output_tokens = results_df['estimated_output_tokens'].mean()
print(f"Average output tokens per request: {avg_output_tokens:.1f}")

Average output tokens per request: 91.2


calculates the average number of output tokens across all rows =  91.2

GPT-4o-mini - Input tokens price (per million tokens = $0.15) 
            - Output tokens price (per million tokens = $0.60) 

For 800 observations: Input: 800 * 655 * 0.15 / 1,000,000 = 0.01179
                      Output: 800 * 91.2 * 0.60 / 1,000,000 = 0.043776
Total = $0.0556



GPT-4o  - Input tokens price (per million tokens = $2.5)
        - Output tokens price (per million tokens = $10) 

For 800 observations: Input: 800*655*2.5/1,000,000 = 1.31
                      Output: 800*91.2*10/1,000,000 = 0.73
Total = $2.04

2343 observations:

🧠 GPT-4o-mini
Input tokens: 2,343 × 655 × $0.15 / 1,000,000
= $0.22998

Output tokens: 2,343 × 91.2 × $0.60 / 1,000,000
= $0.12828

Total cost (GPT-4o-mini) ≈ $0.36

🧠 GPT-4o
Input tokens: 2,343 × 655 × $2.5 / 1,000,000
= $3.83266

Output tokens: 2,343 × 91.2 × $10 / 1,000,000
= $2.138376

Total cost (GPT-4o) ≈ $5.97

Log failed requests to retry later if needed.

# Real Application #

In [19]:
# Classify each row

def row_to_prompt(row):
    return f"""
    Party Name: {row['partyname']}
    Country: {row['countryname']}
    Year: {row['year']}
    """.strip()


# Classify each row with a progress bar
results = []
for _, row in tqdm(parties.iterrows(), total=len(parties), desc="Classifying parties"):
    party_prompt = row_to_prompt(row)
    try:
        classification = classify_text(party_prompt)
        results.append({
            "partyname": row["partyname"],
            "party": row["party"],
            "year": row["year"],
            "countryname": row["countryname"],
            "nationalist": classification.category,
            "nationalist_type": classification.category_type,
            "regionalist": classification.subcategory,
            "regionalist_type": classification.subcategory_type,
            "ideology": classification.ideology,
            "electoral_success": classification.electoral_success,
            "confidence": classification.confidence,
            "key_information": ", ".join(classification.key_information),
        })
        
    except Exception as e:
        print(f"Failed on {row['partyname']} ({row['year']}, {row['countryname']}): {e}")

# Create DataFrame with results
parties_df = pd.DataFrame(results)

# Save to CSV
#results_df.to_csv("brics_articles_classified.csv", index=False)

Classifying parties: 100%|██████████| 2315/2315 [1:02:47<00:00,  1.63s/it]


In [18]:


len(parties)

2315

In [20]:
parties_df.head()

Unnamed: 0,partyname,party,year,countryname,nationalist,nationalist_type,regionalist,regionalist_type,ideology,electoral_success,confidence,key_information
0,Communist Party of Sweden,11220,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.FAR_LEFT,SuccessLevel.LOW,0.9,"Communist Party, Sweden, 1960, far-left ideolo..."
1,Social Democratic Labour Party,11320,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.LEFT,SuccessLevel.HIGH,0.9,"Social Democratic, Labour Party, Sweden, 1960"
2,People’s Party,11420,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"People’s Party, Sweden, 1960"
3,Right Party,11620,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.RIGHT,SuccessLevel.LOW,0.9,"Right Party, Sweden, 1960"
4,Centre Party,11810,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"Centre Party, Sweden, 1960"


In [23]:
len(parties_df)

2315

In [24]:
# Merge text column from 'parties' into 'parties_df'
merged_df = parties_df.merge(
    parties[['party', 'year', 'text']],
    on=['party', 'year'],
    how='left'
)

In [None]:
merged_df.head()

Unnamed: 0,partyname,party,year,countryname,nationalist,nationalist_type,regionalist,regionalist_type,ideology,electoral_success,confidence,key_information,text
0,Communist Party of Sweden,11220,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.FAR_LEFT,SuccessLevel.LOW,0.9,"Communist Party, Sweden, 1960, far-left ideolo...",VALMANIFEST Huvuduppgifterna i höstens val är...
1,Social Democratic Labour Party,11320,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.LEFT,SuccessLevel.HIGH,0.9,"Social Democratic, Labour Party, Sweden, 1960",Utdrag ur valtidningen AVGÖRANDET INFÖR 60-TA...
2,People’s Party,11420,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"People’s Party, Sweden, 1960",MÖJLIGHETERNAS ÅRTIONDE Sextiotalet är möjli...
3,Right Party,11620,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.RIGHT,SuccessLevel.LOW,0.9,"Right Party, Sweden, 1960",Utdrag ur valbroschyren: HÖGERALTERNATIVET - ...
4,Centre Party,11810,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"Centre Party, Sweden, 1960",CENTERPARTIETS Valprogram inför valet den 18 ...


In [26]:
len(parties_df)

2315

In [27]:
parties_df = merged_df

In [28]:
parties_df.head()

Unnamed: 0,partyname,party,year,countryname,nationalist,nationalist_type,regionalist,regionalist_type,ideology,electoral_success,confidence,key_information,text
0,Communist Party of Sweden,11220,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.FAR_LEFT,SuccessLevel.LOW,0.9,"Communist Party, Sweden, 1960, far-left ideolo...",VALMANIFEST Huvuduppgifterna i höstens val är...
1,Social Democratic Labour Party,11320,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.LEFT,SuccessLevel.HIGH,0.9,"Social Democratic, Labour Party, Sweden, 1960",Utdrag ur valtidningen AVGÖRANDET INFÖR 60-TA...
2,People’s Party,11420,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"People’s Party, Sweden, 1960",MÖJLIGHETERNAS ÅRTIONDE Sextiotalet är möjli...
3,Right Party,11620,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.RIGHT,SuccessLevel.LOW,0.9,"Right Party, Sweden, 1960",Utdrag ur valbroschyren: HÖGERALTERNATIVET - ...
4,Centre Party,11810,1960,Sweden,Nationalist.NOT_NATIONALIST,,,,Ideology.CENTER,SuccessLevel.MODERATE,0.9,"Centre Party, Sweden, 1960",CENTERPARTIETS Valprogram inför valet den 18 ...


In [29]:
parties_df.to_csv("parties_df.csv", index=False)