In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re

In [3]:
# Party abbreviations used in this notebook, one entry per line so additions
# and removals show up as single-line diffs.
# NOTE(review): presumably these correspond to the "(PARTY)" labels found in
# the transcripts - confirm against the extracted party names.
parties = [
    "PVV",
    "GL-PvdA",
    "VVD",
    "NSC",
    "D66",
    "BBB",
    "CDA",
    "SP",
    "DENK",
    "PvdD",
    "FVD",
    "SGP",
    "CU",
    "Volt",
    "JA21",
]

In [None]:
import re
from collections import defaultdict
import os

# Every "*.txt" file in `txt_folder` is read as one transcript; one
# "<party>.txt" file per party is written to `output_folder`.
txt_folder = "txt"
output_folder = "party_speeches"

# Match all speaker turns: speaker (PARTY): speech
# Capture (party, speech)
pattern = re.compile(
    r'(?:(?:De heer|Mevrouw)\s+[^:(]+?)\s+\(([^)]+)\):\s*'  # Match party name
    r'(.*?)(?=(?:De heer|Mevrouw)\s+[^:(]+?\s+\([^)]+\):|$)',  # Match speech up to next speaker
    re.DOTALL
)


def collect_party_speeches(documents):
    """Group the speech text of every speaker turn by party.

    Each document is scanned on its own, so the final turn of one transcript
    stops at the end of that transcript instead of absorbing the opening
    lines of the next one.

    Args:
        documents: Iterable of transcript strings.

    Returns:
        A ``defaultdict(str)`` mapping each party label to all speeches
        attributed to it, in document order, joined by single spaces.
    """
    turns = defaultdict(list)
    for document in documents:
        for match in pattern.finditer(document):
            turns[match.group(1).strip()].append(match.group(2).strip())
    # Join once per party instead of growing a string turn by turn.
    return defaultdict(str, {party: ' '.join(parts) for party, parts in turns.items()})


def party_filename(party):
    """Return a "<party>.txt" file name that is safe to create.

    The party label comes straight from the regex capture, which accepts any
    character except ")". Path separators, control characters (including
    newlines) and characters Windows rejects in file names are replaced by
    "_"; an empty result falls back to "unknown".
    """
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', party).strip()
    return f"{stem or 'unknown'}.txt"


# Read every .txt transcript. The listing is sorted because os.listdir()
# returns entries in arbitrary order, which would otherwise make the order of
# speeches in the output files vary between runs and machines.
documents = []
if os.path.isdir(txt_folder):
    for filename in sorted(os.listdir(txt_folder)):
        if filename.endswith(".txt"):
            with open(os.path.join(txt_folder, filename), "r", encoding="utf-8") as f:
                documents.append(f.read())
else:
    print(f"Input folder not found: {txt_folder!r} - no transcripts were read.")

# Concatenation of all transcripts, kept under its original name in case a
# later cell reads it. Parsing below works per document, not on this string.
text = "".join(document + "\n" for document in documents)

party_speeches = collect_party_speeches(documents)

# Save the result to separate text files for each party
os.makedirs(output_folder, exist_ok=True)

for party, speech in party_speeches.items():
    output_file = os.path.join(output_folder, party_filename(party))
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(speech.strip())

if party_speeches:
    print(f"Party speeches have been saved to the folder: {output_folder}")
else:
    print("No speaker turns were found; no party files were written.")


Party speeches have been saved to the folder: party_speeches


In [2]:
# Load the fine-tuned sequence classifier and the tokenizer used with it.
# NOTE(review): the output recorded for this cell is close to uniform across
# labels (top label around 3.6%), unlike the sample values in the comment
# further down - confirm that the classification head weights were actually
# loaded (e.g. look for a "newly initialized" warning from from_pretrained).
model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2024-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

sentence = "We will restore funding to the Global Environment Facility and the Intergovernmental Panel on Climate Change, to support critical climate science research around the world"

inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

# Inference only: disable autograd so the forward pass does not record a
# computation graph for this large model.
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the label dimension; row 0 is the single input sentence.
scores = torch.softmax(logits, dim=1)[0].tolist()

# Map label name -> probability in percent (2 decimals), highest first.
probabilities = {model.config.id2label[index]: round(score * 100, 2) for index, score in enumerate(scores)}
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
print(probabilities)
# {'501 - Environmental Protection: Positive': 67.56, '411 - Technology and Infrastructure': 14.03, '107 - Internationalism: Positive': 13.58, '416 - Anti-Growth Economy: Positive': 2.24...

# Take the argmax along the label dimension explicitly rather than over the
# flattened tensor; identical for a single sentence.
predicted_class = model.config.id2label[logits.argmax(dim=1).item()]
print(predicted_class)

config.json:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

{'414 - Economic Orthodoxy': 3.64, '601 - National Way of Life: Positive': 3.22, '702 - Labour Groups: Negative': 3.07, '705 - Underprivileged Minority Groups': 2.62, '703 - Agriculture and Farmers: Positive': 2.55, '405 - Corporatism/ Mixed Economy': 2.43, '104 - Military: Positive': 2.32, '411 - Technology and Infrastructure': 2.26, '410 - Economic Growth: Positive': 2.24, '106 - Peace': 2.22, '415 - Marxist Analysis: Positive': 2.22, '413 - Nationalisation': 2.21, '606 - Civic Mindedness: Positive': 2.19, '203 - Constitutionalism: Positive': 2.09, '605 - Law and Order: Positive': 2.02, '409 - Keynesian Demand Management': 2.01, '503 - Equality: Positive': 2.01, '701 - Labour Groups: Positive': 1.98, '402 - Incentives': 1.96, '401 - Free Market Economy': 1.92, '602 - National Way of Life: Negative': 1.92, '303 - Governmental and Administrative Efficiency': 1.91, '603 - Traditional Morality: Positive': 1.81, '604 - Traditional Morality: Negative': 1.81, '406 - Protectionism: Positive'