In [16]:
%%capture
!pip install transformers

In [17]:
%%capture
!pip install datasets

In [18]:
import pandas as pd

In [19]:
# Read the cleaned manifesto sentences and derive two helper columns:
#   sentence_num - 1-based running position of the row within its manifesto
#   sentence_id  - "<manifesto>_<sentence_num>"
df = (
    pd.read_csv("data/manifestos_clean.csv")
    .assign(sentence_num=lambda frame: frame.groupby('manifesto').cumcount() + 1)
    .assign(sentence_id=lambda frame: frame['manifesto'] + '_' + frame['sentence_num'].astype(str))
)

In [20]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from tqdm import tqdm


# Lookup from a class index (0-21) to the code string it stands for.
# The codes are not simply index + 1 ('11' and '22' are absent and the last
# entry is '999'), so the mapping is built by enumerating them in index order.
CAP_NUM_DICT = dict(enumerate([
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
    '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
    '23', '999',
]))



# Load the tokenizer and model
# The tokenizer is loaded from the 'xlm-roberta-large' checkpoint; the
# classifier weights are loaded from the poltextlab checkpoint named below.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

# The classification head is sized to one output per CAP_NUM_DICT entry.
# NOTE(review): with ignore_mismatched_sizes=True a checkpoint whose head has a
# different number of labels would presumably load without an error and leave
# the head untrained - confirm the checkpoint really has len(CAP_NUM_DICT) labels.
_model_kwargs = {
    'num_labels': len(CAP_NUM_DICT),
    'problem_type': "multi_label_classification",
    'ignore_mismatched_sizes': True,
}
model = AutoModelForSequenceClassification.from_pretrained(
    'poltextlab/xlm-roberta-large-german-party-cap-v3',
    **_model_kwargs,
)

# Function to tokenize the dataset
def tokenize_dataset(data: pd.DataFrame, text_column: str, max_length: int = 128):
    """Tokenize one text column of ``data`` with the module-level ``tokenizer``.

    Every text is truncated and padded to exactly ``max_length`` tokens
    (``truncation=True``, ``padding="max_length"``) and the encoding is
    returned as PyTorch tensors (``return_tensors='pt'``).

    Args:
        data: Frame holding the texts.
        text_column: Name of the column in ``data`` that contains the texts.
        max_length: Token length every sequence is truncated/padded to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The tokenizer output for the whole column.

    Raises:
        ValueError: If ``max_length`` is not positive, or if the column holds
            entries that are not strings (for example NaN from empty CSV cells).
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    texts = data[text_column].tolist()

    # Fail early with a readable message instead of an opaque error raised
    # from inside the tokenizer when it meets a non-string entry.
    non_string_count = sum(not isinstance(text, str) for text in texts)
    if non_string_count:
        raise ValueError(
            f"Column {text_column!r} contains {non_string_count} non-string "
            "value(s) (e.g. NaN); clean or drop them before tokenizing."
        )

    tokenized = tokenizer(texts,
                          max_length=max_length,
                          truncation=True,
                          padding="max_length",
                          return_tensors='pt')
    return tokenized

# Function to classify new texts
def classify_texts(df: pd.DataFrame, text_column: str, batch_size: int = 8):
    """Classify every row of ``df`` and attach the predicted code.

    The texts are tokenized in one go with ``tokenize_dataset`` and then fed
    through the module-level ``model`` in slices of ``batch_size`` rows. For
    each row the index of the largest logit is translated through
    ``CAP_NUM_DICT``.

    Args:
        df: Frame holding the texts to classify. It is not modified.
        text_column: Name of the column in ``df`` that contains the texts.
        batch_size: Number of rows per forward pass. Defaults to 8, the value
            that used to be hard-coded here.

    Returns:
        A new frame: ``df`` with its index reset to 0..n-1 plus a ``predicted``
        column holding the code strings from ``CAP_NUM_DICT``. An empty ``df``
        yields an empty frame that still has the ``predicted`` column.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    base_df = df.reset_index(drop=True)

    # Nothing to classify: skip tokenization/inference (torch.cat would fail
    # on an empty list of batches) and return the same schema as usual.
    if len(base_df) == 0:
        return base_df.join(pd.DataFrame({'predicted': pd.Series(dtype=object)}))

    # Tokenize the whole column up front (no progress bar for this step)
    tokenized_dataset = tokenize_dataset(df, text_column)

    # Run the batches on whatever device the model currently lives on
    device = next(model.parameters()).device

    # Perform inference with progress bar
    predictions = []
    with torch.no_grad():
        for start in tqdm(range(0, len(df), batch_size), desc="Classifying"):
            batch = {key: val[start:start + batch_size].to(device)
                     for key, val in tokenized_dataset.items()}
            outputs = model(**batch)
            # Bring logits back to the CPU so they can be converted to numpy
            predictions.append(outputs.logits.cpu())

    # Concatenate predictions
    logits = torch.cat(predictions, dim=0)

    # Index of the largest logit per row, translated to its code string
    predicted_labels = np.argmax(logits.numpy(), axis=1)
    predicted_df = pd.DataFrame({'predicted': pd.Series(predicted_labels).map(CAP_NUM_DICT)})

    # Both frames carry a 0..n-1 index, so the join is positional
    result_df = base_df.join(predicted_df)

    return result_df



In [21]:
# Display the loaded frame, including the derived sentence_num / sentence_id columns.
df

Unnamed: 0,party,text,year,manifesto,sentence_num,sentence_id
0,afd,www.afd.de/europawahlprogramm EUROPAWAHLPROGRA...,2019,afd_2019,1,afd_2019_1
1,afd,Dazu gehört insbesondere ein möglichst unbehin...,2019,afd_2019,2,afd_2019_2
2,afd,Die quasistaatliche „Europäische Union“ halten...,2019,afd_2019,3,afd_2019_3
3,afd,Eine Gruppe benachbarter Staaten kann sehr gut...,2019,afd_2019,4,afd_2019_4
4,afd,"Der Versuch jedoch, aus derzeit 28 oder noch m...",2019,afd_2019,5,afd_2019_5
...,...,...,...,...,...,...
16797,spd,Unser Europa ist sozial und ökologisch nachhal...,2024,spd_2024,971,spd_2024_971
16798,spd,"Unser Europa ist gerecht, auch in seiner Außen...",2024,spd_2024,972,spd_2024_972
16799,spd,Unser Europa ist stark und handlungsfähig.,2024,spd_2024,973,spd_2024_973
16800,spd,Für eine bessere Zukunft und für unsere geme...,2024,spd_2024,974,spd_2024_974


In [22]:
# Run the classifier over the `text` column of every row in df.
# This is the slow step of the notebook: the recorded run took about 32 minutes.
classified_df = classify_texts(df=df, text_column='text')

Classifying: 100%|██████████| 2101/2101 [31:59<00:00,  1.09it/s]


In [23]:
# Number of rows assigned to each predicted code, most frequent first.
classified_df.value_counts("predicted")

predicted
19    3646
1     1377
7     1262
2     1196
15    1176
17     929
9      734
5      711
8      645
12     617
10     566
18     547
20     529
4      523
16     506
13     490
3      465
6      437
14     266
23     171
21       9
Name: count, dtype: int64

In [24]:
# index=False: classify_texts returns a frame with a plain 0..n-1 index, so
# writing it would only add another unnamed column (like the existing
# "Unnamed: 0") the next time this file is read with read_csv.
classified_df.to_csv("euromanifesto_sentences_cap_classified.csv", index=False)