In [1]:
# install datasets to make the dataset suitable for transformers library
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
# Prepare datasets for both part 1 and part 2

import pandas as pd
from sklearn.model_selection import train_test_split

# Tab-separated training files for the two tasks.
orientation_file = "orientation-tr-train.tsv"
power_file = "power-tr-train.tsv"

# Read both files with the same options (orientation first, then power).
orientation_df, power_df = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in (orientation_file, power_file)
)

# perform train test split
def stratified_split(data, label_column, test_size=0.1, random_state=42):
    """Split ``data`` into a train part and a test part, stratified on one column.

    Args:
        data: Table to split; it is indexed with ``data[label_column]`` to obtain
            the stratification labels.
        label_column: Name of the column whose values are passed as ``stratify``.
        test_size: Forwarded to ``train_test_split`` (default 0.1).
        random_state: Forwarded to ``train_test_split`` so the split is
            repeatable (default 42).

    Returns:
        A ``(train_part, test_part)`` tuple.
    """
    split_options = {
        "test_size": test_size,
        "stratify": data[label_column],
        "random_state": random_state,
    }
    train_part, test_part = train_test_split(data, **split_options)
    return train_part, test_part

# Split each task's data and write every split to its own TSV file.
orientation_train, orientation_test = stratified_split(orientation_df, label_column="label")
for split_frame, split_path in (
    (orientation_train, "orientation_train_split.tsv"),
    (orientation_test, "orientation_test_split.tsv"),
):
    split_frame.to_csv(split_path, sep="\t", index=False)

power_train, power_test = stratified_split(power_df, label_column="label")
for split_frame, split_path in (
    (power_train, "power_train_split.tsv"),
    (power_test, "power_test_split.tsv"),
):
    split_frame.to_csv(split_path, sep="\t", index=False)

# Report the size of every split.
for split_frame in (orientation_train, orientation_test, power_train, power_test):
    print(split_frame.shape)

# Preview the first rows of each unsplit dataset.
for full_frame in (orientation_df, power_df):
    print(full_frame.head())
    print("\n")

# Relative label frequencies per split, to confirm the stratification held.
for heading, split_frame in (
    ("Orientation Training Set Class Distribution:", orientation_train),
    ("\nOrientation Test Set Class Distribution:", orientation_test),
    ("\nPower Training Set Class Distribution:", power_train),
    ("\nPower Test Set Class Distribution:", power_test),
):
    print(heading)
    print(split_frame["label"].value_counts(normalize=True))


(14524, 6)
(1614, 6)
(15645, 6)
(1739, 6)
        id                           speaker sex  \
0  tr00000  ca2031caa4032c51980160359953d507   M   
1  tr00001  4cee0addb3c69f6866869b180f90d45f   M   
2  tr00002  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr00003  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr00004  be82a4ade406ec6774a0a2e38f6957e3   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  24’üncü Yasama Dönemimizin tüm milletvekilleri...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      1  
1  Mr. President, members of lawmakers, as I spea...      1  
2  Mr. President, I'm here to share with you the ...      1  
3  Mr. President, under the principles determined...      1 

We can see that the Orientation dataset is mildly imbalanced, while the Power dataset is well balanced, at almost 50-50. There is therefore little need to apply any class-imbalance handling. The skew in the Orientation labels is also expected: since our national council has more right-wing parties than left-wing ones, it makes sense to have more 1's than 0's.

# Part 1: Ideology Classification

In [3]:
# Load the XLM-RoBERTa model from HuggingFace
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import wandb

# Checkpoint shared by the tokenizer and the classifier.
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# Two output classes: the task label is binary.
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The split frames keep the row index of the rows they were sampled from.
# preserve_index=False stops that index from being stored in the Dataset as an
# extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(orientation_train, preserve_index=False)
test_dataset = Dataset.from_pandas(orientation_test, preserve_index=False)

def tokenize(samples):
    """Tokenize the English translations (``text_en``) of a batch of samples.

    Args:
        samples: Mapping with a ``"text_en"`` entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation and padding enabled and ``max_length=512``.
    """
    encode_options = {"truncation": True, "padding": True, "max_length": 512}
    return tokenizer(samples["text_en"], **encode_options)

# Tokenize both splits in batches (train first, then test).
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)


def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); presumably the evaluation output that the
            Trainer passes in -- confirm against the caller.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average="binary"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    metric_values = (accuracy_score(y_true, y_pred), *prf_scores[:3])
    return dict(zip(("accuracy", "precision", "recall", "f1"), metric_values))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14524 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

In [4]:
# Start the Weights & Biases run that the Trainer reports to (report_to="wandb").
wandb.init(project="ParliamentaryDebates", name="task1-ideology")

# define training arguments
training_args = TrainingArguments(
    output_dir="./results_task1",
    # Evaluate and checkpoint once per epoch, so every evaluation has a matching
    # checkpoint that load_best_model_at_end can restore.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs_task1",
    logging_steps=10,
    # No metric_for_best_model is given, so the best checkpoint is chosen by
    # evaluation loss (the transformers default).
    load_best_model_at_end=True,
    report_to="wandb",
    run_name="task1-ideology"
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# train the model
trainer.train()

# Evaluate on the held-out split and print the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save model and tokenizer to the same directory.
model.save_pretrained("./xlm-roberta-ideology")
tokenizer.save_pretrained("./xlm-roberta-ideology")

# Close the W&B run opened at the top of this cell.
wandb.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4004,0.368295,0.845725,0.828571,0.926518,0.874811
2,0.2195,0.314193,0.876084,0.902067,0.882854,0.892357
3,0.1822,0.335145,0.877943,0.905908,0.881789,0.893686


Evaluation Results: {'eval_loss': 0.3141925632953644, 'eval_accuracy': 0.8760842627013631, 'eval_precision': 0.9020674646354734, 'eval_recall': 0.8828541001064962, 'eval_f1': 0.8923573735199138, 'eval_runtime': 11.0304, 'eval_samples_per_second': 146.323, 'eval_steps_per_second': 9.157, 'epoch': 3.0}


VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁███
eval/f1,▁███
eval/loss,█▁▄▁
eval/precision,▁███
eval/recall,█▁▁▁
eval/runtime,█▁▅▇
eval/samples_per_second,▁█▄▂
eval/steps_per_second,▁█▄▂
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇██

0,1
eval/accuracy,0.87608
eval/f1,0.89236
eval/loss,0.31419
eval/precision,0.90207
eval/recall,0.88285
eval/runtime,11.0304
eval/samples_per_second,146.323
eval/steps_per_second,9.157
total_flos,1.146427490414592e+16
train/epoch,3.0


We see that the model reaches an accuracy of around 0.88 and an F1 score of around 0.89 on the held-out split, which indicates good overall performance on this dataset.

# Part 2: Political Power Orientation

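Most of the code is reused from the first part. The main difference is that the model is fine-tuned on the original Turkish `text` column instead of the English translation in `text_en`.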

In [5]:
# Load the XLM-RoBERTa model from HuggingFace
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import wandb

# Reload the tokenizer and a fresh pretrained classifier from the checkpoint, so that
# `model` no longer refers to the model object used earlier in the notebook.
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
# Two output classes: the task label is binary.
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The split frames keep the row index of the rows they were sampled from.
# preserve_index=False stops that index from being stored in the Dataset as an
# extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(power_train, preserve_index=False)
test_dataset = Dataset.from_pandas(power_test, preserve_index=False)

def tokenize(samples):
    """Tokenize the original-language texts (``text``) of a batch of samples.

    Args:
        samples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation and padding enabled and ``max_length=512``.
    """
    encode_options = {"truncation": True, "padding": True, "max_length": 512}
    return tokenizer(samples["text"], **encode_options)

# Tokenize both splits in batches (train first, then test).
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); presumably the evaluation output that the
            Trainer passes in -- confirm against the caller.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average="binary"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    metric_values = (accuracy_score(y_true, y_pred), *prf_scores[:3])
    return dict(zip(("accuracy", "precision", "recall", "f1"), metric_values))

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15645 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

In [6]:
# Start the Weights & Biases run that the Trainer reports to (report_to="wandb").
wandb.init(project="ParliamentaryDebates", name="task2-power-orientation")

# define training arguments
training_args = TrainingArguments(
    output_dir="./results_task2",
    # Evaluate and checkpoint once per epoch, so every evaluation has a matching
    # checkpoint that load_best_model_at_end can restore.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs_task2",
    logging_steps=10,
    # No metric_for_best_model is given, so the best checkpoint is chosen by
    # evaluation loss (the transformers default).
    load_best_model_at_end=True,
    report_to="wandb",
    run_name="task2-power-orientation"
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# train the model
trainer.train()

# Evaluate on the held-out split and print the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save model and tokenizer to the same directory.
model.save_pretrained("./xlm-roberta-power")
tokenizer.save_pretrained("./xlm-roberta-power")

# Close the W&B run opened at the top of this cell.
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mmerichaliloglu[0m ([33mmerichaliloglu-metu-middle-east-technical-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3264,0.28372,0.882116,0.923739,0.840045,0.879906
2,0.1764,0.359496,0.880391,0.93199,0.82774,0.876777
3,0.2448,0.357006,0.892467,0.893215,0.89821,0.895706


Evaluation Results: {'eval_loss': 0.2837204933166504, 'eval_accuracy': 0.8821161587119034, 'eval_precision': 0.923739237392374, 'eval_recall': 0.8400447427293065, 'eval_f1': 0.8799062683069713, 'eval_runtime': 11.8358, 'eval_samples_per_second': 146.928, 'eval_steps_per_second': 9.209, 'epoch': 3.0}


VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▂▁█▂
eval/f1,▂▁█▂
eval/loss,▁██▁
eval/precision,▇█▁▇
eval/recall,▂▁█▂
eval/runtime,▁▂█▄
eval/samples_per_second,█▇▁▅
eval/steps_per_second,█▇▁▅
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇███
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██

0,1
eval/accuracy,0.88212
eval/f1,0.87991
eval/loss,0.28372
eval/precision,0.92374
eval/recall,0.84004
eval/runtime,11.8358
eval/samples_per_second,146.928
eval/steps_per_second,9.209
total_flos,1.23491173833216e+16
train/epoch,3.0


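As we can see, we got approximately 0.88 accuracy and 0.88 F1 on this task, similar to the first task. Note that these figures come from the checkpoint with the lowest validation loss (epoch 1); the epoch 3 checkpoint scored slightly higher on accuracy and F1 (about 0.89 and 0.90) but had a higher validation loss.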