In [26]:
from CU_Dataset_Factory import Hf_Loader, CU_Dataset_Factory

In [27]:
factory = CU_Dataset_Factory('.')
train_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'train')
validation_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'validation')

In [28]:
fe = 'full_page'
model_repo = 'xlm-roberta-base'

In [29]:
train = factory.produce(train_l, 'tr_train.tsv', [fe], 'label', 45, False)
test  = factory.produce(validation_l, 'tr_validation.tsv', [fe], 'label', 45, False)

100%|██████████| 126/126 [00:01<00:00, 109.22it/s]
copy dataset: 100%|██████████| 1/1 [00:00<00:00, 1160.57it/s]
full_page: 100%|██████████| 6251/6251 [00:09<00:00, 688.28it/s, batch=139]
100%|██████████| 6/6 [00:00<00:00,  6.08it/s]
copy dataset: 100%|██████████| 1/1 [00:00<00:00, 2202.89it/s]
full_page: 100%|██████████| 300/300 [00:00<00:00, 716.12it/s, batch=7]


In [30]:
train_data = train[['label', fe]].rename({fe: 'text'}, axis=1)
validation_data = test[['label', fe]].rename({fe: 'text'}, axis=1)

In [31]:
train.head(5)

Unnamed: 0,full_page,wiki_name,qid,label
0,916 is a 2012 Indian Malayalam-language drama...,916 (film),Q32786,1
1,"!!! ( ch(i)k-ch(i)k-ch(i)k), also known as Chk...",!!!,Q371,2
2,¡Soborno! (English: Bribery!) is a 1977 comic ...,¡Soborno!,Q3729947,2
3,+44 (read as Plus Forty-four) was an American ...,+44 (band),Q158611,2
4,"1 Monk Street, Monmouth was built as a Working...","1 Monk Street, Monmouth",Q280375,1


In [32]:
from datasets import Dataset
from datasets import Features
from datasets import Split, Value

In [33]:
train_data = Dataset.from_pandas(train_data, features=Features({
    'label': Value('int32'),
    'text' : Value('string')
}), split=Split.TRAIN)

In [34]:
validation_data = Dataset.from_pandas(validation_data, features=Features({
    'label': Value('int32'),
    'text' : Value('string')
}), split=Split.TEST)

In [35]:
from transformers import AutoTokenizer

In [36]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
class Preprocessor:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer
    def process_samples(self, samples):
        return samples.map(lambda sample: self.tokenizer(sample['text'], truncation=True, padding=True))

In [38]:
p = Preprocessor(tokenizer)

In [39]:
tokenize_train = p.process_samples(train_data)
tokenize_test = p.process_samples(validation_data)

Map: 100%|██████████| 6251/6251 [00:38<00:00, 163.77 examples/s]
Map: 100%|██████████| 300/300 [00:03<00:00, 93.37 examples/s] 


In [40]:
from transformers import DataCollatorWithPadding

In [41]:
collector = DataCollatorWithPadding(tokenizer)

In [42]:
import numpy as np
import evaluate

In [None]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [44]:
cls2label = {0:'Cultural Agnostic', 1:'Cultural Rapresentative', 2:'Cultural Exclusive'}
label2cls = {l:c for c ,l in cls2label.items()}

In [45]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [46]:
model = model = AutoModelForSequenceClassification.from_pretrained(model_repo, num_labels=3, ignore_mismatched_sizes=True)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Parameters
epochs = 5
batch_size = 32
weight_decay = 1e-4
learning_rate = 0.001
out_dir = 'CU_with_roberta'
log = 'Cultural Analysis'

In [None]:
traning_args = TrainingArguments(
    output_dir='CU_with_bert',
    eval_strategy='epoch',
    push_to_hub=False,
    num_train_epochs = 10,
    
)

trainer = Trainer(
    output_dir=out_dir,                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate,                  # learning rate
    report_to="none",
    logging_dir=log         # use it later to get the training curves
)

In [48]:
trainer.model.device

device(type='cuda', index=0)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0625,1.105826,0.253333
2,1.0374,0.826584,0.58
3,0.8479,0.907471,0.523333


In [None]:
# Evaluate the model ...
trainer.evaluate()