In [1]:
from CU_Dataset_Factory import Hf_Loader, CU_Dataset_Factory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier shared by both loaders below, so the two splits cannot drift apart.
dataset_id = "sapienzanlp/nlp2025_hw1_cultural_dataset"

# NOTE(review): '.' is presumably the working/output directory of the factory — confirm
# against CU_Dataset_Factory.
factory = CU_Dataset_Factory('.')

# One loader per split; built in the same order as before (train, then validation).
train_l, validation_l = (Hf_Loader(dataset_id, split_name) for split_name in ('train', 'validation'))

In [3]:
# Materialise both splits through the factory. Both calls pass the same trailing
# positional arguments (['description'], 'label', 10, False).
# NOTE(review): the meaning of `10` and `False` is defined by CU_Dataset_Factory.produce,
# which is not visible here — the progress output suggests 10 is a batch count; confirm.
train = factory.produce(
    train_l,
    'tr_train.tsv',
    ['description'],
    'label',
    10,
    False,
)
# `test` holds the *validation* split; the name is kept because later cells read it.
test = factory.produce(
    validation_l,
    'tr_validation.tsv',
    ['description'],
    'label',
    10,
    False,
)

100%|██████████| 126/126 [00:01<00:00, 125.93it/s]
copy dataset: 100%|██████████| 1/1 [00:00<00:00, 1055.97it/s]
batch compute: 100%|██████████| 6251/6251 [00:00<00:00, 39934.25it/s, batch=626]
100%|██████████| 6/6 [00:01<00:00,  5.25it/s]
copy dataset: 100%|██████████| 1/1 [00:00<00:00, 1492.10it/s]
batch compute: 100%|██████████| 300/300 [00:00<00:00, 14531.94it/s, batch=30]


In [4]:
# Keep only the label and the description, exposing the description as 'text'
# (the column name the tokenization step reads).
kept_columns = ['label', 'description']
column_renames = {'description': 'text'}

train_data = train[kept_columns].rename(columns=column_renames)
validation_data = test[kept_columns].rename(columns=column_renames)

In [5]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train.head()

Unnamed: 0,description,qid,wiki_name,label
0,2012 film by M. Mohanan,Q32786,916 (film),1
1,American dance-punk band from California,Q371,!!!,2
2,Mort & Phil comic,Q3729947,¡Soborno!,2
3,American band,Q158611,+44 (band),2
4,"building in Monmouth, Wales",Q280375,"1 Monk Street, Monmouth",1


In [6]:
from datasets import Dataset
from datasets import Features
from datasets import Split, Value

In [7]:
# Build the training Dataset directly from the `train` frame instead of from
# `train_data` itself: the previous form rebound `train_data` from a DataFrame to a
# Dataset, so running this cell a second time passed a Dataset to from_pandas and failed.
# Deriving from `train` makes the cell idempotent and yields the same Dataset.
train_data = Dataset.from_pandas(
    train[['label', 'description']].rename(columns={'description': 'text'}),
    features=Features({
        'label': Value('int32'),
        'text': Value('string'),
    }),
    split=Split.TRAIN,
)

In [8]:
# Build the evaluation Dataset directly from the `test` frame (the validation split)
# instead of from `validation_data` itself: the previous form rebound `validation_data`
# from a DataFrame to a Dataset, so re-running this cell failed. Deriving from `test`
# makes the cell idempotent and yields the same Dataset.
# NOTE(review): the data comes from the 'validation' split but is tagged Split.TEST;
# the tag is kept unchanged here — confirm whether Split.VALIDATION was intended.
validation_data = Dataset.from_pandas(
    test[['label', 'description']].rename(columns={'description': 'text'}),
    features=Features({
        'label': Value('int32'),
        'text': Value('string'),
    }),
    split=Split.TEST,
)

In [9]:
from transformers import AutoTokenizer

In [10]:
# Checkpoint whose tokenizer is used for all text in this notebook.
tokenizer_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [11]:
class Preprocessor:
    """Tokenizes the ``'text'`` field of a dataset with a fixed tokenizer."""

    def __init__(self, tokenizer) -> None:
        """Remember the tokenizer to apply.

        Args:
            tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``
                that returns a mapping of new columns.
        """
        self.tokenizer = tokenizer

    def process_samples(self, samples):
        """Return ``samples`` with the tokenizer output added by ``samples.map``.

        Args:
            samples: Object exposing ``map(function, batched=...)`` whose rows
                carry a ``'text'`` field (e.g. a ``datasets.Dataset``).

        Returns:
            Whatever ``samples.map`` returns; ``samples`` itself is not modified
            by this method.
        """
        def _tokenize(batch):
            # With batched=True, batch['text'] is a list of strings, so the
            # tokenizer handles many rows per call instead of one call per row.
            # Truncation without padding gives the same per-row output either way.
            return self.tokenizer(batch['text'], truncation=True)

        return samples.map(_tokenize, batched=True)

In [12]:
# Preprocessor bound to the notebook-wide tokenizer; `p` is read by the tokenization cell.
p = Preprocessor(tokenizer=tokenizer)

In [13]:
# Tokenize the training split first, then the validation split (same order as before).
tokenize_train, tokenize_test = map(p.process_samples, (train_data, validation_data))

Map: 100%|██████████| 6251/6251 [00:00<00:00, 16071.48 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 16123.67 examples/s]


In [14]:
from transformers import DataCollatorWithPadding

In [15]:
# Collator handed to the Trainer as `data_collator`; built on the same tokenizer
# that produced the tokenized splits.
collector = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import numpy as np
import evaluate

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, references)``. ``predictions`` is reduced
            with ``np.argmax(..., axis=1)`` to one class index per row and compared
            against ``references``.

    Returns:
        The result of the ``'accuracy'`` metric's ``compute`` call.
    """
    # Load the metric once and keep it on the function object: this function is
    # invoked at every evaluation, and calling evaluate.load() each time repeats
    # the metric lookup/download work for no benefit.
    metric = getattr(compute_metrics, "_accuracy_metric", None)
    if metric is None:
        metric = evaluate.load('accuracy')
        compute_metrics._accuracy_metric = metric

    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [18]:
# Class index -> human-readable label, in index order.
# NOTE(review): the spelling 'Rapresentative' is kept verbatim because it is a runtime
# string; confirm the index order matches the label encoding of the dataset.
cls2label = dict(enumerate((
    'Cultural Agnostic',
    'Cultural Rapresentative',
    'Cultural Exclusive',
)))
# Inverse lookup: label name -> class index.
label2cls = {name: index for index, name in cls2label.items()}

In [19]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [20]:
# Load the checkpoint with a sequence-classification head sized to the label set.
# Changes from the previous form:
#   * the redundant chained assignment (`model = model = ...`) is removed;
#   * the cls2label / label2cls maps defined earlier are passed as id2label /
#     label2id, so the model config carries readable class names instead of the
#     generic LABEL_0..LABEL_2 placeholders.
# NOTE(review): this assumes cls2label's index order matches the dataset's label
# encoding — confirm before relying on the names at inference time.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(cls2label),
    id2label=cls2label,
    label2id=label2cls,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training configuration: outputs go under 'CU_with_bert', evaluation runs once per
# epoch, and nothing is pushed to the Hub. (The name `traning_args` is kept as-is.)
traning_args = TrainingArguments(output_dir='CU_with_bert', eval_strategy='epoch', push_to_hub=False)

# Everything the Trainer needs, gathered in one place before construction.
trainer_options = dict(
    model=model,
    args=traning_args,
    data_collator=collector,
    train_dataset=tokenize_train,
    eval_dataset=tokenize_test,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

In [22]:
# Run fine-tuning; the returned TrainOutput is shown as the cell's result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6782,0.699003,0.703333
2,0.4351,0.800173,0.703333
3,0.2914,0.968295,0.74


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 5.05MB/s]


TrainOutput(global_step=2346, training_loss=0.45521617281264154, metrics={'train_runtime': 405.364, 'train_samples_per_second': 46.262, 'train_steps_per_second': 5.787, 'total_flos': 97452983823318.0, 'train_loss': 0.45521617281264154, 'epoch': 3.0})