# Cultural Classification with Transformer Architecture

In [3]:
# Import for load dataset 
from CU_Dataset_Factory import Hf_Loader, Local_Loader, CU_Dataset_Factory
# Import Datases for work with Transformers by Hugging-Face
from datasets import Dataset
from datasets import Features
from datasets import Split, Value
from time import time

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

Choose appropriate features. Avaiable features are

* *'description'* - sintetic Wikidata description for item
* *'intro'* - Wikipedia page introduction
* *'full_page*' - full Wikipedia plain-text


In [4]:
fe = 'intro'

In [None]:
train_file      = "" #@param {type:"string"}
validation_file = "" #@param {type:"string"}

##############################################
# not modify this row for testing    purpose #
test_file       = "" #@param {type:"string"} #
##############################################

In [6]:
factory = CU_Dataset_Factory(f'./experiment_n{time()}')
train_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'train')
validation_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'validation')

In [7]:
train = factory.produce(train_l, 'tr_train.tsv', [fe], 'label', 45, False)
test  = factory.produce(validation_l, 'tr_validation.tsv', [fe], 'label', 45, False)

100%|██████████| 126/126 [00:01<00:00, 119.27it/s]
copy dataset: 100%|██████████| 1/1 [00:00<00:00, 758.19it/s]
intro: 100%|██████████| 6251/6251 [00:01<00:00, 4405.78it/s, batch=139]


OSError: Cannot save file into a non-existent directory: 'experiment_n1745949424.4012475'

In [None]:
train_data = train[['label', fe]].rename({fe: 'text'}, axis=1)
validation_data = test[['label', fe]].rename({fe: 'text'}, axis=1)

In [None]:
train.head(5)

## Model Selection

### Tested Models
We have tested major pretrained model using differente features, foreach we have reported accuracy value
* google/mobilebert-uncased ()
* microsoft/deberta-v3-xsmall (wiki_desc - 78%)
* microsoft/MiniLM-L12-H384-uncased
* distilbert/distilbert-base-uncased-distilled-squad

In [None]:
# Select best features
fe = 'intro'
model_repo = 'distilbert/distilbert-base-uncased-distilled-squad'

In [None]:
train_data = Dataset.from_pandas(train_data, features=Features({
    'label': Value('int32'),
    'text' : Value('string')
}), split=Split.TRAIN)

In [None]:
validation_data = Dataset.from_pandas(validation_data, features=Features({
    'label': Value('int32'),
    'text' : Value('string')
}), split=Split.TEST)

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=False)

In [None]:
class Preprocessor:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer
    def process_samples(self, samples):
        return samples.map(lambda sample: self.tokenizer(sample['text'], truncation=True, max_length=512))

In [None]:
p = Preprocessor(tokenizer)

In [None]:
tokenize_train = p.process_samples(train_data)
tokenize_test = p.process_samples(validation_data)

In [None]:
tokenize_train[3]

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
collector = DataCollatorWithPadding(tokenizer, max_length=tokenizer.model_max_length)

In [None]:
import numpy as np
import evaluate

In [None]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels, average='micro')["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [None]:
cls2label = {0:'Cultural Agnostic', 1:'Cultural Rapresentative', 2:'Cultural Exclusive'}
label2cls = {l:c for c ,l in cls2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_repo, num_labels=3, ignore_mismatched_sizes=True)

In [None]:
#Parameters
epochs = 10
batch_size = 16
weight_decay = 0
learning_rate = 1e-4
out_dir = 'CU_with_DBert'
log = 'Cultural Analysis'

In [None]:
traning_args = TrainingArguments(
    output_dir=out_dir,
    eval_strategy='epoch',
    push_to_hub=False,
    num_train_epochs = epochs,
    per_device_train_batch_size=batch_size,
    warmup_steps=0,
    weight_decay=weight_decay,
    learning_rate=learning_rate,             
    report_to="none",
    logging_dir=log 
    
)

trainer = Trainer(model,traning_args, collector, tokenize_train, tokenize_test,tokenizer,compute_metrics=compute_metrics)

In [None]:
trainer.model.device

In [None]:
trainer.train()

In [None]:
from pathlib import PosixPath
from transformers import AutoModel, AutoTokenizer
from CU_Dataset_Factory import CU_Dataset_Factory
from Loader import Loader
from datasets import Dataset

import pandas as pd

class EvalTransform:
    def __init__(self, hf_id:str) -> None:
        
        self.df_path = PosixPath('.validation')
        self.model = AutoModel.from_pretrained(hf_id)
        self.pre_processor = CU_Dataset_Factory(self.df_path)
        self.tokenizer = AutoTokenizer.from_pretrained(hf_id)

        model.eval()



    def __dataProcess(self, loader:Loader, features):
        raw = self.pre_processor.produce(
            loader, 
            self.df_path, 
            enable_feature=features, 
            targe_feature='label', 
            batch_s=45)[['label', features]].rename({fe: 'text'}, axis=1)
        
        X = Dataset.from_pandas(raw, features=features({
            'label': Value('int32'),
            'text' : Value('string')
        }), split=Split.VALIDATION)

        y_pred = model(X)

        print(y_pred)

        r = pd.Dataframe()
        return r
    
    def eval(self,loader:Loader):
        self.__dataProcess(loader, ['intro'])



                


In [None]:
loader = Local_Loader('train.tsv')