# Cultural Classification with Transformer Architecture

# Working with Colab

In [None]:
# Mount Drive folder
# Mounts Google Drive at /content/drive/; the `google.colab` module is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')


In [None]:
# Copy the project folder from the mounted Drive into the current directory and
# move into it (presumably so that the local modules imported below resolve).
%cp -r drive/MyDrive/AIRO\ \S2/MNLP/MNLP_Homework1/mirror/ .
%cd mirror

In [None]:
# Imports for loading the dataset
from CU_Dataset_Factory import Hf_Loader, Local_Loader, CU_Dataset_Factory
# Import Hugging Face `datasets` types for use with Transformers
from datasets import Dataset
from datasets import Features
from datasets import Split, Value
from transformers import EarlyStoppingCallback
from time import time

## Global Notebook Variables

In [None]:
# When True, the cells guarded by `if is_train:` load the train/validation data
# and fine-tune the model; when False those cells do nothing.
is_train = True

## Load Dataset

Choose the feature to use as input text. Available features are:

* *'description'* - synthetic Wikidata description of the item
* *'intro'* - Wikipedia page introduction
* *'full_page'* - full Wikipedia plain text


In [None]:
# Name of the dataset column used as the model's input text (renamed to 'text' later).
fe = 'intro'

In [None]:
# Dataset file names, exposed as Colab form fields through the `#@param` markers.
train_file      = "train.csv" #@param {type:"string"}
validation_file = "validation.csv" #@param {type:"string"}

#################################################################
# Do not modify this row: it is used for testing purposes       #
test_file       = "tr_test.tsv"         #@param {type:"string"} #
#################################################################

In [None]:
# Dataset factory, created with a time-stamped string `./experiment_n<timestamp>`.
factory = CU_Dataset_Factory(f'./experiment_n{time()}')
if is_train:
    # One Hugging Face loader per split of the same dataset repository.
    train_l, validation_l = (
        Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", split_name)
        for split_name in ('train', 'validation')
    )

In [None]:
if is_train:
    # Both splits are produced with the same positional arguments; only the
    # loader and the file name differ.
    train, validation = (
        factory.produce(split_loader, tsv_name, [fe], 'label', 10, False, False)
        for split_loader, tsv_name in (
            (train_l, 'tr_train.tsv'),
            (validation_l, 'tr_validation.tsv'),
        )
    )

In [None]:
if is_train:
    def _as_text_dataset(frame, split):
        """Convert a pandas frame into a Dataset with 'label' and 'text' columns.

        Only the label and the selected feature column are kept; the feature
        column is exposed under the name 'text'.
        """
        columns = frame[['label', fe]].rename(columns={fe: 'text'})
        schema = Features({'label': Value('int32'), 'text': Value('string')})
        return Dataset.from_pandas(columns, features=schema, split=split)

    # Prepare Dataset for the Model
    train_data = _as_text_dataset(train, Split.TRAIN)
    validation_data = _as_text_dataset(validation, Split.VALIDATION)

## Model Selection

In [None]:
# imports for Transformers
from transformers import AutoTokenizer # Datasets
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification # Model
from transformers import TrainingArguments, Trainer

import numpy as np # eval
import evaluate

### Tested Models
We tested several pretrained models using different features; for each one we report the accuracy obtained.
* google/mobilebert-uncased (wiki_desc  - 72%)
* microsoft/deberta-v3-xsmall (wiki_desc - 78%)
* distilbert/distilbert-base-uncased-finetuned-sst-2-english (wiki_desc - 75%)
* microsoft/MiniLM-L12-H384-uncased
* distilbert/distilbert-base-uncased-distilled-squad

In [None]:
# Hugging Face Hub checkpoint used as the starting point when training.
model_repo = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'
# The classification head may be customized after the model is loaded.

In [None]:
# Choose the checkpoint: the configurable one when training, otherwise a fixed
# DistilBERT checkpoint. Both are loaded with a 3-label classification head.
# NOTE(review): neither branch loads weights fine-tuned by this notebook, so with
# is_train=False the prediction cells appear to run on an untrained head — confirm.
checkpoint_name = (
    model_repo if is_train
    else 'distilbert/distilbert-base-uncased-distilled-squad'
)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=3,
    ignore_mismatched_sizes=True,
)

# Last expression of the cell: displays the current classification head.
model.classifier

In [None]:
from torch import nn

class CU_Classifier(nn.Module):
    """Feed-forward classification head.

    Three stages of Linear -> LayerNorm -> GELU -> Dropout map
    ``dim_embedding -> hidden_layers -> hidden_layers -> dim_embedding``;
    a final linear layer then produces ``num_classes`` scores.

    Args:
        config: sizes of the head, read from the keys ``'dim_embedding'``,
            ``'hidden_layers'`` (the width of the hidden stages) and
            ``'num_classes'``.
    """

    def __init__(self, config:dict[str, int]):
        super().__init__()

        embed_dim = config['dim_embedding']
        hidden_dim = config['hidden_layers']

        # Build the stages in order so the sub-module indices inside `l1`
        # (and therefore the state-dict keys) stay 0..11.
        stages = []
        for fan_in, fan_out in (
            (embed_dim, hidden_dim),
            (hidden_dim, hidden_dim),
            (hidden_dim, embed_dim),
        ):
            stages += [
                nn.Linear(fan_in, fan_out, bias=True),
                nn.LayerNorm(fan_out),
                nn.GELU(),
                nn.Dropout(),  # default dropout probability
            ]
        self.l1 = nn.Sequential(*stages)

        self.out = nn.Linear(in_features=embed_dim, out_features=config['num_classes'])

    def forward(self, X):
        """Return the class scores computed from the input features ``X``."""
        return self.out(self.l1(X))


# Sizes passed to CU_Classifier.
config = {
    'dim_embedding' : 768,  # input width of the head (and output width of its hidden stack)
    'hidden_layers' : 900,  # width of the hidden stages (a size, not a layer count)
    'num_classes'   : 3     # number of output scores
}


In [None]:
# Replace the model's `classifier` attribute with the custom head built from `config`.
model.classifier = CU_Classifier(config) 

In [None]:
class Preprocessor:
    """Tokenize the ``'text'`` column of a dataset with a given tokenizer.

    Args:
        tokenizer: callable applied to the texts; it is invoked with
            ``truncation=True`` and ``max_length``.
        max_length: maximum number of tokens kept per text. Defaults to 512,
            the value that used to be hard-coded.
    """

    def __init__(self, tokenizer, max_length: int = 512) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def process_samples(self, samples):
        """Return the result of mapping the tokenizer over ``samples``.

        ``samples`` must expose a ``map(function, batched=...)`` method
        (assumed to be a Hugging Face ``Dataset``). The mapping runs in batched
        mode so the tokenizer receives a list of texts per call instead of
        being invoked once per example.
        """
        def _tokenize(batch):
            return self.tokenizer(
                batch['text'], truncation=True, max_length=self.max_length
            )

        return samples.map(_tokenize, batched=True)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Load an `evaluate` metric once and reuse it on later calls."""
    return evaluate.load(name)


def compute_metrics(eval_pred, f1_average='micro'):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.
        f1_average: averaging mode forwarded to the F1 metric. The default
            ``'micro'`` keeps the previous behaviour; note that for
            single-label multi-class predictions micro-F1 equals accuracy,
            so ``'macro'`` is the more informative choice.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The metrics are cached, so they are not re-loaded at every evaluation.
    accuracy = _load_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _load_metric("f1").compute(
        predictions=predictions, references=labels, average=f1_average
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}

### Tokenization

Tokenize the text ...

In [None]:
# Load the tokenizer published with the checkpoint named by `model_repo`.
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [None]:
# Preprocessing helper bound to the tokenizer loaded above.
p = Preprocessor(tokenizer)

In [None]:
if is_train:
    # Tokenize the training and validation splits with the same preprocessor.
    tokenize_train, tokenize_validation = (
        p.process_samples(split) for split in (train_data, validation_data)
    )

In [None]:
# Padding data collator built from the tokenizer; handed to the Trainer below.
collector = DataCollatorWithPadding(tokenizer)

## Train and Evaluate the Network

### Train (enabled if `is_train` is True)

In [None]:
# Parameters (consumed by the TrainingArguments cell below)
epochs = 5                 # number of training epochs
batch_size = 16            # per-device training batch size
weight_decay = 1e-4
learning_rate = 2e-5
out_dir = 'CU_with_DBert'  # output directory of the training run
log = 'Cultural Analysis'  # logging directory

In [None]:
# Class index -> human-readable label, followed by the inverse lookup.
cls2label = dict(enumerate((
    'Cultural Agnostic',
    'Cultural Rapresentative',
    'Cultural Exclusive',
)))
label2cls = dict(zip(cls2label.values(), cls2label.keys()))

In [None]:
# Training configuration, gathered in a dict and then unpacked into
# TrainingArguments: evaluation runs once per epoch, nothing is pushed to the
# Hub and external reporting is disabled.
training_options = {
    'output_dir': out_dir,
    'eval_strategy': 'epoch',
    'push_to_hub': False,
    'num_train_epochs': epochs,
    'per_device_train_batch_size': batch_size,
    'warmup_steps': 500,
    'weight_decay': weight_decay,
    'learning_rate': learning_rate,
    'report_to': "none",
    'logging_dir': log,
}
traning_args = TrainingArguments(**training_options)

In [None]:
if is_train:
    # The first six arguments are passed positionally: model, training
    # arguments, collator, tokenized train split, tokenized validation split
    # and tokenizer. NOTE(review): their meaning depends on the parameter order
    # of the installed `Trainer` — confirm before reordering or converting to keywords.
    trainer = Trainer(model,traning_args, collector, tokenize_train, tokenize_validation,tokenizer,compute_metrics=compute_metrics)
    print(f'Model running on {trainer.model.device}')
    trainer.train()
    # Evaluate once more after training and show the resulting metrics.
    report = trainer.evaluate()
    print(report)

### Test Grade

In [None]:
import torch 
from torch import tensor
from torch.nn import Module
from pandas import Series
from datasets import Dataset
def predict_culture_pd(ds:"Dataset", model:Module, tokenizer, device, max_length=128, batch_size=32) -> Series:
    """Predict a class index for every row of ``ds``.

    The texts are tokenized and sent through the model in mini-batches, so
    memory use is bounded by ``batch_size`` rather than by the dataset size,
    and datasets with zero or one row are handled correctly.

    Args:
        ds: dataset-like object whose ``'text'`` column holds the input strings.
        model: classifier called with ``input_ids`` and ``attention_mask``
            whose output exposes a ``logits`` attribute.
        tokenizer: callable that turns a list of strings into a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: device on which the model and the inputs are placed.
        max_length: texts are truncated to this many tokens.
        batch_size: number of texts per forward pass.

    Returns:
        pandas Series with one predicted class index per input row, in order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model = model.to(device)
    model.eval()

    texts = list(ds['text'])
    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Pad only up to the longest text of the batch; padded positions
            # are masked out by the attention mask.
            encoding = tokenizer(
                texts[start:start + batch_size],
                return_tensors='pt',
                max_length=max_length,
                padding=True,
                truncation=True,
            )
            outputs = model(
                input_ids=encoding['input_ids'].to(device),
                attention_mask=encoding['attention_mask'].to(device),
            )
            # Move the predictions to the CPU right away to free device memory.
            batch_preds.append(outputs.logits.argmax(dim=1).cpu())

    if not batch_preds:
        return Series([], dtype='int64')
    return Series(torch.cat(batch_preds).numpy())
    




In [None]:
# Load the test table from the local file.
loader = Local_Loader(test_file)
test = loader.get()
#test = factory.produce(loader, out_file=None, enable_feature=[fe], targe_feature=None, batch_s=45)

# NOTE(review): despite their names, `train_data` and `train_ds` hold the *test*
# split from here on; the names are kept because the prediction cell uses `train_ds`.
test_schema = Features({'text': Value('string')})
train_data = test[[fe]].rename(columns={fe: 'text'})
train_ds = Dataset.from_pandas(train_data, features=test_schema, split=Split.TEST)

# NOTE(review): this replaces the tokenizer loaded from `model_repo` with one taken
# from a different checkpoint — confirm that both use the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased-distilled-squad')



In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
y_pred = predict_culture_pd(train_ds, model, tokenizer, device)

# Plain column assignment instead of DataFrame.insert: the cell can be re-run
# without failing on an already existing 'label' column, and the predictions are
# attached by position (row order) rather than aligned on the frame's index.
test['label'] = y_pred.to_numpy()


In [None]:
# Preview the first ten rows: item, name and predicted label.
print(test[['item', 'name','label']].head(10))

In [None]:
#####################################
# Save file for evaluation purposes #
#####################################

# Write the item, name and label columns as a tab-separated file, without the index.
test[['item', 'name','label']].to_csv('Many_Naps_Little_Progress_modello2.tsv', sep='\t', index=False)