# Cultural Classification with Transformer Architecture

In [1]:
# Imports for loading the dataset
from CU_Dataset_Factory import Hf_Loader, Local_Loader, CU_Dataset_Factory
# Import the Hugging Face `datasets` types needed to work with Transformers
from datasets import Dataset
from datasets import Features
from datasets import Split, Value
from time import time

  from .autonotebook import tqdm as notebook_tqdm


## Global Notebook Variables

In [2]:
# Master switch: every cell guarded by `if is_train:` (dataset download,
# preprocessing, tokenization, fine-tuning and evaluation) runs only when True.
is_train = False

## Load Dataset

Choose the appropriate feature. The available features are:

* *'description'* - concise Wikidata description of the item
* *'intro'* - introduction of the Wikipedia page
* *'full_page'* - full Wikipedia plain text


In [3]:
# Name of the dataset column used as the model's input text
# (it is renamed to 'text' before tokenization).
fe = 'description'

In [4]:
# File names for the dataset splits. In the visible cells only `test_file` is
# read (by Local_Loader); the `#@param` markers are kept verbatim.
train_file      = "train.csv" #@param {type:"string"}
validation_file = "validation.csv" #@param {type:"string"}

######################################################
# do not modify this row: it is used for testing     #
test_file       = "tr_test.tsv" #@param {type:"string"} #
######################################################

In [5]:
# Build the dataset factory; the argument embeds the current timestamp, so each
# run passes a distinct './experiment_n<time>' string.
# NOTE(review): presumably this is the experiment output directory - confirm
# against CU_Dataset_Factory.
factory = CU_Dataset_Factory(f'./experiment_n{time()}')
if is_train:
    # Loaders for the 'train' and 'validation' splits; only needed when training.
    train_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'train')
    validation_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'validation')

In [6]:
if is_train:
    # Produce the train / validation frames through the factory, passing the
    # selected feature `[fe]` and 'label' (presumably the target column).
    # NOTE(review): the positional arguments 45, True, False are opaque from
    # here - 45 is presumably a batch size; confirm against
    # CU_Dataset_Factory.produce.
    train = factory.produce(train_l, 'tr_train.tsv', [fe], 'label', 45, True, False)
    validation  = factory.produce(validation_l, 'tr_validation.tsv', [fe], 'label', 45,True, False)

In [7]:
if is_train:
    # Prepare the datasets for the model: keep only the target and the chosen
    # feature, expose the feature under the generic column name 'text', and
    # wrap each frame in a typed Hugging Face Dataset.
    train_data = Dataset.from_pandas(
        train[['label', fe]].rename(columns={fe: 'text'}),
        features=Features({'label': Value('int32'), 'text': Value('string')}),
        split=Split.TRAIN,
    )

    validation_data = Dataset.from_pandas(
        validation[['label', fe]].rename(columns={fe: 'text'}),
        features=Features({'label': Value('int32'), 'text': Value('string')}),
        split=Split.VALIDATION,
    )

## Model Selection

In [8]:
# imports for Transformers
from transformers import AutoTokenizer # Datasets
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification # Model
from transformers import TrainingArguments, Trainer

import numpy as np # eval
import evaluate

### Tested Models
We tested several major pretrained models using different features; for each one we report the accuracy obtained:
* google/mobilebert-uncased ()
* microsoft/deberta-v3-xsmall (wiki_desc - 78%)
* microsoft/MiniLM-L12-H384-uncased
* distilbert/distilbert-base-uncased-distilled-squad

In [9]:
# Checkpoint id used to load both the tokenizer and the classification model.
model_repo = 'distilbert/distilbert-base-uncased-distilled-squad'
# the classification head may be customized after loading

In [32]:
# Load the checkpoint selected in `model_repo` with a fresh 3-class
# classification head. The training and inference paths loaded the very same
# checkpoint, so a single call replaces the duplicated if/else and its
# hard-coded copy of the repo id (which could silently diverge from the
# tokenizer's checkpoint).
# NOTE(review): when `is_train` is False the classification head is randomly
# initialised (see the "newly initialized" warning), so predictions are not
# meaningful until `model_repo` points at a fine-tuned checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_repo,
    num_labels=3,
    ignore_mismatched_sizes=True,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased-distilled-squad and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
class Preprocessor:
    """Applies a tokenizer to the 'text' field of every sample of a dataset."""

    def __init__(self, tokenizer) -> None:
        # Callable invoked as tokenizer(text, truncation=True, max_length=512).
        self.tokenizer = tokenizer

    def process_samples(self, samples):
        """Tokenize all samples.

        Args:
            samples: object exposing ``map(fn)``; each sample passed to ``fn``
                must be indexable by the key 'text'.

        Returns:
            Whatever ``samples.map`` returns once the tokenizer has been
            applied to each sample's text (truncated to 512 tokens).
        """
        def _encode(sample):
            return self.tokenizer(sample['text'], truncation=True, max_length=512)

        return samples.map(_encode)

In [12]:
def compute_metrics(eval_pred):
    """Compute accuracy and micro-averaged F1 for an evaluation step.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    accuracy_result = accuracy_metric.compute(
        predictions=predicted_classes, references=labels
    )
    f1_result = f1_metric.compute(
        predictions=predicted_classes, references=labels, average='micro'
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

### Tokenization

Tokenize the text ...

In [13]:
# Tokenizer matching the selected checkpoint; `use_fast=False` explicitly
# requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=False)

In [14]:
# Preprocessor that tokenizes the 'text' field of each sample with `tokenizer`.
p = Preprocessor(tokenizer)

In [15]:
if is_train:
    # Tokenize both splits; Preprocessor truncates each text to 512 tokens.
    tokenize_train = p.process_samples(train_data)
    tokenize_validation = p.process_samples(validation_data)

In [16]:
# Data collator that pads batches with `tokenizer`; `max_length` is set to the
# tokenizer's own `model_max_length`.
collector = DataCollatorWithPadding(tokenizer, max_length=tokenizer.model_max_length)

## Train and Evaluate the Network

### Train (enabled if `is_train` is True)

In [17]:
#Parameters
epochs = 1
batch_size = 1
weight_decay = 0
learning_rate = 1e-4
out_dir = 'CU_with_DBert'
log = 'Cultural Analysis'

In [18]:
# Class index -> human-readable label name.
cls2label = {
    0: 'Cultural Agnostic',
    1: 'Cultural Rapresentative',
    2: 'Cultural Exclusive',
}
# Reverse lookup: label name -> class index.
label2cls = {name: index for index, name in cls2label.items()}

In [19]:
# Training configuration assembled from the hyper-parameters defined above.
# Evaluation strategy is 'epoch', nothing is pushed to the Hub, there is no
# warm-up, and `report_to="none"` is passed for the reporting integrations.
traning_args = TrainingArguments(
    output_dir=out_dir,
    eval_strategy='epoch',
    push_to_hub=False,
    num_train_epochs = epochs,
    per_device_train_batch_size=batch_size,
    warmup_steps=0,
    weight_decay=weight_decay,
    learning_rate=learning_rate,             
    report_to="none",
    logging_dir=log 
    
)



In [29]:
if is_train:
    # Trainer arguments are positional, in this order: model, training
    # arguments, data collator, train set, validation set, tokenizer.
    trainer = Trainer(model,traning_args, collector, tokenize_train, tokenize_validation,tokenizer,compute_metrics=compute_metrics)
    print(f'Model running on {trainer.model.device}')
    trainer.train()
    # Final evaluation on the validation set; metrics come from compute_metrics.
    report = trainer.evaluate()
    print(report)

### Test Grade

In [61]:
import pandas as pd
import torch 

from torch import tensor
from torch.nn import Module
from pandas import DataFrame
from pandas import Series
from datasets import Dataset
def predict_culture_pd(ds: "Dataset", model: Module, tokenizer, device, max_length=128, batch_size=32) -> Series:
    """Predict the class index of every sample in ``ds``.

    The texts are tokenized and fed to the model in mini-batches, so memory
    use is bounded by ``batch_size`` rather than by the dataset size, and the
    batch dimension is always kept (a single-sample dataset works as well).

    Args:
        ds: dataset-like object; ``ds['text']`` must yield the input strings.
        model: sequence-classification module whose output exposes ``.logits``
            with one row per input sample.
        tokenizer: callable accepting a list of strings plus the keyword
            arguments ``return_tensors``, ``max_length``, ``padding`` and
            ``truncation``; it must return a mapping with the tensors
            'input_ids' and 'attention_mask'.
        device: torch device (or device string) used for inference.
        max_length: every text is padded / truncated to this many tokens.
        batch_size: number of samples per forward pass (must be >= 1).

    Returns:
        pandas Series of int64 class indices, one per sample, in dataset order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    model = model.to(device)
    model.eval()

    texts = list(ds['text'])
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Tokenizing a list keeps the (batch, max_length) shape, so no
            # squeeze is needed and a one-element batch stays two-dimensional.
            encoded = tokenizer(
                texts[start:start + batch_size],
                return_tensors='pt',
                max_length=max_length,
                padding='max_length',
                truncation=True,
            )
            outputs = model(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
            )
            # Move each batch back to the CPU right away to free device memory.
            batch_predictions.append(outputs.logits.argmax(dim=1).cpu())

    if not batch_predictions:
        # Empty dataset: return an empty Series with the usual label dtype.
        return Series([], dtype='int64')
    return Series(torch.cat(batch_predictions).numpy())
    




In [64]:
# Load the test file and keep only the selected feature, renamed to 'text'.
loader = Local_Loader(test_file)
test = loader.get()
# Disabled alternative that goes through the factory:
#test = factory.produce(loader, out_file=None, enable_feature=[fe], targe_feature=None, batch_s=45)
# A dedicated name for the intermediate frame keeps the training-time
# `train_data` intact, so re-running earlier cells after this one stays safe.
test_data = test[[fe]].rename({fe: 'text'}, axis=1)
# NOTE: `train_ds` holds TEST samples; the name is kept because the prediction
# cell reads it.
train_ds = Dataset.from_pandas(test_data, features=Features({
    'text' : Value('string')
}), split=Split.TEST)

# Build the tokenizer from the same checkpoint id as the model instead of a
# hard-coded duplicate of it.
tokenizer = AutoTokenizer.from_pretrained(model_repo)



In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU.
y_pred = predict_culture_pd(train_ds, model, tokenizer, ('cuda' if torch.cuda.is_available() else 'cpu'))

# Plain column assignment instead of DataFrame.insert: it appends 'label' as
# the last column on the first run and simply overwrites it on a re-run
# (insert raises when the column already exists). `to_numpy()` assigns the
# predictions by position, so a non-default index on `test` cannot misalign them.
test['label'] = y_pred.to_numpy()


Map: 100%|██████████| 300/300 [00:00<00:00, 4928.89 examples/s]


       qid                    wiki_name
0   Q15786               1. FC Nürnberg
1  Q268530                   77 Records
2  Q216153                 A Bug's Life
3     Q593                 A Gang Story
4  Q192185                Aaron Copland
5  Q265890             Aarwangen Castle
6  Q305718                        Abaya
7  Q337267        Academy of San Carlos
8      Q15                       Africa
9  Q388170  African-American literature


In [66]:
# Preview the first ten rows together with the predicted label.
print(test[['qid', 'wiki_name','label']].head(10))

       qid                    wiki_name  label
0   Q15786               1. FC Nürnberg      1
1  Q268530                   77 Records      1
2  Q216153                 A Bug's Life      1
3     Q593                 A Gang Story      1
4  Q192185                Aaron Copland      1
5  Q265890             Aarwangen Castle      1
6  Q305718                        Abaya      1
7  Q337267        Academy of San Carlos      1
8      Q15                       Africa      1
9  Q388170  African-American literature      1


In [67]:
#####################################
# Save file for evaluation purposes #
#####################################

# Tab-separated output of the test frame including the predicted 'label'.
# NOTE(review): with pandas' default `index=True` the row index is written as
# an extra first column - confirm the evaluation script expects it.
test.to_csv('results_TransformersNetwork.tsv', sep='\t')