# Cultural Classification with Transformer Architecture (Network II)🔥

In [1]:
# Mount Drive folder
from google.colab import drive
drive.mount('/content/drive/')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/src/* . # Copy source files in env space
%cp -r /content/drive/MyDrive/Many_Naps_Little_Progress/*.*sv . # Copy datasets in env space
%cp /content/drive/MyDrive/Many_Naps_Little_Progress/colab_install.sh .
%ls

## Env Configuration

Install additional libs required for traning/testing

In [None]:
!bash colab_install.sh >> /dev/null

## Hugging-Face Login 🤗

login on hugging face (use to download pre-trained network)

In [None]:
from huggingface_hub import login

HF_TOKEN = "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"
login(token=HF_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


## Import Necessary Libraries 📚

In [1]:
# Import Datases to work with Transformers by Hugging-Face
from datasets import Dataset
import pandas as pd

# Imports for Transformers
from transformers import AutoTokenizer  # Datasets
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification  # Model
from transformers import TrainingArguments, Trainer

import numpy as np  # Evaluation
import evaluate

from torch import nn
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Global Notebook Variables

### Load the Dataset

Choose appropriate features. Available features are:

* *'description'* - synthetic Wikidata description
* *'intro'* - Wikipedia page introduction
* *'full_page*' - full Wikipedia plain-text


In [2]:
is_train = True

In [3]:
###########################################
fe = "intro"  # @param {type:"string"}    #
###########################################

In [4]:
if fe == "full_page":
    train = pd.read_csv("f_tr_train.csv")
    validation = pd.read_csv("f_tr_validation.csv")
    test = pd.read_csv("f_tr_test.csv")
elif fe == "intro":
    train = pd.read_csv("i_tr_train.csv")
    validation = pd.read_csv("i_tr_validation.csv")
    test = pd.read_csv("i_tr_test.csv")
elif fe == "description":
    train = pd.read_csv("d_tr_train.csv")
    validation = pd.read_csv("d_tr_validation.csv")
    test = pd.read_csv("d_tr_test.csv")

In [5]:
if is_train:
    # Creare i DataFrame
    train_data = train.dropna(how='any', axis=0)[["label", fe]].rename(columns={fe: "text"})
    validation_data = validation.dropna(how='any', axis=0)[["label", fe]].rename(columns={fe: "text"})
    # Convertire i DataFrame in Dataset Hugging Face
    train_hf = Dataset.from_pandas(
        train_data,
        preserve_index=False
        
    )

    validation_hf = Dataset.from_pandas(
        validation_data,
        preserve_index=False
      
    )

test_data = test.dropna(how='any', axis=0)[[fe]].rename(columns={fe: "text"})
test_hf = Dataset.from_pandas(
        test_data,
        preserve_index=False
      
    )

In [6]:
print(train_hf['text'][0])


916  is a 2012 Indian Malayalam-language drama film written and directed by M. Mohanan, starring  Mukesh,  Anoop Menon, Malavika Menon, Asif Ali and Meera Vasudev. The film is about maintaining the sanctity of familial relationships.




### Tokenization

In [7]:
class Preprocessor:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer
    def process_samples(self, samples):
        return samples.map(lambda sample: self.tokenizer(sample['text'], truncation=True))
        

In [8]:
model_repo = "microsoft/deberta-v3-xsmall" if is_train else "my model"
# may customize the classification head after import

tokenizer = AutoTokenizer.from_pretrained(model_repo)
p = Preprocessor(tokenizer)

if is_train:
    tokenize_train = p.process_samples(train_hf)
    tokenize_validation = p.process_samples(validation_hf)
    print(tokenizer.convert_ids_to_tokens(tokenize_validation[5]["input_ids"][:50]))

collector = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/5956 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 5956/5956 [00:02<00:00, 2487.75 examples/s]
Map: 100%|██████████| 288/288 [00:00<00:00, 2017.96 examples/s]

['[CLS]', '▁Aar', 'wang', 'en', '▁Castle', '▁(', 'German', ':', '▁Schloss', '▁Aar', 'wang', 'en', ')', '▁is', '▁a', '▁castle', '▁in', '▁the', '▁municipality', '▁of', '▁Aar', 'wang', 'en', '▁of', '▁the', '▁canton', '▁of', '▁Bern', '▁in', '▁Switzerland', '.', '▁It', '▁is', '▁a', '▁Swiss', '▁heritage', '▁site', '▁of', '▁national', '▁significance', '.', '▁I', '[SEP]']





## Model Selection

### Tested Models
We have tested major pretrained model using different features, for each one we have reported accuracy value
* google/mobilebert-uncased (wiki_desc  - 72%)
* microsoft/deberta-v3-xsmall (wiki_desc - 78%)
* microsoft/deberta-v3-base
* distilbert/distilbert-base-uncased-finetuned-sst-2-english (wiki_desc - 75%)
* microsoft/Multilingual-MiniLM-L12-H384
* distilbert/distilbert-base-uncased-distilled-squad
* distilbert/distilroberta-base

![](../imgs/deBERTa.jpg)

In [9]:
class CU_Classifier(nn.Module):

    def __init__(self, config: dict[str, int]):
        super().__init__()

        self.out = nn.Sequential(
            nn.Linear(
                in_features=config["dim_embedding"],
                out_features=config["num_classes"],
            ),
        )

        self.downsample = nn.Linear(in_features=config["dim_embedding"], out_features=config["dim_embedding"])


        self.fc = nn.Sequential(
            nn.Linear(
                in_features=config["dim_embedding"],
                out_features=config["hidden_layers"],
            ),
            nn.GELU(),
            nn.LayerNorm(config["hidden_layers"]),
            nn.Dropout(p=0.15),
            
            nn.Linear(
                in_features=config["hidden_layers"],
                out_features=config["dim_embedding"],
            ),
            nn.GELU(),
            nn.LayerNorm(config["dim_embedding"]),
            nn.Dropout(p=0.15),

        )
    def forward(self, X):
        identity = X.clone()

        X = self.fc(X)
        X = self.downsample(X) + identity
        X = self.out(X)
        return X


config = {"dim_embedding": 384, "hidden_layers": 512, "num_classes": 3}

In [10]:

def compute_metrics(eval_pred):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="micro")["f1"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="micro")["recall"]

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [11]:

model = AutoModelForSequenceClassification.from_pretrained(
    model_repo, num_labels=3, ignore_mismatched_sizes=True)

model

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 384, padding_idx=0)
      (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=384, out_features=384, bias=True)
              (key_proj): Linear(in_features=384, out_features=384, bias=True)
              (value_proj): Linear(in_features=384, out_features=384, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): Layer

In [12]:
model.classifier = CU_Classifier(config)
model.dropout = nn.Dropout(0.15)


## Train and Evaluate the Network

### Training Phase (enabled if `is_train` is True)

In [13]:
# Parameters
epochs = 10

In [14]:
batch_size = 32
weight_decay = 0.01
learning_rate = 1.5e-5
out_dir = "CU_with_DBert"
log = "Cultural Analysis"

cls2label = {
    0: "Cultural Agnostic",
    1: "Cultural Rapresentative",
    2: "Cultural Exclusive",
}
label2cls = {l: c for c, l in cls2label.items()}

In [15]:
traning_args = TrainingArguments(
    output_dir=out_dir,
    eval_strategy="epoch",
    push_to_hub=False,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    warmup_steps=1000,
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    report_to="none",
    logging_dir=log,

    adam_epsilon=1e-6,
    adam_beta1=0.9,
    adam_beta2=0.999,
    max_grad_norm=1.0 # gradient cliping
)

In [16]:
if is_train:
    trainer = Trainer(
        model,
        traning_args,
        collector,
        tokenize_train,
        tokenize_validation,
        tokenizer,
        compute_metrics=compute_metrics,
    )
    print(f"Model running on {trainer.model.device}")
    trainer.train()
    report = trainer.evaluate()
    print(report)

Model running on cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 11.62 GiB of which 149.00 MiB is free. Including non-PyTorch memory, this process has 11.18 GiB memory in use. Of the allocated memory 10.90 GiB is allocated by PyTorch, and 166.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Testing Phase

In [None]:
import torch
from torch.utils.data import DataLoader
from pandas import Series
from datasets import Dataset
from tqdm import tqdm

def predict_culture_pd(ds: Dataset, model: torch.nn.Module, tokenizer, device, max_length=128, batch_size=32) -> Series:
    model = model.to(device)
    model.eval()

    def tokenize(batch):
        return tokenizer(
            batch["text"],
            return_tensors="pt",
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )

    ds_tokenized = ds.map(tokenize, batched=True)
    ds_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

    dataloader = DataLoader(ds_tokenized, batch_size=batch_size)

    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())

    return Series(predictions)


In [None]:

# Converts to tensors
tokenizer = AutoTokenizer.from_pretrained(
    model_repo
)



In [None]:
y_pred = predict_culture_pd(
    train_hf, model, tokenizer, ("cuda" if torch.cuda.is_available() else "cpu")
)

#####################################
# Save file for evaluation purposes #
#####################################

untest = pd.read_csv("test_unlabeled.csv")
untest.insert(loc=len(untest.columns), column="label", value=y_pred.map(cls2label))
untest.to_csv(
    "Many_Naps_Little_Progress_modello2.tsv", sep='\t',index=False
)

Map:   0%|          | 0/6248 [00:00<?, ? examples/s]

Map: 100%|██████████| 6248/6248 [00:00<00:00, 23447.30 examples/s]
100%|██████████| 196/196 [00:10<00:00, 17.84it/s]
