<a href="https://colab.research.google.com/github/miguelcimat/Training-Spanish-Text/blob/main/TrainingSpanishTextBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification with Transformers

### Setup

In [None]:
# Install all dependencies in one resolver pass so a later install cannot
# silently override an earlier pin. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel.
# `datasets` is pinned to the version this notebook was last executed with.
# NOTE(review): `evaluate` is still unpinned — pin it once a known-good version is confirmed.
%pip install -q datasets==2.19.0 evaluate transformers==4.28.0

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [None]:
import os
import subprocess

# `apt-get -y` keeps the install non-interactive so "Run all" cannot stall on a prompt.
!apt-get install -y -qq git-lfs

# Take the commit identity from the environment instead of hardcoding a personal
# e-mail address in a shared notebook. Set GIT_USER_EMAIL / GIT_USER_NAME before
# running this cell if commits should carry a real identity.
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "colab-user@example.com")
GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "colab-user")

# Argument lists (no shell) so the values are never interpreted by a shell.
for git_key, git_value in (("user.email", GIT_USER_EMAIL), ("user.name", GIT_USER_NAME)):
    subprocess.run(["git", "config", "--global", git_key, git_value], check=True)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


### Load and explore data

In [None]:
from google.colab import drive
# Mount Google Drive; the corpus paths used below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value

# Location of the corpus files on the mounted Google Drive.
DATA_DIR = '/content/drive/MyDrive/Meia2023/Modulo2-ClasificacionTextos/notebooks/corpusTASS-2020'
train_data_path = f'{DATA_DIR}/train.tsv'
dev_data_path = f'{DATA_DIR}/dev.tsv'

# Single source of truth for the label encoding: the position of a name in this
# list is the integer id that `ClassLabel` assigns to it. `label_mapping` is
# derived from it so the two can never disagree.
LABEL_NAMES = ['P', 'N', 'NEU']
label_mapping = {name: idx for idx, name in enumerate(LABEL_NAMES)}

# Define the structure of the dataset for Hugging Face
features = Features({
    'texto': Value('string'),
    'etiqueta': ClassLabel(num_classes=len(LABEL_NAMES), names=LABEL_NAMES),
})


def _read_split(path):
    """Read one tab-separated corpus file and drop the `id` and `pais` columns.

    Only `texto` and `etiqueta` are declared in `features`, so the other two
    columns are removed before the conversion to a `Dataset`.
    """
    return pd.read_csv(path, sep='\t').drop(columns=['id', 'pais'])


def _to_dataset(frame):
    """Convert a corpus DataFrame to a `Dataset` with columns `text` and `label`."""
    return (
        Dataset.from_pandas(frame, features=features)
        .rename_column("texto", "text")
        .rename_column("etiqueta", "label")
    )


train_data = _read_split(train_data_path)
dev_data = _read_split(dev_data_path)

dataset_train = _to_dataset(train_data)
dataset_dev = _to_dataset(dev_data)

# Note: the dev split is stored under the 'test' key.
datasets = DatasetDict({'train': dataset_train, 'test': dataset_dev})

datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4802
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2443
    })
})

In [None]:
import random
import pandas as pd
from datasets import ClassLabel
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Adapted from https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb

    Args:
        dataset: object supporting `len()`, indexing with a list of row
            positions (the result is passed to `pd.DataFrame`), and a
            `features` mapping from column name to feature type.
        num_examples: number of rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: if `num_examples` is larger than the dataset.
        ValueError: if `num_examples` is negative (raised by `random.sample`).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # Draw distinct row positions in one call instead of retrying on collisions.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show the class name instead of its integer id.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

# Preview random training rows (num_examples defaults to 10).
show_random_elements(datasets['train'])

Unnamed: 0,text,label
0,@Jenyeerimenta Yo a él lo amaba desde hace 3 años y ya es demasiado común jaja,N
1,"@Jhovanny920218 la vdd si, jaja pero yo me curo rápido de eso. Ya la perdone. Gracias!!",P
2,Mi drama es que ahora me sale primero @taracanto en vez de @/BlasCanto y me confundo bc costumbre de dar al primer user,N
3,@jdclarke y que el mae no sea tan desastroso como aparentó durante la campaña. Hope is all we have left,N
4,"@M1ster__ thanks weapon bonico,♡ sí, ya estoy en el aeropuerto y fíjate tú ni me da pena. Anoche el calor fue HORROROSO no he dormido ná.",NEU
5,"Parada obligada en pits y al banco.. Rumbo a Pueblo frío.. Hay q ir por la patrona, se acabaron las vacaciones",N
6,Paaa jona te extraño primo firme,N
7,@rickymanjarrez miraré el programa completo y ya esta gracias me encanto lo que escribiste sobre marc y jlo en la foto q RT Gracias!!,P
8,"Te bloquee Crea cuentas huevo y solito se sigue y dá RT, porque ya no le creen sus fans. Ah qué mi paisano, ni me interesa su amistad.",N
9,mi hermoso @jaime_cruzroman esta que hace live en ig y no puedo ver me quiero morir queria un saludo para las @Cruzers_Peru,NEU


In [None]:
# Convert the training split to a DataFrame for inspection. `to_pandas()` is
# used instead of `set_format("pandas")` so the output format of the whole
# DatasetDict is not switched as a side effect of this cell.
df = datasets['train'].to_pandas()
df.head()

Unnamed: 0,text,label
0,@morbosaborealis jajajaja... eso es verdad... ...,1
1,@Adriansoler espero y deseo que el interior te...,2
2,"comprendo que te molen mis tattoos, pero no te...",2
3,"Mi última partida jugada, con Sona support. La...",0
4,Tranquilos que con el.dinero de Camacho seguro...,0


In [None]:
# Make sure the DatasetDict is back on its default output format for the cells that follow.
datasets.reset_format()

In [None]:
# Preview three random training rows.
show_random_elements(datasets['train'], num_examples=3)

Unnamed: 0,text,label
0,He salido de casa sin ir al baño y me estoy meando como una persona mayor,N
1,"El año pasado lloré cuando Shawn dijo que vendría y yo no pude ir y soy consciente de que este año tampoco va a poder ser, quiero llorar",N
2,"No puedo creer que en unas horas sere egresada!!! 21 años,2 carreras,estudios en europa,excelente trabajo Gracias Dios!",P


### Load the tokenizer of a pretrained multilingual model (DistilBERT) that covers Spanish

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; it selects the tokenizer here and the model weights further down.
# Alternative checkpoint: "vg055/roberta-base-bne-finetuned-Tass2020"
model_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Inspect the vocabulary size reported by the tokenizer.
tokenizer.vocab_size

119547

In [None]:
text = "¡hola, estamos muy felices practicando la tokenizacion!"
tokenized_text = tokenizer.encode(text)

# Print every token id next to the piece of text it decodes to.
for token_id in tokenized_text:
    decoded_piece = tokenizer.decode([token_id])
    print(token_id, decoded_piece)

101 [CLS]
199 ¡
110516 hol
10113 ##a
117 ,
11504 esta
13386 ##mos
13436 muy
13077 fel
39801 ##ices
56309 prac
13640 ##tica
10605 ##ndo
10109 la
18436 tok
18687 ##eni
104679 ##zaci
10263 ##on
106 !
102 [SEP]


In [None]:
# Encode the sample sentence with return_tensors="pt"; `encoded_text` is reused
# further down as input for a forward pass through the model.
encoded_text = tokenizer(text, return_tensors="pt")
encoded_text

{'input_ids': tensor([[   101,    199, 110516,  10113,    117,  11504,  13386,  13436,  13077,
          39801,  56309,  13640,  10605,  10109,  18436,  18687, 104679,  10263,
            106,    102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
def tokenize_reviews(examples):
    """Tokenize the `text` column of a batch, with truncation enabled.

    Args:
        examples: mapping that contains the raw texts under the key "text".

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    raw_texts = examples["text"]
    return tokenizer(raw_texts, truncation=True)

In [None]:
# Every column except "label" is dropped once the tokenizer output has been added.
columns = datasets['train'].column_names
columns.remove("label")

encoded_dataset = datasets.map(
    tokenize_reviews,
    remove_columns=columns,
    batched=True,
)
encoded_dataset

Map:   0%|          | 0/4802 [00:00<?, ? examples/s]

Map:   0%|          | 0/2443 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 4802
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2443
    })
})

In [None]:
# Inspect the first encoded training example.
encoded_dataset['train'][0]

{'label': 1,
 'input_ids': [101,
  137,
  24984,
  19804,
  107956,
  33269,
  10201,
  10320,
  10320,
  10320,
  119,
  119,
  119,
  36584,
  10196,
  79381,
  119,
  119,
  119,
  36579,
  10192,
  13605,
  11381,
  10854,
  13819,
  10133,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

### Load the pretrained model

In [None]:
from transformers import AutoModelForSequenceClassification

# Derive the classification head size and the id <-> name tables from the
# dataset's ClassLabel feature instead of repeating a magic number, so the
# model configuration always agrees with the label encoding of the data.
label_feature = datasets['train'].features['label']
num_labels = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weigh

In [None]:
import torch

# Sanity-check forward pass on the sample sentence. No gradients are needed
# here, so autograd tracking is switched off for this call.
with torch.no_grad():
    outputs = model(**encoded_text)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0525, -0.1443,  0.0373]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Define performance metrics

In [None]:
import evaluate

# Load the accuracy metric; `compute_metrics` below calls `metric.compute`.
metric = evaluate.load("accuracy")
metric

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: pair `(scores, references)`; the predicted class of each row
            is the index of its largest score along axis 1.

    Returns:
        The result of `metric.compute` for the predicted classes and references.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(references=references, predictions=predicted_classes)

### Fine-tune the pretrained model

In [None]:
from transformers import TrainingArguments

# Short checkpoint name (text after the last "/"); only needed for the optional
# Hub upload settings that are commented out at the end of TrainingArguments.
model_name = model_checkpoint.split("/")[-1]

batch_size = 12
num_train_epochs=3
# To experiment on a subset, append e.g. `.shuffle(seed=42).select(range(n))` here.
train_dataset = encoded_dataset["train"]
# Logging interval in optimizer steps. Clamped to at least 1: for a small
# training set the integer division yields 0, which is not a usable interval.
logging_steps = max(1, len(train_dataset) // (2 * batch_size * num_train_epochs))

training_args = TrainingArguments(
    output_dir="results-meia_2",
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best accuracy when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=logging_steps,
    #push_to_hub=True,
    #push_to_hub_model_id=f"{model_name}-finetuned-tass"
)

In [None]:
from transformers import Trainer

# Split that the Trainer evaluates on.
test_dataset = encoded_dataset["test"]

# Same wiring as before, grouped as: model/config, data, tokenizer, metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Launch fine-tuning as configured on `trainer`; the returned TrainOutput is displayed.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9268,0.922343,0.555465
2,0.7507,0.92511,0.580434
3,0.6203,0.972853,0.585755


TrainOutput(global_step=1203, training_loss=0.798217174120972, metrics={'train_runtime': 170.9647, 'train_samples_per_second': 84.263, 'train_steps_per_second': 7.037, 'total_flos': 168645907363644.0, 'train_loss': 0.798217174120972, 'epoch': 3.0})