## Procesado de Datos

###Descarga del DataSet

In [1]:
%%capture
!pip install datasets transformers evaluate

### Importamos el datset

In [2]:
from datasets import load_dataset

ds = load_dataset('glue',"mrpc")



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
example = ds ['train'][400]
example

{'sentence1': 'U.S. Agriculture Secretary Ann Veneman , who announced Tuesdays ban , also said Washington would send a technical team to Canada to help .',
 'sentence2': "U.S. Agriculture Secretary Ann Veneman , who announced yesterday 's ban , also said Washington would send a technical team to Canada to assist in the Canadian situation .",
 'label': 1,
 'idx': 446}

In [4]:
example = ds ['train'][100]
example

{'sentence1': 'The Nasdaq composite index inched up 1.28 , or 0.1 percent , to 1,766.60 , following a weekly win of 3.7 percent .',
 'sentence2': 'The technology-laced Nasdaq Composite Index .IXIC was off 24.44 points , or 1.39 percent , at 1,739.87 .',
 'label': 0,
 'idx': 114}

In [5]:
# vista de labels
labels = ds["train"].features["label"]

In [6]:
labels.int2str(1)

'equivalent'

In [7]:
labels.int2str(0)

'not_equivalent'

## Tokenizador

In [8]:
from transformers import AutoTokenizer

repo_id = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(repo_id)

In [9]:
ds['train']['sentence1'][1]

"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."

In [10]:
tokenized_sentence_1 = tokenizer(ds["train"]["sentence1"][1])
tokenized_sentence_1

{'input_ids': [101, 9805, 3540, 11514, 2050, 3079, 11282, 2243, 1005, 1055, 2077, 4855, 1996, 4677, 2000, 3647, 4576, 1999, 2687, 2005, 1002, 1016, 1012, 1019, 4551, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
inputs  = tokenizer("This is the first","This is the second")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 102, 2023, 2003, 1996, 2117, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
#hacemos el procesos inverso convirtiendo los numeros a texto
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 '[SEP]']

### Tokenizador con otro modelo

In [13]:
repo_id="distilroberta-base"

tokeniker  =  AutoTokenizer.from_pretrained(repo_id)

In [14]:
def tokenize_fn(example):
    return tokeniker(example['sentence1'],example['sentence2'],truncation=True)

### preparacion del dataset

In [15]:
prepared_ds = ds.map(tokenize_fn,batched=True)



Map:   0%|          | 0/408 [00:00<?, ? examples/s]



In [16]:
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### Definiendo el datacollator: con un padding dinamico

In [17]:
##usaremos la tecnica del padding por lotes para tener un dataset Cuadrado
from transformers import DataCollatorWithPadding

data_collactor =DataCollatorWithPadding(tokenizer = tokenizer)

In [18]:
data_collactor

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

## Entrenamiento y evaluacion

### Metricas

In [19]:
import evaluate
import numpy as np



def compute_metrics(eval_pred):
    metric = evaluate.load('glue',"mrpc")
    logits, labels = eval_pred
    predictions = np.argmax(logits , axis=-1)
    return metric.compute(predictions = predictions,references=labels)


### Configuracion del Trainer

In [20]:
from transformers import AutoModelForSequenceClassification

labels = ds['train'].features["label"].names

In [21]:
labels

['not_equivalent', 'equivalent']

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(
    repo_id,
    num_labels = len (labels),
    id2label = {str(i):c for i,c in enumerate(labels)},
    label2id = {c:str(i)for i,c in enumerate(labels) }

)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bia

In [23]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

### Definimos nuestros Argumentos de entrenamiento

In [24]:
%%capture
!pip install transformers==4.29.0
!pip install git+https://github.com/huggingface/accelerate

In [25]:
from transformers import TrainingArguments

training_args= TrainingArguments(
    output_dir ="./distilroberta-base-mrpc-glue-jhon-ramirez",
    evaluation_strategy = "steps",
    num_train_epochs= 3 ,
    push_to_hub = True,
    load_best_model_at_end=True
)

In [26]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [27]:
from transformers import Trainer


In [28]:
print(Trainer)


<class 'transformers.trainer.Trainer'>


In [29]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    data_collator=data_collactor,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/313M [00:00<?, ?B/s]

Download file runs/Jun22_20-12-03_ba3329219061/1687466169.966188/events.out.tfevents.1687466169.ba3329219061.2…

Download file runs/Jun22_20-12-03_ba3329219061/1687466293.9761066/events.out.tfevents.1687466293.ba3329219061.…

Download file runs/Jun22_20-42-25_ba3329219061/1687466578.0001817/events.out.tfevents.1687466578.ba3329219061.…

Download file runs/Jun22_20-12-03_ba3329219061/1687466378.5618043/events.out.tfevents.1687466378.ba3329219061.…

Download file runs/Jun22_20-12-03_ba3329219061/1687465988.9502335/events.out.tfevents.1687465988.ba3329219061.…

Download file runs/Jun22_20-12-03_ba3329219061/events.out.tfevents.1687465988.ba3329219061.2076.0: 100%|######…

Download file runs/Jun22_20-42-25_ba3329219061/events.out.tfevents.1687466577.ba3329219061.10063.0: 100%|#####…

Clean file runs/Jun22_20-12-03_ba3329219061/1687466169.966188/events.out.tfevents.1687466169.ba3329219061.2076…

Clean file runs/Jun22_20-12-03_ba3329219061/1687465988.9502335/events.out.tfevents.1687465988.ba3329219061.207…

Clean file runs/Jun22_20-12-03_ba3329219061/1687466378.5618043/events.out.tfevents.1687466378.ba3329219061.207…

Clean file runs/Jun22_20-12-03_ba3329219061/events.out.tfevents.1687465988.ba3329219061.2076.0:   6%|5        …

Clean file runs/Jun22_20-12-03_ba3329219061/1687466293.9761066/events.out.tfevents.1687466293.ba3329219061.207…

Download file runs/Jun23_02-57-49_0475471308bd/1687489095.8901136/events.out.tfevents.1687489095.0475471308bd.…

Clean file runs/Jun22_20-42-25_ba3329219061/1687466578.0001817/events.out.tfevents.1687466578.ba3329219061.100…

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Download file runs/Jun23_02-57-49_0475471308bd/events.out.tfevents.1687489095.0475471308bd.15143.0: 100%|#####…

Download file runs/Jun23_02-12-48_0475471308bd/events.out.tfevents.1687487183.0475471308bd.2985.2: 100%|######…

Clean file runs/Jun22_20-42-25_ba3329219061/events.out.tfevents.1687466577.ba3329219061.10063.0:  18%|#8      …

Download file runs/Jun23_02-12-48_0475471308bd/events.out.tfevents.1687486978.0475471308bd.2985.0: 100%|######…

Clean file runs/Jun23_02-57-49_0475471308bd/1687489095.8901136/events.out.tfevents.1687489095.0475471308bd.151…

Download file runs/Jun23_02-12-48_0475471308bd/1687486978.599051/events.out.tfevents.1687486978.0475471308bd.2…

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file runs/Jun23_02-57-49_0475471308bd/events.out.tfevents.1687489095.0475471308bd.15143.0:  18%|#8      …

Clean file runs/Jun23_02-12-48_0475471308bd/events.out.tfevents.1687487183.0475471308bd.2985.2: 100%|#########…

Clean file runs/Jun23_02-12-48_0475471308bd/events.out.tfevents.1687486978.0475471308bd.2985.0:  18%|#8       …

Clean file runs/Jun23_02-12-48_0475471308bd/1687486978.599051/events.out.tfevents.1687486978.0475471308bd.2985…

Download file runs/Jun23_02-57-49_0475471308bd/events.out.tfevents.1687489311.0475471308bd.15143.2: 100%|#####…

Clean file runs/Jun23_02-57-49_0475471308bd/events.out.tfevents.1687489311.0475471308bd.15143.2: 100%|########…

Clean file pytorch_model.bin:   0%|          | 1.00k/313M [00:00<?, ?B/s]

### Entrenamiento

In [30]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train",train_results.metrics)
trainer.save_metrics("train",train_results.metrics)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
500,0.5026,0.678408,0.808824,0.86121
1000,0.3307,0.561845,0.85049,0.891266


Downloading builder script: 0.00B [00:00, ?B/s]

To https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez
   61b3dd7..e197c50  main -> main

   61b3dd7..e197c50  main -> main

To https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez
   e197c50..8675c27  main -> main

   e197c50..8675c27  main -> main



***** train metrics *****
  epoch                    =        3.0
  total_flos               =   191920GF
  train_loss               =     0.3504
  train_runtime            = 0:02:39.36
  train_samples_per_second =     69.048
  train_steps_per_second   =       8.64


In [31]:
metrics = trainer.evaluate(prepared_ds["validation"])
trainer.log_metrics("eval",metrics)
trainer.save_metrics("eval",metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.8505
  eval_f1                 =     0.8913
  eval_loss               =     0.5618
  eval_runtime            = 0:00:01.29
  eval_samples_per_second =    314.543
  eval_steps_per_second   =     39.318


To https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez
   8675c27..d544c65  main -> main

   8675c27..d544c65  main -> main

To https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez
   d544c65..1ad6bc3  main -> main

   d544c65..1ad6bc3  main -> main



'https://huggingface.co/newmanbb/distilroberta-base-mrpc-glue-jhon-ramirez/commit/d544c650e79fd5cb1627058bf39a44adf17deee1'