In [1]:
!pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [8]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from transformers import AutoTokenizer

In [10]:
# Source CSV and base checkpoint, named once so they are easy to change.
DATA_PATH = '/content/greetings_dataset.csv'
BASE_CHECKPOINT = 'bert-base-uncased'

# Read the CSV with pandas and wrap the frame as a Hugging Face Dataset.
# Later cells read its 'texto' (input text) and 'label' columns.
data = pd.read_csv(DATA_PATH)
dataset = Dataset.from_pandas(data)

# Tokenizer belonging to the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

# Tokenize
def tokenize_function(examples):
    """Tokenize the 'texto' column of a batch of examples.

    Args:
        examples: Mapping with a 'texto' entry holding the input text(s),
            as supplied by ``Dataset.map``.

    Returns:
        The output of the module-level ``tokenizer`` called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = examples['texto']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Add the tokenizer outputs to every example, processing rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# String labels as they appear in the CSV -> integer class ids.
labels_to_ids = {'greeting': 0, 'none': 1}


def _encode_label(example):
    """Replace an example's string label with its integer class id."""
    return {'label': labels_to_ids[example['label']]}


tokenized_datasets = tokenized_datasets.map(_encode_label)

# Names attached to the model's output classes; class 1 ('none' in the CSV)
# is reported as 'not_greeting'.
id2label = {0: 'greeting', 1: 'not_greeting'}
label2id = dict(zip(id2label.values(), id2label.keys()))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# BERT sequence classifier with a 2-class head; attach the label-name mappings
# to its config so predictions carry readable labels.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.config.id2label = id2label
model.config.label2id = label2id

# Hold out 20% of the examples for evaluation. The fixed seed makes the split
# (and therefore the reported accuracy) reproducible across runs.
# NOTE(review): this rebinds `tokenized_datasets` to the split result, so the
# cell presumably cannot be re-run without re-running the tokenization cell.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy, so the function does not depend
    on the deprecated ``datasets.load_metric`` API or on fetching a metric
    script at run time.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": fraction of predictions equal to the labels}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Training configuration: outputs under ./results, evaluation every epoch,
# batches of 16 per device for both training and evaluation.
# NOTE(review): num_train_epochs is 10 here, but the recorded output of this
# cell reports epoch 3.0 at global_step=150, so the output looks stale.
# Confirm the intended value and re-run.
# NOTE(review): `evaluation_strategy` is presumably renamed to `eval_strategy`
# in newer transformers releases; verify against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

# Trainer wiring: fine-tune `model` on the "train" split, evaluate on the
# "test" split, and report the metrics returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.036895,1.0
2,No log,0.003338,1.0
3,No log,0.002393,1.0


TrainOutput(global_step=150, training_loss=0.10691238403320312, metrics={'train_runtime': 55.9677, 'train_samples_per_second': 42.882, 'train_steps_per_second': 2.68, 'total_flos': 631466532864000.0, 'train_loss': 0.10691238403320312, 'epoch': 3.0})

In [12]:
# Evaluate on the held-out split and report the accuracy entry of the metrics.
results = trainer.evaluate()
eval_accuracy = results['eval_accuracy']
print(f"Accuracy: {eval_accuracy}")

Accuracy: 1.0


In [14]:
# Save the fine-tuned model and its tokenizer side by side in one directory so
# they can be reloaded together with from_pretrained.
OUTPUT_DIR = "HelloBERT"

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('HelloBERT/tokenizer_config.json',
 'HelloBERT/special_tokens_map.json',
 'HelloBERT/vocab.txt',
 'HelloBERT/added_tokens.json',
 'HelloBERT/tokenizer.json')

In [16]:
# Reload the fine-tuned model and tokenizer from the local HelloBERT directory.
model = AutoModelForSequenceClassification.from_pretrained("HelloBERT")
tokenizer = AutoTokenizer.from_pretrained("HelloBERT")

# Build the classifier on the first GPU when one is available and fall back to
# CPU (device=-1) otherwise, so the cell also runs on CPU-only runtimes.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

# Texts to classify
texts = [
    "Hello, how are you?",
    "Hola, ¿cómo estás?",
    "What's the weather like today?",
    "Buenos días, ¿cómo te encuentras?",
    "I need help with my homework.",
    "Qué haces panita? Cómo va?",
    "Buenardo, buen día, coscu",
    "heyyyy",
    "buenas buenas",
    "helloooooo",
    "hi",
    "whats uuuuppp????"
]

# Classify all texts in a single pipeline call instead of one call per text
# (repeated single calls trigger the "pipelines sequentially on GPU" warning).
# The pipeline returns one {'label', 'score'} dict per input, in input order.
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Text: {text} => Label: {result['label']}, Score: {result['score']}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Text: Hello, how are you? => Label: greeting, Score: 0.9980661273002625
Text: Hola, ¿cómo estás? => Label: greeting, Score: 0.9977893829345703
Text: What's the weather like today? => Label: not_greeting, Score: 0.9933899641036987
Text: Buenos días, ¿cómo te encuentras? => Label: greeting, Score: 0.99715256690979
Text: I need help with my homework. => Label: not_greeting, Score: 0.9958699345588684
Text: Qué haces panita? Cómo va? => Label: greeting, Score: 0.9890562295913696
Text: Buenardo, buen día, coscu => Label: greeting, Score: 0.9166218638420105
Text: heyyyy => Label: greeting, Score: 0.9971520900726318
Text: buenas buenas => Label: greeting, Score: 0.997988224029541
Text: helloooooo => Label: greeting, Score: 0.9977092742919922
Text: hi => Label: greeting, Score: 0.9982302784919739
Text: whats uuuuppp???? => Label: greeting, Score: 0.9894077181816101
