In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import classification_report

In [None]:
# loading dataset IMDb to apply the fine tunning.
dataset = load_dataset("imdb")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Function to tokenize texts
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# tokenizing
encoded_dataset = dataset.map(tokenize_function, batched=True)
encoded_dataset = encoded_dataset.remove_columns(["text"])
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format("torch")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
#Validating if the GPU is bein recognized to be used in train. 
print(torch.cuda.is_available()) 
print(torch.cuda.get_device_name(0)) 

True
NVIDIA GeForce MX550


In [5]:
# Download pre treined model distilBERT
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model = model.to("cuda")  # Used to force the train to use the GPU memory

# train parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Defining the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
)

# Training the model
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2845,0.28637
2,0.1848,0.280992
3,0.0848,0.339866


TrainOutput(global_step=12501, training_loss=0.198021726363257, metrics={'train_runtime': 60551.8327, 'train_samples_per_second': 1.239, 'train_steps_per_second': 0.206, 'total_flos': 9935054899200000.0, 'train_loss': 0.198021726363257, 'epoch': 3.0})

In [6]:
# Make predictions on the test dataset.
predictions = trainer.predict(encoded_dataset["test"])
y_pred = predictions.predictions.argmax(axis=-1)
y_true = predictions.label_ids

# Using the classification report to see the metrics from the model.
print(classification_report(y_true, y_pred, target_names=["Negativo", "Positivo"]))

              precision    recall  f1-score   support

    Negativo       0.94      0.92      0.93     12500
    Positivo       0.92      0.94      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000



In [5]:
# Recovering the model just to check
# checkpoint_path = "models/checkpoint-12501"
checkpoint_path = 'olucas-carvalho/sentimental-analysis-BERT'

# Loading model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [6]:
# Using the cpu to make predictions on new sentences.
device = torch.device("cpu")
model.to(device)

# Sentence to be tested.
text = "Today is a beautiful day, and I feel incredibly grateful for everything in my life."

inputs = tokenizer(text, return_tensors="pt")  # Retorna um tensor PyTorch
inputs = {key: value.to(device) for key, value in inputs.items()}

# Making the prediction
with torch.no_grad():
    outputs = model(**inputs)

# Predicting the class.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

print(f"Classe prevista: {predicted_class}")

Classe prevista: 1
