In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
from datasets import Dataset
#import evaluate
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [11]:
def prepare_data(df):
    """
    Prepare the data for training: encode the labels and split the texts.

    The ``rating`` column is label-encoded, and the ``description`` and
    ``variety`` columns are joined with a literal ``' [SEP] '`` separator to
    build the model input text. Missing descriptions or varieties are treated
    as empty strings so that every combined text is a real string.

    Side effect: the columns ``category_encoded`` and ``combined_text`` are
    added to (or overwritten on) ``df`` in place.

    Args:
        df: pandas DataFrame with ``description``, ``variety`` and ``rating``
            columns.

    Returns:
        Tuple ``(train_texts, val_texts, train_labels, val_labels,
        label_encoder)``. Texts and labels are numpy arrays from a 70/30
        split with ``random_state=42``; ``label_encoder`` is the fitted
        ``LabelEncoder`` needed to map predictions back to ratings.
    """
    label_encoder = LabelEncoder()
    df['category_encoded'] = label_encoder.fit_transform(df['rating'])

    # In pandas, NaN + str is NaN, so a single missing cell would turn the
    # whole combined text into a float that is later handed to the tokenizer.
    # Normalise both columns to strings before concatenating.
    description = df['description'].fillna('').astype(str)
    variety = df['variety'].fillna('').astype(str)
    df['combined_text'] = description + ' [SEP] ' + variety

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['combined_text'].values,
        df['category_encoded'].values,
        test_size=0.3,
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels, label_encoder

In [13]:
def create_dataset(texts, labels, tokenizer, max_length=512):
    """
    Tokenize the texts and build a ``datasets.Dataset`` for the transformer.

    Args:
        texts: Sequence of strings (numpy array, list, ...).
        labels: Sequence of integer class ids, one per text.
        tokenizer: Callable tokenizer; it is invoked with ``truncation=True``,
            ``padding=True`` and ``max_length`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Maximum number of tokens kept per text. Defaults to 512,
            the value that was previously hard-coded.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # Accept numpy arrays as well as plain Python sequences.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    if len(text_list) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(text_list)} and {len(labels)})"
        )

    # No return_tensors here: the encodings go straight into Dataset.from_dict,
    # so building torch tensors for the whole corpus first is wasted work and
    # memory.
    encodings = tokenizer(
        text_list,
        truncation=True,
        padding=True,
        max_length=max_length
    )

    dataset = Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

    return dataset

def compute_metrics(eval_pred):
    """
    Compute evaluation metrics for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            the logits array of shape ``(n_samples, n_classes)`` or a tuple
            whose first element is that array (some models return extra
            outputs alongside the logits). ``labels`` holds the true class
            ids.

    Returns:
        ``{'accuracy': float}`` - the fraction of samples whose arg-max class
        equals the label.
    """
    predictions, labels = eval_pred

    # When the model emits several outputs, the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.asarray(labels).reshape(-1)

    # float(...) yields a plain Python float rather than a numpy scalar.
    accuracy = float(np.mean(predicted_classes == true_classes))
    return {'accuracy': accuracy}

In [14]:
class WineReviewClassifier:
    """
    Thin wrapper around a pre-trained sequence-classification transformer.

    Attributes:
        model_name: Name of the pre-trained checkpoint that was loaded.
        tokenizer: Tokenizer loaded from ``model_name``.
        model: Classification model loaded from ``model_name`` with
            ``num_labels`` output classes.
    """

    def __init__(self, num_labels, model_name="distilbert-base-uncased"):
        """
        Load the tokenizer and the classification model.

        Args:
            num_labels: Number of target classes.
            model_name: Pre-trained checkpoint to start from. Defaults to
                ``"distilbert-base-uncased"``.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels
        )

    def train(self, train_dataset, val_dataset, epochs=3, batch_size=8):
        """
        Fine-tune the model with the Hugging Face ``Trainer``.

        The model is evaluated and checkpointed once per epoch, and the
        checkpoint with the best ``accuracy`` is loaded at the end.

        Args:
            train_dataset: Dataset used for training.
            val_dataset: Dataset used for the per-epoch evaluation.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for both training and
                evaluation; lower it if the device runs out of memory.

        Returns:
            The ``Trainer`` instance after training has finished.
        """
        import inspect

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever keyword the installed version accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        strategy_arg = (
            "eval_strategy" if "eval_strategy" in accepted
            else "evaluation_strategy"
        )

        training_args = TrainingArguments(
            output_dir="./wine_classifier",
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            **{strategy_arg: "epoch"}
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        trainer.train()
        return trainer

    def predict(self, texts, batch_size=32):
        """
        Predict class ids for new texts.

        Args:
            texts: A single string or a sequence of strings.
            batch_size: Number of texts run through the model at once.

        Returns:
            1-D numpy array of predicted class ids (int64), one per text.
            An empty input yields an empty array.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            return torch.empty(0, dtype=torch.long).numpy()

        # After training the model may live on an accelerator; the inputs have
        # to be moved to the same device before the forward pass.
        device = next(self.model.parameters()).device

        # Inference must run in eval mode (dropout disabled); the previous
        # mode is restored afterwards so a later training run is unaffected.
        was_training = self.model.training
        self.model.eval()
        batch_predictions = []
        try:
            with torch.no_grad():
                for start in range(0, len(texts), batch_size):
                    encodings = self.tokenizer(
                        texts[start:start + batch_size],
                        truncation=True,
                        padding=True,
                        max_length=512,
                        return_tensors='pt'
                    )
                    inputs = {
                        key: value.to(device)
                        for key, value in encodings.items()
                    }
                    outputs = self.model(**inputs)
                    # Move to CPU: .numpy() is not available on device tensors.
                    batch_predictions.append(
                        torch.argmax(outputs.logits, dim=1).cpu()
                    )
        finally:
            self.model.train(was_training)

        return torch.cat(batch_predictions).numpy()

In [15]:
def main(data_path="wine-data.csv", epochs=3):
    """
    Run the full pipeline: load the CSV, prepare the data, fine-tune the model.

    Args:
        data_path: Path to the CSV file with the wine reviews.
        epochs: Number of training epochs passed to the classifier.

    Returns:
        Tuple ``(classifier, trainer, label_encoder)`` so the trained model
        can be evaluated and used after the call.

    Example (evaluate the model and classify a new review)::

        classifier, trainer, label_encoder = main()
        eval_results = trainer.evaluate()
        print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
        predictions = classifier.predict(["Exemplo de nova review de vinho"])
        print(label_encoder.inverse_transform(predictions))
    """
    df = pd.read_csv(data_path)

    train_texts, val_texts, train_labels, val_labels, label_encoder = prepare_data(df)

    classifier = WineReviewClassifier(num_labels=len(label_encoder.classes_))

    train_dataset = create_dataset(train_texts, train_labels, classifier.tokenizer)
    val_dataset = create_dataset(val_texts, val_labels, classifier.tokenizer)

    trainer = classifier.train(train_dataset, val_dataset, epochs=epochs)

    # Hand the trained objects back instead of discarding them.
    return classifier, trainer, label_encoder


if __name__ == "__main__":
    main()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 5.61 GiB of which 3.00 MiB is free. Process 15862 has 4.94 GiB memory in use. Including non-PyTorch memory, this process has 668.00 MiB memory in use. Of the allocated memory 537.31 MiB is allocated by PyTorch, and 26.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)