In [1]:
%%capture
# kaggle workspace command


!pip install transformers datasets evaluate accelerate

In [2]:
import os
import numpy as np 
import pandas as pd

import evaluate

from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,
                          AutoConfig,
                          AutoTokenizer,
                          DataCollatorWithPadding,
                          pipeline)

2024-07-06 15:23:59.253878: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-06 15:23:59.254028: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 15:23:59.392868: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load and preprocess dataset

In [3]:
# Download the Amazon polarity review dataset (train and test splits,
# each with 'label', 'title' and 'content' columns).
ds = load_dataset("fancyzhx/amazon_polarity")

# Show the split / feature summary as the cell output.
ds

Downloading readme:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/260M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/258M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})

In [4]:
def make_title_and_content_strings(ds):
    """Build one ``"<title> <content>"`` string per row.

    Args:
        ds: Any object that can be indexed with ``'title'`` and ``'content'``
            to obtain two equally long sequences of strings (in this
            notebook, a dataset split).

    Returns:
        A new list whose element ``i`` is ``title[i] + ' ' + content[i]``.

    Raises:
        ValueError: If the two columns do not have the same length.
    """
    titles = list(ds['title'])
    contents = list(ds['content'])

    if len(titles) != len(contents):
        raise ValueError(
            f"'title' has {len(titles)} rows but 'content' has {len(contents)}"
        )

    # Build a fresh list instead of appending to ds['title'] in place: this
    # never modifies the caller's data and does not rely on ds['title']
    # being a mutable list.
    return [title + ' ' + content for title, content in zip(titles, contents)]

In [5]:
# Number of rows kept from each split, and the name of the derived text column.
SPLIT_SIZES = {"train": 300_000, "test": 60_000}
TEXT_COLUMN = "title_and_content"

for split_name, n_rows in SPLIT_SIZES.items():
    subset = ds[split_name].take(n_rows)

    # Only add the combined column when it is missing, so that re-running
    # this cell does not try to add the same column a second time.
    if TEXT_COLUMN not in subset.column_names:
        subset = subset.add_column(
            TEXT_COLUMN, make_title_and_content_strings(subset)
        )

    ds[split_name] = subset

# Initialize and train model

In [6]:
# Tokenizer for the same multilingual BERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    """Tokenize the combined title/content text of the given examples.

    The tokenizer is called with ``truncation=True``; no padding argument is
    passed here.
    """
    texts = examples['title_and_content']
    encoded = tokenizer(texts, truncation=True)
    return encoded


# batched=True: preprocess_function is applied to batches of examples.
tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [8]:
# Collator passed to the Trainer below; it is built from the same tokenizer
# that was used for preprocessing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Accuracy and F1 are computed together at every evaluation.
clf_metrics = evaluate.combine(["accuracy", "f1"])


def compute_metrics(eval_pred):
    """Score model outputs against the reference labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``clf_metrics.compute`` for those predicted classes.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return clf_metrics.compute(
        predictions=predicted_classes,
        references=references,
    )

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [10]:
# Class id -> readable name, and the inverse mapping derived from it.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: class_id for class_id, label in id2label.items()}

# The label mappings are stored on the config; the model is then loaded from
# the pretrained checkpoint using that config.
model_name = 'google-bert/bert-base-multilingual-cased'
config = AutoConfig.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="sentiment_analysis_reviews_model",
    report_to='none',
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save a checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so which checkpoint
    # counts as "best" follows the library default - confirm that is intended.
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    # NOTE(review): the "test" split doubles as the per-epoch evaluation set.
    eval_dataset=tokenized_ds["test"],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1637,0.155077,0.948683,0.948754
2,0.117,0.172913,0.955433,0.955813


TrainOutput(global_step=37500, training_loss=0.1604586021931966, metrics={'train_runtime': 16177.8119, 'train_samples_per_second': 37.088, 'train_steps_per_second': 2.318, 'total_flos': 7.121401646587968e+16, 'train_loss': 0.1604586021931966, 'epoch': 2.0})

# Inference

In [12]:
# Pick the checkpoint with the highest step count, i.e. the one saved after
# the final (second) epoch. os.listdir() returns entries in arbitrary order,
# so indexing its result by position is not reliable.
model_dir = '/kaggle/working/sentiment_analysis_reviews_model'

checkpoint_names = [
    name for name in os.listdir(model_dir)
    if name.startswith('checkpoint-') and name.rsplit('-', 1)[-1].isdigit()
]
if not checkpoint_names:
    raise FileNotFoundError(f"no 'checkpoint-<step>' directory found in {model_dir}")

last_checkpoint_name = max(
    checkpoint_names, key=lambda name: int(name.rsplit('-', 1)[-1])
)
second_epoch_checkpoint = os.path.join(model_dir, last_checkpoint_name)

second_epoch_checkpoint

'/kaggle/working/sentiment_analysis_reviews_model/checkpoint-37500'

In [13]:
# Inference pipeline loaded from the checkpoint path selected in the previous cell.
classifier = pipeline("sentiment-analysis", model=second_epoch_checkpoint)

In [14]:
# Portuguese sentences, each followed by an English counterpart.
text = 'bom produto, mas não estou satisfeito com o serviço'
# NOTE(review): "servic" looks like a typo for "service"; it is left as-is
# because changing the input would invalidate the recorded output.
text_eng = "good product, but I'm not satisfied with the servic"
text2 = 'nunca vi esse produto'
text2_eng = "I've never seen this product"

for sample in (text, text_eng, text2, text2_eng):
    print(classifier(sample))

[{'label': 'NEGATIVE', 'score': 0.9000328779220581}]
[{'label': 'POSITIVE', 'score': 0.9041880369186401}]
[{'label': 'NEGATIVE', 'score': 0.9897034764289856}]
[{'label': 'POSITIVE', 'score': 0.9231755137443542}]


In [15]:
# Same statement in Portuguese and in English.
for sample in (
    'Recebi bem antes do prazo estipulado.',
    'I received it well before the stipulated deadline.',
):
    print(classifier(sample))

[{'label': 'NEGATIVE', 'score': 0.9816004037857056}]
[{'label': 'NEGATIVE', 'score': 0.9715514183044434}]


In [16]:
# Very short inputs: a single word, then the same word inside a two-word phrase.
for sample in ('fixe', 'cool', 'loja fixe', 'cool shop'):
    print(classifier(sample))

[{'label': 'POSITIVE', 'score': 0.9833613038063049}]
[{'label': 'POSITIVE', 'score': 0.996070146560669}]
[{'label': 'POSITIVE', 'score': 0.9562061429023743}]
[{'label': 'POSITIVE', 'score': 0.9935418367385864}]


In [17]:
# Two-word phrases in two further languages.
for sample in ('tienda genial', 'boutique sympa'):
    print(classifier(sample))

[{'label': 'POSITIVE', 'score': 0.9948840737342834}]
[{'label': 'NEGATIVE', 'score': 0.9042856693267822}]


In [18]:
# Two more variants of the "loja ..." phrase.
for sample in ('loja legal', 'loja fresco'):
    print(classifier(sample))

[{'label': 'POSITIVE', 'score': 0.9205412268638611}]
[{'label': 'POSITIVE', 'score': 0.9700451493263245}]
