Fine-tuning the model: https://huggingface.co/docs/transformers/training

testing model: https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

In [1]:
from utils.loader import DataLoader
import datasets


In [55]:
# Load the Amazon reviews: first the training split, then the test split.
# Both calls use the same filter flags and differ only in `test_mode`.
loader = DataLoader()
train_data, test_data = (
    loader.load_amazon(deceptive=False, all=True, test_mode=is_test)
    for is_test in (False, True)
)

In [56]:
# Metadata columns that are not needed for fine-tuning; defined once and
# shared by both splits.
remove_columns = ['DOC_ID', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY', 'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE']

# Convert each pandas frame to a Hugging Face Dataset and drop all unused
# columns in a single call (`remove_columns` accepts a list of names) rather
# than creating one intermediate Dataset per removed column.
df_train = datasets.Dataset.from_pandas(train_data).remove_columns(remove_columns)
df_test = datasets.Dataset.from_pandas(test_data).remove_columns(remove_columns)


In [57]:
# Sanity check: show the remaining features and row counts of both splits.
print(df_train, df_test)

Dataset({
    features: ['LABEL', 'REVIEW_TEXT'],
    num_rows: 15750
}) Dataset({
    features: ['LABEL', 'REVIEW_TEXT'],
    num_rows: 5250
})


In [59]:
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint (the same checkpoint name is
# used when the classification model is loaded).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``REVIEW_TEXT`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"REVIEW_TEXT"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer``, called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    review_texts = examples["REVIEW_TEXT"]
    return tokenizer(review_texts, padding="max_length", truncation=True)

# Tokenize both splits in batches.
tokenized_train = df_train.map(tokenize_function, batched=True)
tokenized_test = df_test.map(tokenize_function, batched=True)

# Trainer only forwards columns whose names match the model's forward()
# arguments ("labels", plus "label"/"label_ids"); a column called "LABEL" is
# silently dropped and training then fails because no loss is returned.
# Rename it so the targets actually reach the model.
# NOTE(review): assumes LABEL already holds integer class ids (0/1) — confirm
# against the loader.
tokenized_train = tokenized_train.rename_column("LABEL", "labels")
tokenized_test = tokenized_test.rename_column("LABEL", "labels")

100%|██████████| 16/16 [00:02<00:00,  6.07ba/s]
100%|██████████| 6/6 [00:00<00:00,  7.33ba/s]


In [60]:
from transformers import AutoModelForSequenceClassification

# Load the "bert-base-cased" checkpoint with a sequence-classification head
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading: 100%|██████████| 416M/416M [00:16<00:00, 26.8MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceC

In [61]:
from transformers import TrainingArguments

# Initial training configuration; only the output directory is set here.
# NOTE: `training_args` is rebound in a later cell that also sets
# evaluation_strategy, so this value is not the one passed to the Trainer.
training_args = TrainingArguments(output_dir="test_trainer")

In [63]:
import numpy as np
from datasets import load_metric

# Accuracy metric object used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("accuracy")

In [64]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class
            ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        arg-max predictions that equal the reference labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so this function does not
    # depend on a metric object created in another cell (or on the deprecated
    # `datasets.load_metric` API); the returned mapping keeps the same
    # "accuracy" key.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [69]:
from transformers import TrainingArguments, Trainer

# Training configuration passed to the Trainer: outputs go to "test_trainer"
# and the evaluation strategy is set to "epoch".
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [70]:
# Assemble the Trainer from the pieces built in the previous cells.
trainer = Trainer(
    model=model,  # BERT sequence-classification model loaded above
    args=training_args,
    train_dataset=tokenized_train,
    # NOTE: the test split doubles as the evaluation set here; there is no
    # separate validation split in this notebook.
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,  # called with (logits, labels)
)

In [None]:
# Start fine-tuning with the configuration above.
trainer.train()