fine-tuning model: https://huggingface.co/docs/transformers/training

testing model: https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

In [58]:
from utils.loader import DataLoader
import datasets
import random


In [45]:
# Load datasets
# Both splits come from the project-local DataLoader (utils.loader); the two
# calls differ only in `test_mode`.
# NOTE(review): later cells index the results like pandas DataFrames
# (e.g. train_data['PRODUCT_CATEGORY'].unique()) — confirm the return type.
loader = DataLoader()
train_data = loader.load_amazon(deceptive=False, all=True, test_mode=False)
test_data = loader.load_amazon(deceptive=False, all=True, test_mode=True)

In [71]:
# Choose 3 categories to train with.
# The indices are drawn over however many categories the data actually
# contains instead of a hard-coded 30, so the draw neither raises IndexError
# when there are fewer categories nor ignores the extra ones when there are
# more. The fixed seed keeps the selection reproducible.
unique_category = train_data['PRODUCT_CATEGORY'].unique()
n_categories = len(unique_category)
random.seed(22)
print(unique_category[random.sample(range(n_categories), min(3, n_categories))])


['Beauty' 'Office Products' 'Grocery']


In [84]:
def df_2_dataset(panda_data, VERIFIED_PURCHASE, PRODUCT_CATEGORY=False):
    """Filter a review DataFrame and convert it to a `datasets.Dataset`.

    Args:
        panda_data: DataFrame holding at least the 'VERIFIED_PURCHASE',
            'LABEL' and 'REVIEW_TEXT' columns (and 'PRODUCT_CATEGORY' when a
            category filter is given).
        VERIFIED_PURCHASE: collection of 'VERIFIED_PURCHASE' values to keep,
            e.g. ['Y'].
        PRODUCT_CATEGORY: collection of 'PRODUCT_CATEGORY' values to keep.
            False (the default) or None keeps every category.

    Returns:
        The filtered rows as a Dataset in which the metadata columns listed
        below are dropped, 'LABEL' is renamed to 'labels' and 'REVIEW_TEXT'
        is renamed to 'text'.
    """
    # Identity checks rather than `!= False`: comparing an array-like filter
    # with `!=` yields an element-wise result whose truth value is ambiguous,
    # and None would otherwise be passed straight to `isin`.
    if PRODUCT_CATEGORY is not False and PRODUCT_CATEGORY is not None:
        panda_data = panda_data[panda_data['PRODUCT_CATEGORY'].isin(PRODUCT_CATEGORY)]

    panda_data = panda_data[panda_data['VERIFIED_PURCHASE'].isin(VERIFIED_PURCHASE)]

    dataset_data = datasets.Dataset.from_pandas(panda_data)

    unwanted_columns = (
        'DOC_ID',
        'RATING',
        'VERIFIED_PURCHASE',
        'PRODUCT_CATEGORY',
        'PRODUCT_ID',
        'PRODUCT_TITLE',
        'REVIEW_TITLE',
        # Only present when from_pandas had to materialise a non-default index.
        '__index_level_0__',
    )
    # Drop only the columns that actually exist, in a single call, so a frame
    # without (for example) the synthetic index column does not raise.
    existing = set(dataset_data.column_names)
    to_drop = [col for col in unwanted_columns if col in existing]
    if to_drop:
        dataset_data = dataset_data.remove_columns(to_drop)

    dataset_data = dataset_data.rename_column('LABEL', 'labels')
    dataset_data = dataset_data.rename_column('REVIEW_TEXT', 'text')

    return dataset_data

# Filters shared by the splits below: verified purchases only, and a single
# product category for the in-category train/test sets.
verified = ['Y']
category = ['Grocery']

# In-category train and test sets.
df_train = df_2_dataset(train_data, VERIFIED_PURCHASE=verified, PRODUCT_CATEGORY=category)
df_test = df_2_dataset(test_data, VERIFIED_PURCHASE=verified, PRODUCT_CATEGORY=category)

# Test set across every category (no category filter).
df_full_test = df_2_dataset(test_data, VERIFIED_PURCHASE=verified, PRODUCT_CATEGORY=False)



In [85]:
# Print each split's summary (feature names and row count).
print(df_train, df_test, df_full_test)

Dataset({
    features: ['labels', 'text'],
    num_rows: 233
}) Dataset({
    features: ['labels', 'text'],
    num_rows: 78
}) Dataset({
    features: ['labels', 'text'],
    num_rows: 2866
})


In [75]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the 'text' field of `examples` with the notebook's tokenizer.

    The tokenizer is called with the "max_length" padding strategy and with
    truncation enabled; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# train and val sets
tokenized_train = df_train.map(tokenize_function, batched=True)
# Hold out 10% for validation. The split is seeded so the train/validation
# partition is identical on every run, matching the seeded sampling and
# shuffling used elsewhere in this notebook.
tokenized_train = tokenized_train.train_test_split(test_size=0.1, seed=42)

# test set
tokenized_test = df_test.map(tokenize_function, batched=True)
tokenized_full_test = df_full_test.map(tokenize_function, batched=True)

100%|██████████| 1/1 [00:00<00:00,  4.78ba/s]
100%|██████████| 1/1 [00:00<00:00, 24.88ba/s]
100%|██████████| 6/6 [00:00<00:00,  7.05ba/s]


In [7]:
from transformers import AutoModelForSequenceClassification

# Load "bert-base-cased" for sequence classification with 2 output labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [15]:
from transformers import TrainingArguments

# Training arguments with only the output directory set.
# NOTE(review): `training_args` is reassigned with explicit settings in a later
# cell, so this assignment looks redundant — confirm and consider removing.
training_args = TrainingArguments(output_dir="test_trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [16]:
import numpy as np
from datasets import load_metric

# Accuracy metric, read as a global by compute_metrics.
# NOTE(review): later cells rebind `metric` to the GLUE/MRPC metric; any
# compute_metrics call made after that would use the new binding — confirm
# this is intended.
metric = load_metric("accuracy")

In [17]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the notebook-level `metric`.

    The predicted class for each row is the argmax over the last axis of the
    logits; the result of `metric.compute` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [18]:
from transformers import TrainingArguments, Trainer

# Train for 5 epochs, evaluating and checkpointing once per epoch. Only one
# checkpoint is retained on disk (save_total_limit), and the best model is
# reloaded when training ends; the evaluation and save strategies are kept
# identical for that reason.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [76]:
# reduce data size for debugging
# with 100 training data we get ~60%
testing = False


def _debug_subset(dataset, max_rows, seed=42):
    """Return a seeded shuffle of `dataset` cut to at most `max_rows` rows.

    The row count is capped at the dataset's real length so that small splits
    (e.g. a 10% validation hold-out) do not make `select` fail on
    out-of-range indices.
    """
    return dataset.shuffle(seed=seed).select(range(min(max_rows, len(dataset))))


if testing:
    train_dataset = _debug_subset(tokenized_train['train'], 200)
    val_dataset = _debug_subset(tokenized_train['test'], 100)
    test_dataset = _debug_subset(tokenized_test, 200)
    test_full_dataset = _debug_subset(tokenized_full_test, 200)
else:
    train_dataset = tokenized_train['train']
    val_dataset = tokenized_train['test']
    test_dataset = tokenized_test
    test_full_dataset = tokenized_full_test

# Report the datasets that will actually be used, including the (possibly
# reduced) full test set rather than its unreduced source.
print(train_dataset, val_dataset, test_dataset, test_full_dataset)

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 480
}) Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 54
}) Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 166
}) Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 5250
})


In [20]:
# Build the Trainer from the model, the training arguments, the train and
# validation datasets, and the compute_metrics callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
from datasets import load_metric
# Rebind the global `metric` to the GLUE "mrpc" metric for test-set scoring.
# NOTE(review): compute_metrics reads this same global, so Trainer calls made
# after this cell presumably report this metric too — confirm intended.
metric = load_metric("glue", "mrpc")

In [None]:
# Score the trained model on the in-category test set and on the full test set.
for d in [test_dataset, test_full_dataset]:
    predictions = trainer.predict(d)
    print(predictions.predictions.shape, predictions.label_ids.shape)

    # Predicted class = argmax over the last axis of the model outputs.
    preds = np.argmax(predictions.predictions, axis=-1)
    # Print the score explicitly: a bare expression inside a loop body is not
    # echoed by the notebook, so the result was previously computed and lost.
    print(metric.compute(predictions=preds, references=predictions.label_ids))

In [None]:
# Save the trained model to a fixed directory.
# NOTE(review): the path looks like a Google Drive mount inside Colab; it
# presumably has to be mounted before this cell runs — confirm for other
# environments.
path = '/content/drive/MyDrive/Colab Notebooks/2022-03 SNLP group project/tuned_classifier'
trainer.save_model(path)

In [None]:
# Reload the saved model from `path` and run it on the test set through a
# fresh Trainer that is given only the model.
new_model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

test_trainer = Trainer(new_model)

ppp = test_trainer.predict(test_dataset)
print(ppp.predictions.shape, ppp.label_ids.shape)

# Predicted class = argmax over the last axis of the model outputs.
# NOTE(review): `pd` is the conventional alias for pandas and would shadow it
# if pandas were imported that way — consider renaming together with its use
# in the following cell.
pd = np.argmax(ppp.predictions, axis=-1)

In [None]:
# Score the reloaded model's predictions with the GLUE "mrpc" metric; as the
# last expression of the cell, the result is displayed by the notebook.
metric = load_metric("glue", "mrpc")
metric.compute(predictions=pd, references=ppp.label_ids)