In [31]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, ElectraTokenizer, ElectraForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from accelerate import Accelerator
import os


In [32]:
model_name = 'vinai/bertweet-base'

In [33]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)
print(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSe

cuda


In [39]:
training_args = TrainingArguments(
            output_dir='./results',
            do_eval=True,
            do_train=True,
            num_train_epochs=4,
            save_total_limit=4,
            load_best_model_at_end=True,
            learning_rate=1e-04,
            per_device_train_batch_size=64,
            per_device_eval_batch_size=64,
            save_strategy="steps",
            logging_strategy="steps",
            evaluation_strategy="steps",
            logging_steps=1000,
            eval_steps=1000,
            save_steps=1000,
            optim="adamw_hf",
        )

def compute_metrics(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(
                labels, preds, average="weighted"
            )
            acc = accuracy_score(labels, preds)
            return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [35]:
pos = pd.read_csv('data/train_pos_full.txt', delimiter='\t', header=None).sample(200000, random_state=2)
pos['label'] = 1
neg = pd.read_csv('data/train_neg_full.txt', delimiter='\t', header=None).sample(200000, random_state=2)
neg['label'] = -1

train_df = pd.concat([pos, neg]).sample(frac=1, random_state=42)
dev_df = train_df.sample(frac=0.2, random_state=42)
train_df = train_df.drop(dev_df.index)
test_df = pd.read_csv('data/test_data.txt', delimiter='\t', header=None)

In [36]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
dev_df['label'] = label_encoder.transform(dev_df['label'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    #'test': Dataset.from_pandas(test_df)
})

  return cls(pa.Table.from_pandas(*args, **kwargs))


In [37]:
def process(batch):
    inputs = tokenizer(batch["0"], truncation=True, padding="max_length")
    return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": batch["label"],
        }
    
tokenized_dataset = dataset.map(process, batched=True, remove_columns=dataset["train"].column_names)

                                                                     

In [40]:
trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["validation"],
            tokenizer=tokenizer,
        )
trainer.train()
#trainer.save_model("models/electra_classifier")


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1000,0.2871,0.268401,0.887112,0.886954,0.88908,0.887112
2000,0.2641,0.24478,0.89725,0.897189,0.898351,0.89725
3000,0.2119,0.247026,0.9003,0.900299,0.900341,0.9003
4000,0.1935,0.253305,0.899438,0.899356,0.900924,0.899438
5000,0.1629,0.313858,0.90085,0.900849,0.900853,0.90085




In [None]:
def inference(text, classes):
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return -1 if predicted_class == 0 else 1

inference("you are a good person", label_encoder.classes_)

1