In [21]:
!pip install transformers==4.17

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting sacremoses (from transformers==4.17)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.32.1
    Uninstalling transformers-4.32.1:
      Successfully uninstalled transformers-4.32.1
Successfully installed sacremoses-0.1.1 transformers-4.17.0


In [13]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, ElectraTokenizer, ElectraForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from accelerate import Accelerator
import os


In [14]:
model_name = 'vinai/bertweet-base'

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)
print(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cpu


In [16]:
training_args = TrainingArguments(
            output_dir='./results',
            do_eval=True,
            do_train=True,
            num_train_epochs=4,
            save_total_limit=4,
            load_best_model_at_end=True,
            learning_rate=1e-04,
            per_device_train_batch_size=64,
            per_device_eval_batch_size=64,
            save_strategy="steps",
            logging_strategy="steps",
            evaluation_strategy="steps",
            logging_steps=1000,
            eval_steps=1000,
            save_steps=1000,
            optim="adamw_hf",
        )

def compute_metrics(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(
                labels, preds, average="weighted"
            )
            acc = accuracy_score(labels, preds)
            return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [17]:
pos = pd.read_csv('data/train_pos.txt', delimiter='\t', header=None).sample(11000, random_state=4)
pos['label'] = 1
neg = pd.read_csv('data/train_neg.txt', delimiter='\t', header=None).sample(11000, random_state=4)
neg['label'] = -1

train_test_split = 0.8

full_df = pd.concat([pos, neg]).sample(frac=1, random_state=42)
train_df = full_df[:int(len(full_df)*train_test_split)]
dev_df = train_df.sample(frac=0.2, random_state=42).rename(columns={0: 'tweet'})
train_df = train_df.drop(dev_df.index).rename(columns={0: 'tweet'})
test_df = full_df[int(len(full_df)*train_test_split):].rename(columns={0: 'tweet'})
# test_df = pd.read_csv('data/test_data.txt', delimiter='\t', header=None).rename(columns={0: 'tweet'})
validation_df = test_df.copy()
test_df['label'] = -2

In [18]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
dev_df['label'] = label_encoder.transform(dev_df['label'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    #'test': Dataset.from_pandas(test_df)
})

In [19]:
def process(batch):
    inputs = tokenizer(batch["tweet"], truncation=True, padding="max_length")
    return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": batch["label"],
        }
    
tokenized_dataset = dataset.map(process, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/13328 [00:00<?, ? examples/s]

Map:   0%|          | 0/3520 [00:00<?, ? examples/s]

In [20]:
trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["validation"],
            tokenizer=tokenizer,
        )
trainer.train()
#trainer.save_model("models/electra_classifier")




  0%|          | 0/836 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def inference(text, classes):
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return -1 if predicted_class == 0 else 1

inference("you are a good person", label_encoder.classes_)

1