In [1]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, ElectraTokenizer, ElectraForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from accelerate import Accelerator
import os


In [2]:
model_name = 'howey/electra-base-mnli'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True, classifier_dropout=0.1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)
print(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at howey/electra-base-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cpu


In [15]:
training_args = TrainingArguments(
            output_dir='./results',
            do_eval=True,
            do_train=True,
            num_train_epochs=6,
            save_total_limit=2,
            load_best_model_at_end=True,
            learning_rate=8.5e-05,
            per_device_train_batch_size=12,
            per_device_eval_batch_size=12,
            save_strategy="steps",
            logging_strategy="steps",
            evaluation_strategy="steps",
            logging_steps=50,
            eval_steps=50,
            save_steps=50,
        )

def compute_metrics(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(
                labels, preds, average="weighted"
            )
            acc = accuracy_score(labels, preds)
            return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [69]:
pos = pd.read_csv('data/train_pos.txt', delimiter='\t', header=None)  
pos['label'] = 1
neg = pd.read_csv('data/train_neg.txt', delimiter='\t', header=None)
neg['label'] = -1

train_df = pd.concat([pos, neg]).sample(frac=1, random_state=42)
dev_df = train_df.sample(frac=0.2, random_state=42)
train_df = train_df.drop(dev_df.index)
test_df = pd.read_csv('data/test_data.txt', delimiter='\t', header=None)

In [74]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
dev_df['label'] = label_encoder.transform(dev_df['label'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    #'test': Dataset.from_pandas(test_df)
})

  if _pandas_api.is_sparse(col):
  return cls(pa.Table.from_pandas(*args, **kwargs))


In [75]:
def process(batch):
    inputs = tokenizer(batch["0"], truncation=True, padding="max_length")
    return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": batch["label"],
        }
    
tokenized_dataset = dataset.map(process, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/126047 [00:00<?, ? examples/s]

Map:   0%|          | 0/39394 [00:00<?, ? examples/s]

In [78]:
trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["validation"],
            tokenizer=tokenizer,
        )
#trainer.train()
#trainer.save_model("models/electra_classifier")


In [81]:
def inference(text, classes):
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return -1 if predicted_class == 0 else 1

inference("you are a good person", label_encoder.classes_)

1