In [1]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, ElectraTokenizer, ElectraForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from accelerate import Accelerator
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'vinai/bertweet-base'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)
print(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [4]:
training_args = TrainingArguments(
            output_dir='./results',
            do_eval=True,
            do_train=True,
            num_train_epochs=3,
            save_total_limit=3,
            load_best_model_at_end=True,
            learning_rate=1e-04,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            save_strategy="steps",
            logging_strategy="steps",
            evaluation_strategy="steps",
            logging_steps=1000,
            eval_steps=1000,
            save_steps=1000,
            optim="adamw_hf",
        )

def compute_metrics(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(
                labels, preds, average="weighted"
            )
            acc = accuracy_score(labels, preds)
            return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [5]:
pos = pd.read_csv('data/train_pos_full.txt', delimiter='\t', header=None).sample(400000, random_state=2)
pos['label'] = 1
neg = pd.read_csv('data/train_neg_full.txt', delimiter='\t', header=None).sample(400000, random_state=2)
neg['label'] = 0

train_df = pd.concat([pos, neg]).sample(frac=1, random_state=42)
dev_df = train_df.sample(frac=0.2, random_state=42)
train_df = train_df.drop(dev_df.index)
test_df = pd.read_csv('data/test_data.txt', delimiter='\t', header=None)
test_df['label'] = -100

In [6]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
dev_df['label'] = label_encoder.transform(dev_df['label'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)
})

  return cls(pa.Table.from_pandas(*args, **kwargs))


In [7]:
def process(batch):
    inputs = tokenizer(batch["0"], truncation=True, padding="max_length")
    return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": batch["label"],
        }
    
tokenized_dataset = dataset.map(process, batched=True)

                                                                     

In [8]:
trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["validation"],
            tokenizer=tokenizer,
        )
trainer.train()
trainer.save_model("models/classifier")


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluca-mouchel[0m ([33mlia_epfl[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1000,0.323,0.290432,0.879238,0.879209,0.879576,0.879238
2000,0.293,0.290569,0.884206,0.88412,0.88539,0.884206
3000,0.2846,0.274878,0.885581,0.88558,0.885598,0.885581
4000,0.2751,0.260371,0.888387,0.888383,0.888447,0.888387
5000,0.2769,0.297346,0.889463,0.88944,0.889767,0.889463
6000,0.2737,0.262554,0.888581,0.888548,0.889029,0.888581
7000,0.2691,0.296231,0.888369,0.888132,0.891744,0.888369
8000,0.2627,0.255915,0.892537,0.892469,0.893569,0.892537
9000,0.2461,0.268279,0.894269,0.894265,0.894325,0.894269
10000,0.2139,0.257937,0.893775,0.893768,0.893868,0.893775




In [12]:
a = trainer.predict(tokenized_dataset["test"])




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
preds = torch.argmax(torch.softmax(torch.tensor(a.predictions), dim=1), dim=1).cpu().numpy()
labels = [-1 if label == 0 else label for label in preds]  
ids = [i for i in range(1, len(labels)+1)]
print("Saving at: ", f'submit.csv')
pd.DataFrame(zip(ids, labels)).to_csv(f'submit.csv', index=False, header=['Id', 'Prediction'])


Saving at:  submit.csv


In [11]:
def inference(text, classes):
    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return -1 if predicted_class == 0 else 1

inference("you are a good person", label_encoder.classes_)


1