In [1]:
import numpy as np
import torch
import pandas as pd
from datasets import ClassLabel, Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# --- Configuration -----------------------------------------------------------
i = 0                      # index of the pre-computed train/test split to load
test_size = 0.15
num_classes = 2
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
set_class_weights = False
model_version = 'xlm-roberta-large'
seed = 42

# Seed NumPy and PyTorch up front so that the randomly initialised
# classification head created in the next cell is reproducible across runs.
np.random.seed(seed)
torch.manual_seed(seed)

# --- Data --------------------------------------------------------------------
train = pd.read_csv(f'../data_orig_splits/train_split_{i}.csv')
test = pd.read_csv(f'../data_orig_splits/test_split_{i}.csv')

# Additional corpora appended to the training split only; each one contributes
# just its "text" and "label" columns.
external_files = [
    '../fb_Clean_Sub.parquet',
    '../RP-Mod-Crowd_Clean_Sub.parquet',
    '../german_hatespeech_refugees_Clean_Sub.parquet',
]
external_frames = [pd.read_parquet(path)[["text", "label"]] for path in external_files]
data_ext = external_frames[-1]  # kept: earlier versions left the last external frame under this name

# ignore_index=True gives the combined frame a clean 0..n-1 index. Without it
# the per-file indices repeat and Dataset.from_pandas stores them as an extra
# `__index_level_0__` column.
train = pd.concat([train, *external_frames], axis=0, ignore_index=True)

# Wrap both splits as Hugging Face datasets holding only the model inputs.
train = Dataset.from_pandas(train[["text", "label"]])
test = Dataset.from_pandas(pd.DataFrame({'text': test["text"], 'label': test["label"]}))

# Load model and tokenizer

In [2]:
# Tokenizer and sequence-classification model are both created from the
# `model_version` checkpoint; the model is then moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_version)
model = AutoModelForSequenceClassification.from_pretrained(
    model_version,
    num_labels=num_classes,
)
model = model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenize dataset

In [3]:
def tokenize_batch(batch, max_length=128):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode
            (this is what ``Dataset.map(..., batched=True)`` passes in).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer output for ``batch['text']``.
    """
    return tokenizer(
        batch['text'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over both splits, 1000 examples per call.
map_options = dict(batched=True, batch_size=1000)
train_tokenized = train.map(tokenize_batch, **map_options)
test_tokenized = test.map(tokenize_batch, **map_options)

# Expose only the model inputs and the label, returned as torch tensors.
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/30638 [00:00<?, ? examples/s]

Map:   0%|          | 0/968 [00:00<?, ? examples/s]

# Train the model

In [4]:
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with the keys ``"f1"`` and ``"accuracy"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "f1": f1_score(y_true, y_hat),
        "accuracy": accuracy_score(y_true, y_hat),
    }


# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # -- output locations
    output_dir='./results',
    logging_dir='./logs',
    # -- optimisation
    num_train_epochs=2,
    learning_rate=5e-6,
    optim='adamw_torch',
    fp16=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # warmup_steps=500,  # left disabled, as before
    # -- evaluate and checkpoint every 300 steps
    evaluation_strategy='steps',
    eval_steps=300,
    save_strategy='steps',
    save_steps=300,
    save_total_limit=1,
    # -- at the end, reload the checkpoint with the highest evaluation F1
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
)


# Wire the model, the tokenized data and the metric function into a Trainer.
# NOTE(review): the evaluation set is the tokenized *test* split, so any
# checkpoint selection driven by evaluation metrics is tuned on test data.
# Consider holding out a validation split from the training data instead.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)


In [5]:
# Fine-tune the model; the table below logs training loss, validation loss,
# F1 and accuracy at each evaluation step.
trainer.train()

Step,Training Loss,Validation Loss,F1,Accuracy
300,No log,0.65751,0.639076,0.612603
600,0.697900,0.599785,0.642202,0.677686
900,0.697900,0.579673,0.743979,0.703512
1200,0.625000,0.613624,0.736756,0.666322
1500,0.607600,0.56589,0.734109,0.693182
1800,0.607600,0.543381,0.764331,0.732438
2100,0.590600,0.547132,0.763508,0.724174
2400,0.590600,0.539134,0.775435,0.746901
2700,0.581000,0.521236,0.773946,0.756198
3000,0.573500,0.538113,0.774368,0.741736


In [None]:
# Persist the fine-tuned weights and reload them from the *same* file.
# The save and load paths used to differ by one underscore ("_079__128_" vs
# "_079_128_"), so the reload did not read the file that had just been
# written. A single path variable keeps the two in sync. The path that was
# used for saving is kept, so no other existing checkpoint gets overwritten.
checkpoint_path = f"models/{model_version}_079__128_large_ds.pth"
torch.save(model.state_dict(), checkpoint_path)
# map_location lets the checkpoint load on whichever device is available now,
# not only on the device type it was saved from.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))


<All keys matched successfully>

In [14]:
from torch.utils.data import DataLoader

# In-order batches of 8 over the tokenized test split, for manual inference.
eval_dataloader = DataLoader(dataset=test_tokenized, batch_size=8, shuffle=False)

In [43]:

# Checkpoints (state dicts under models/) whose predictions are combined.
models_to_load = ["roberta_large_v1_0779.pth"]

# Running sum of class probabilities over all checkpoints, one row per test
# example. Stays None if `models_to_load` is empty.
all_preds = None

# The loop index is deliberately not called `i`: that name holds the data
# split index set in the configuration cell and must not be overwritten.
for model_idx, model_name in enumerate(models_to_load):
    # map_location makes the checkpoint loadable regardless of the device it
    # was saved from.
    model.load_state_dict(torch.load(f"models/{model_name}", map_location=device))
    model.eval()

    # Collect per-batch probabilities and join them once after the loop
    # (stacking onto a growing array every batch is quadratic).
    batch_probas = []
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
        logits = outputs.logits
        # Per-batch class ids; kept because a later scratch cell refers to `predictions`.
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        y_pred_proba = torch.softmax(logits, dim=1).cpu().numpy()
        batch_probas.append(y_pred_proba)

    # Probabilities of the current checkpoint for the whole evaluation set.
    if batch_probas:
        pred_probas = np.concatenate(batch_probas, axis=0).astype(np.float64)
    else:
        pred_probas = np.empty((0, 2))

    # Accumulate the full probability matrix. Previously only the last
    # batch's `y_pred_proba` was added, and without a copy.
    if all_preds is None:
        all_preds = pred_probas.copy()
    else:
        all_preds += pred_probas

# Predicted class per example from the summed probabilities. The argmax is
# unaffected by dividing by the number of models.
if all_preds is not None:
    preds = all_preds.argmax(axis=1)

In [44]:
# Inspect the class probabilities collected by the inference loop.
# NOTE(review): the recorded output has 8 rows although the test split has
# 968 examples, so it looks stale; re-run to refresh, or show pred_probas[:5].
pred_probas

array([[0.03336831, 0.96663171],
       [0.92703366, 0.0729663 ],
       [0.97104686, 0.02895316],
       [0.03079216, 0.96920782],
       [0.98423058, 0.0157694 ],
       [0.05281403, 0.94718599],
       [0.0298448 , 0.97015518],
       [0.18469398, 0.81530607]])

In [41]:
# NOTE(review): leftover debugging cell. `pred_probasa` is not assigned in any
# visible cell (possibly a typo for `pred_probas`), the recorded output is an
# error, and the result is not stored anywhere. Candidate for deletion.
np.vstack((pred_probasa, y_pred_proba))

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 0 and the array at index 1 has size 2

In [39]:
# Softmax over axis 1 of `logits`.
# NOTE(review): scratch cell. `logits` is only available as a leftover of the
# inference loop (the last batch processed there), and this overwrites
# `y_pred_proba` with a torch tensor rather than a NumPy array.
y_pred_proba = torch.softmax(logits, axis=1)

In [35]:
# F1 of the predicted labels against the test labels.
# NOTE(review): `preds` must hold one predicted class per test example, in
# the same order as `test_tokenized`. Make sure the cell that builds it has
# been run before this one.
y_true = test_tokenized["label"].numpy()
f1_score(y_true, preds)

0.7762191048764197

In [27]:
# Inspect the predicted labels.
# NOTE(review): the recorded output is an empty array from an earlier
# execution (In [27]), so it does not reflect the values scored in the F1 cell.
preds

array([], dtype=float64)

In [26]:
# NOTE(review): leftover debugging cell. np.append returns a new array and the
# result is not assigned, so `preds` is unchanged by this cell. The `.cpu()`
# call needs `predictions` to be a torch tensor, whereas the inference loop
# stores a NumPy array under that name. Candidate for deletion.
np.append(preds, predictions.cpu().numpy(), axis=0)

array([1., 0., 0., 1., 0., 1., 1., 1.])