In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
from transformers import BertConfig
import os


def load_llm(models_dir: str, model_name: str, tokenizer_name: str,
             model_hub: str):
    full_model_path = os.path.join(models_dir, model_name)
    full_tokenizer_path = os.path.join(models_dir, tokenizer_name)

    if os.path.isdir(full_model_path):
        print("model is already saved, so load it locally from disk")
        model = AutoModelForSequenceClassification.from_pretrained(full_model_path,
                                                                   torchscript=True)
    else:
        config = BertConfig.from_pretrained(model_hub)
        config.num_labels = 2
        config.return_dict = True
        config.torchscript=True

        model = AutoModelForSequenceClassification.from_pretrained(model_hub,
                                                                   config=config,
                                                                   ignore_mismatched_sizes=True)
        for param in model.base_model.parameters():
            param.requires_grad = False
        
        model.save_pretrained(full_model_path)


    if os.path.isdir(full_tokenizer_path):
        print("tokenizer is already saved, so load it locally from disk")
        tokenizer = BertTokenizerFast.from_pretrained(full_tokenizer_path)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_hub)
        tokenizer.save_pretrained(full_tokenizer_path)

    return model, tokenizer

In [7]:
import pandas as pd
import numpy as np
import torch
import os
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import evaluate
from datasets.arrow_dataset import Dataset

##### 1. Prepare model

In [8]:
models_dir = "new_models"

In [9]:
model, tokenizer = load_llm(
    models_dir, "model", "tokenizer", "blanchefort/rubert-base-cased-sentiment"
)

model is already saved, so load it locally from disk
tokenizer is already saved, so load it locally from disk


##### 2. Prepare dataset

In [10]:
dataset_dir = "data"
dataset_name = "main_dataset.csv"

In [13]:
dataset: pd.DataFrame = pd.read_csv(os.path.join(dataset_dir, dataset_name))

In [14]:
dataset['toxic'] = dataset['toxic'].astype(np.int32)
dataset = dataset.rename(columns={"toxic": "labels"})
dataset = dataset.drop_duplicates(subset=["comment"])

In [15]:
train_df, test_df = train_test_split(dataset, random_state=42, test_size=0.2)
test_df, val_df = train_test_split(test_df, random_state=42, test_size=0.5)

In [16]:
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
validation_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
def check_tokenization(samples):
    return tokenizer(
        samples["comment"], padding=False, truncation=False, return_tensors="np"
    )


train_dataset_check_tokenized = train_dataset.map(check_tokenization, batched=True)
validation_dataset_check_tokenized = validation_dataset.map(
    check_tokenization, batched=True
)
test_dataset_check_tokenized = test_dataset.map(check_tokenization, batched=True)

In [18]:
train_dataset_check_tokenized_df = train_dataset_check_tokenized.to_pandas()
validation_dataset_check_tokenized_df = validation_dataset_check_tokenized.to_pandas()
test_dataset_check_tokenized_df = test_dataset_check_tokenized.to_pandas()

In [19]:
columns_to_drop = ["input_ids", "token_type_ids", "attention_mask"]
threshold = 512
train_dataset = Dataset.from_pandas(
    train_dataset_check_tokenized_df[
        train_dataset_check_tokenized_df["input_ids"].apply(
            lambda x: len(x) < threshold
        )
    ], preserve_index=False
).remove_columns(columns_to_drop)

validation_dataset = Dataset.from_pandas(
    validation_dataset_check_tokenized_df[
        validation_dataset_check_tokenized_df["input_ids"].apply(
            lambda x: len(x) < threshold
        )
    ],
    preserve_index=False,
).remove_columns(columns_to_drop)

test_dataset = Dataset.from_pandas(
    test_dataset_check_tokenized_df[
        test_dataset_check_tokenized_df["input_ids"].apply(lambda x: len(x) < threshold)
    ],
    preserve_index=False,
).remove_columns(columns_to_drop)

In [None]:
def tokenize(samples):
    return tokenizer(
        samples["comment"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )


train_dataset_tokenized = train_dataset.map(tokenize, batched=True)
validation_dataset_tokenized = validation_dataset.map(tokenize, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize, batched=True)

In [21]:
train_dataset_tokenized = train_dataset_tokenized.remove_columns(["comment"])
validation_dataset_tokenized = validation_dataset_tokenized.remove_columns(["comment"])
test_dataset_tokenized = test_dataset_tokenized.remove_columns(["comment"])

##### 3. Finetune model

In [22]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [23]:
train_dataset_tokenized.set_format(type='torch',
                                   columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataset_tokenized.set_format(type='torch',
                                  columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
validation_dataset_tokenized.set_format(type='torch',
                                        columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [24]:
batch_size = 64
train_dataloader = DataLoader(train_dataset_tokenized, shuffle=True, batch_size=batch_size)
validation_dataloader = DataLoader(validation_dataset_tokenized, shuffle=False, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset_tokenized, shuffle=False, batch_size=batch_size)

In [25]:
metric_names = ["loss", "accuracy", "recall", "precision"]

accuracy = evaluate.load('accuracy')
recall = evaluate.load('recall')
precision = evaluate.load('precision')

In [26]:
reduction = 'mean'
cross_entropy = torch.nn.CrossEntropyLoss(reduction=reduction)


def calc_metrics(model, device: str, data_loader, metrics_dict: dict, enable_progress_bar: bool = True):
    model.eval()
    progress_bar = tqdm(range(len(data_loader))) if enable_progress_bar else None

    for batch in data_loader:
        with torch.no_grad():
            batch["labels"] = batch["labels"].to(device)
            batch["input_ids"] = batch["input_ids"].to(device)
            batch["token_type_ids"] = batch["token_type_ids"].to(device)
            batch["attention_mask"] = batch["attention_mask"].to(device)
            outputs = model(**batch, return_dict=True)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(predictions, dim=-1)

            accuracy.add_batch(predictions=predictions, references=batch["labels"])
            recall.add_batch(predictions=predictions, references=batch["labels"])
            precision.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar.update(1)
    metrics_dict['accuracy'].append([accuracy.compute()['accuracy']])
    metrics_dict['recall'].append(recall.compute(average=None, zero_division=1)['recall'])
    metrics_dict['precision'].append(precision.compute(average=None, zero_division=1)['precision'])


def print_metrics(metrics:dict, mode:str, epoch: int):
    if epoch > 0:
        print(f"epoch {epoch}:")
    print(f"\t{mode}:")

    for metric, values in metrics.items():
        formatted_values = ", ".join(f"{round(value, 5)}" for value in values[-1])
        print(f"\t\t{metric.capitalize()}: [{formatted_values}]")


def train_loop(model, device: str, data_loader, optimizer, train_loss, metrics_dict: dict,
               progress_bar: tqdm=None, max_norm: float=1.0):
    model.train(True)  
    epoch_loss = 0
    num_batches = len(data_loader)

    for batch in data_loader:
        optimizer.zero_grad()
        batch["labels"] = batch["labels"].to(device)
        batch["input_ids"] = batch["input_ids"].to(device)
        batch["token_type_ids"] = batch["token_type_ids"].to(device)
        batch["attention_mask"] = batch["attention_mask"].to(device)

        outputs = model(**batch, return_dict=True)  
        loss = train_loss(outputs.logits, batch['labels'])
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
        optimizer.step()

        if progress_bar is not None:
            progress_bar.update(1)
        epoch_loss += loss.item()

    calc_metrics(model, device, data_loader, metrics_dict)
    metrics_dict['loss'].append([epoch_loss / num_batches]) 

    return metrics_dict


def train(model, epoch_count: int, train_data_loader,
          validation_dataloader, device: str, train_loss,
          learning_rate: float, enable_progress_bar: bool=False):
    model.to(device)
    num_training_steps = epoch_count * len(train_data_loader)
    progress_bar = tqdm(range(num_training_steps)) if enable_progress_bar else None

    train_metrics_dict = {metric: list() for metric in metric_names}
    validation_metrics_dict = {metric: list() for metric in metric_names}
    validation_metrics_dict.pop('loss')

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epoch_count):
        train_loop(model, device, train_data_loader, optimizer, train_loss, train_metrics_dict, progress_bar)
        calc_metrics(model, device, validation_dataloader, validation_metrics_dict)
        print_metrics(train_metrics_dict, "train", epoch + 1)
        print_metrics(validation_metrics_dict, "validation", -1)

    return train_metrics_dict, validation_metrics_dict

In [None]:
epoch_count = 10
learning_rate = 1e-6
device = 'cuda'
train_metrics, validation_metrics = train(model, epoch_count,
                                          train_dataloader, validation_dataloader,
                                          device, cross_entropy, learning_rate, True)

In [None]:
model.to('cpu')
model.save_pretrained(models_dir + "tone-model-fine-tune-acc_" + "final_model")

##### 4. Test model

In [None]:
model_name = "tone-model-fine-tune-acc_final_model"
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, model_name), torchscript=True)
tokenizer = BertTokenizerFast.from_pretrained(os.path.join(models_dir, "tokenizer"))

In [28]:
test_dataset_metrics_dict = {metric: list() for metric in metric_names}
test_dataset_metrics_dict.pop('loss')
fine_tuned_model.to(device)
calc_metrics(fine_tuned_model, test_dataloader, test_dataset_metrics_dict)
print_metrics(test_dataset_metrics_dict, 'test', -1)

100%|██████████| 54/54 [03:27<00:00,  3.84s/it]

	test:
		Accuracy: [0.71076]
		Recall: [0.63451, 0.81161]
		Precision: [0.81669, 0.62669]



