In [None]:
from datasets import load_dataset
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments, 
                          Trainer)
import numpy as np
import evaluate
import logging
from typing import Union
import os

# for updating an external results .csv file
import pandas as pd
from sklearn.metrics import confusion_matrix

In [None]:
"""
Runs fine-tuning once. Preprocesses data, then creates and trains a Trainer object (be sure to specify TrainingArguments).
Returns a trained model, based on TrainingArguments. Metrics are output to an external `results.csv` in the same directory.

ARGS:
    - trainfile (str): the .csv file containing the training data
    - testfile (str): the .csv file containing the training data
    - output_dir (str): the output directory where saved models will be output, if saving is enabled
    - MODEL_NAME (str): name of the HuggingFace model (https://huggingface.co/) to be loaded and fine-tuned
    - TRAINING_ARGS (TrainingArguments): set of parameters for training
    - logger (Union[logging.Logger, None]), optional: a Logger object which will log certain steps
    - seed (int): random seed 

RETURNS: 
    - model (AutoModelForSequenceClassification): the trained model, based on training_args
"""
def run_trainer(trainfile: str, testfile: str, output_dir: str, 
                MODEL_NAME: str, TRAINING_ARGS: TrainingArguments,
                logger: Union[logging.Logger, None], 
                seed: int = 42) -> AutoModelForSequenceClassification:
    
    log = logging.getLogger(__name__) if logger is None else logger
    
    # sanity checks
    for i in [trainfile, testfile, output_dir]:
        assert os.path.exists(i), f"File/Directory {i} does not exist"
    
    # log.info('loading dataset...')
    dataset = load_dataset("csv", data_files={"train": trainfile, 
                                              "test": testfile})

    # define tokenizer and tokenize datasets
    # log.info('loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'), max_length=512)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # log.info('tokenizing datasets...')
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # load training arguments
    training_args = TRAINING_ARGS
    
    # load model
    # log.info('loading model...')
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    # define evaluation metrics
    metric_f1 = evaluate.load("f1")
    metric_pr = evaluate.load("precision")
    metric_re= evaluate.load("recall")
    # metric_acc = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        
        f1 = metric_f1.compute(predictions=predictions, 
                               references=labels, 
                               average='macro')
        
        recall = metric_re.compute(predictions=predictions, 
                                   references=labels)
        
        precision = metric_pr.compute(predictions=predictions, 
                                      references=labels)
        
        # ***** update an external results .csv file ***** # 
        tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
        if ('results.csv' not in os.listdir()):
            df = pd.DataFrame(columns=['f1', 'precision', 'recall', 'tn', 'fp', 'fn', 'tp', 'preds'])
        else:
            df = pd.read_csv('results.csv', index_col=0)
        df.loc[len(df.index)] = [f1['f1'], precision['precision'], recall['recall'], tn, fp, fn, tp, predictions]
        df.to_csv('results.csv') 
        # ***** update an external results .csv file ***** # 

        return f1
    
    # define test and train splits
    train_dataset = tokenized_datasets["train"]
    test_dataset = tokenized_datasets["test"]

    # creates and trains a Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # save model
    trainer.save_model(output_dir)
    
    return model

In [None]:
# define training args and model

# `output_dir` was referenced below without ever being defined, which raises NameError on a
# fresh kernel; it is the same directory the fine-tuning loop saves models into.
output_dir = 'output'

# Evaluate and log once per epoch; no intermediate checkpoints are written.
# To keep only the best checkpoint instead, use save_strategy='epoch', save_total_limit=1,
# load_best_model_at_end=True and metric_for_best_model='f1'.
training_args = TrainingArguments(output_dir=output_dir, 
                                  evaluation_strategy="epoch", 
                                  num_train_epochs=10, 
                                  learning_rate=1e-5, 
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  logging_strategy='epoch',
                                  save_strategy='no',
                                 )

# NOTE(review): confirm this resolves to a real Hub id or local path -- the published multilingual
# BERT checkpoints appear to be named 'bert-base-multilingual-cased' / '-uncased'.
model_name = 'bert-base-multilingual'

In [None]:
# fine-tune on separate train-test files: train_<i>.csv is paired with test_<i>.csv
trainfile_prefix = 'train_'
testfile_prefix = 'test_'

num_tests = 5

# run_trainer checks that the output directory exists, so create it up front
model_output_dir = 'output'
os.makedirs(model_output_dir, exist_ok=True)

for x in range(num_tests):
    train_path = f'{trainfile_prefix}{x}.csv'
    test_path = f'{testfile_prefix}{x}.csv'

    # NOTE: every run saves into the same directory, so only the last run's model stays on disk
    # and `model` holds the last trained model after the loop.
    model = run_trainer(train_path, 
                        test_path, 
                        model_output_dir,
                        model_name,      # MODEL_NAME: checkpoint to fine-tune
                        training_args,   # TRAINING_ARGS: shared across all runs
                        None             # logger: fall back to the module-level logger
                       )