In [1]:
import mlflow
from mlflow.models import infer_signature

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, TrainingArguments, Trainer, DataCollatorWithPadding

### Define functions

In [2]:
def set_mlflow_experiment(tracking_url, experiment_name):
    """Point MLflow at a tracking server and activate an experiment.

    Args:
        tracking_url: URI of the MLflow tracking server.
        experiment_name: Name of the experiment to make active.

    Returns:
        The value returned by ``mlflow.set_experiment`` for the activated experiment.
    """
    # The tracking URI is configured first, then the experiment is activated against it
    mlflow.set_tracking_uri(tracking_url)
    active_experiment = mlflow.set_experiment(experiment_name)

    return active_experiment

In [3]:
def calculate_metrics(y_pred, y_test):
    """Compute binary classification metrics for labels 0 (negative) and 1 (positive).

    Args:
        y_pred: Predicted class labels.
        y_test: True class labels (passed to scikit-learn as ``y_true``).

    Returns:
        A 12-tuple, in this order: accuracy, accuracy_balanced,
        precision_neg, recall_neg, f1_neg,
        precision_pos, recall_pos, f1_pos,
        tp, fp, tn, fn.
    """
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_balanced = balanced_accuracy_score(y_test, y_pred)

    # Calculate metrics for negative class
    precision_neg = precision_score(y_test, y_pred, pos_label = 0)
    recall_neg = recall_score(y_test, y_pred, pos_label = 0)
    f1_neg = f1_score(y_test, y_pred, pos_label = 0)

    # Calculate metrics for positive class
    precision_pos = precision_score(y_test, y_pred, pos_label = 1)
    recall_pos = recall_score(y_test, y_pred, pos_label = 1)
    f1_pos = f1_score(y_test, y_pred, pos_label = 1)

    # Pin both labels so the matrix is always 2x2; without this, data containing a single
    # class produces a 1x1 matrix and the four-way unpacking below raises
    cm = confusion_matrix(y_test, y_pred, labels = [0, 1])

    # Extract TP, FP, TN, FN from confusion matrix (rows are true labels, columns are predictions)
    tn, fp, fn, tp = cm.ravel()

    return accuracy, accuracy_balanced, precision_neg, recall_neg, f1_neg, precision_pos, recall_pos, f1_pos, tp, fp, tn, fn

In [4]:
def log_metrics_to_mlflow(metrics):
    """Log a positional sequence of metrics to the active MLflow run.

    Args:
        metrics: Indexable sequence of 12 values in the order accuracy,
            accuracy_balanced, precision_neg, recall_neg, f1_neg, precision_pos,
            recall_pos, f1_pos, tp, fp, tn, fn.
    """
    # Metric names listed in the same order as the positions of ``metrics``
    metric_names = (
        'accuracy', 'accuracy_balanced',
        'precision_neg', 'recall_neg', 'f1_neg',
        'precision_pos', 'recall_pos', 'f1_pos',
        'tp', 'fp', 'tn', 'fn',
    )

    # Index explicitly so a too-short sequence still fails with IndexError
    for position, metric_name in enumerate(metric_names):
        mlflow.log_metric(metric_name, metrics[position])

In [5]:
def log_model_to_mlflow(model, artifact_path, train_dataset, run_name):
    """Log a PyTorch sequence-classification model to MLflow with a signature and input example.

    The signature is inferred from a small tokenized batch and the logits the model
    produces for it. A PyTorch/Transformers model has no ``predict`` method, and a
    ``datasets.Dataset`` cannot be used directly as a signature input or input example,
    so both are built from numpy arrays instead.

    Args:
        model: PyTorch model whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` attribute.
        artifact_path: Artifact path under which the model is logged.
        train_dataset: Tokenized dataset; assumed to contain ``input_ids`` and
            ``attention_mask`` columns padded to a common length.
        run_name: Name under which the model is registered.
    """
    import torch

    # Slicing the dataset yields a dict of column name -> list of rows
    example_rows = train_dataset[:2]
    model_inputs = {
        column: np.asarray(example_rows[column], dtype = np.int64)
        for column in ('input_ids', 'attention_mask')
    }

    # Run the forward pass on whatever device the model currently lives on
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**{
                name: torch.as_tensor(values, device = device)
                for name, values in model_inputs.items()
            })
    finally:
        # Restore the mode the caller left the model in
        model.train(was_training)
    logits = outputs.logits.detach().cpu().numpy()

    # Infer the model signature from the numpy inputs and outputs
    signature = infer_signature(model_inputs, logits)

    # Log the model to mlflow
    mlflow.pytorch.log_model(
        pytorch_model = model,
        artifact_path = artifact_path,
        signature = signature,
        input_example = model_inputs,
        registered_model_name = run_name,
    )

In [6]:
def prepare_and_split_dataset(csv_name, path):
    """Load the review CSV, encode labels, and split it into train/val/test datasets.

    Args:
        csv_name: File name of the CSV; it must have 'review' and 'sentiment' columns.
        path: Folder prefix that is concatenated directly in front of ``csv_name``.

    Returns:
        A DatasetDict with 'train', 'val' and 'test' splits holding 72%, 8% and 20%
        of the rows respectively, with columns renamed to 'text' and 'labels'.
    """
    label_encoding = {'negative':0, 'positive':1}
    column_names = {'review' : 'text', 'sentiment' : 'labels'}

    # Read the CSV file and turn the sentiment strings into binary labels
    frame = pd.read_csv(path + csv_name)
    frame['sentiment'] = frame['sentiment'].replace(label_encoding)
    frame = frame.rename(columns=column_names)

    # Hold out 20% for testing, then carve 10% of the remainder off for validation
    train_val_frame, test_frame = train_test_split(frame, test_size = 0.2, random_state = 43)
    train_frame, val_frame = train_test_split(train_val_frame, test_size = 0.1, random_state = 43)

    # Convert each DataFrame into a Dataset keyed by its split name
    frames_by_split = {'train': train_frame, 'val': val_frame, 'test': test_frame}

    return DatasetDict({
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in frames_by_split.items()
    })

In [7]:
def tokenize_data(combined_dataset, tokenizer, subset_size = 10):
    """Tokenize every split and return the train, validation and test datasets.

    Args:
        combined_dataset: Mapping with 'train', 'val' and 'test' splits that have a
            'text' column and support ``map``.
        tokenizer: Callable tokenizer applied to batches of text.
        subset_size: Number of rows to keep per split after a seeded shuffle, for
            quick test runs. Clamped to the split size. Pass ``None`` to return the
            full, unshuffled splits.

    Returns:
        Tuple ``(train_dataset, eval_dataset, test_dataset)``.
    """
    # Tokenize function
    def tokenize_function(examples):
        # truncation cuts texts longer than the tokenizer's maximum length, and
        # 'max_length' padding pads shorter ones up to it, so all sequences end up equally long
        return tokenizer(examples['text'], truncation = True, padding = 'max_length')

    # Tokenize the combined dataset, batched parameter tokenizes multiple samples simultaneously
    tokenized_dataset = combined_dataset.map(tokenize_function, batched = True)

    def take_subset(split):
        # None means "use everything"; otherwise never ask for more rows than exist
        if subset_size is None:
            return split
        row_count = min(subset_size, len(split))
        return split.shuffle(seed = 42).select(range(row_count))

    train_dataset = take_subset(tokenized_dataset['train'])
    eval_dataset = take_subset(tokenized_dataset['val'])
    test_dataset = take_subset(tokenized_dataset['test'])

    return train_dataset, eval_dataset, test_dataset

In [9]:
def initialize_trainer_and_model(train_dataset, eval_dataset, tokenizer, checkpoint, training_args):
    """Build a two-label BERT classifier and a Trainer wired to the given datasets.

    Args:
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation during training.
        tokenizer: Tokenizer handed to both the padding collator and the Trainer.
        checkpoint: Name or path of the pretrained BERT checkpoint to load.
        training_args: TrainingArguments controlling the training run.

    Returns:
        Tuple ``(trainer, model)``.
    """
    # Pretrained BERT with a classification head for the two sentiment classes
    classifier = BertForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

    # Collator that pads the samples of each batch
    padding_collator = DataCollatorWithPadding(tokenizer)

    bert_trainer = Trainer(
        model = classifier,
        args = training_args,
        data_collator = padding_collator,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        eval_dataset = eval_dataset,
    )

    return bert_trainer, classifier

In [10]:
def ml_test_and_log(trainer, model, data_name, artifact_path, train_dataset, test_dataset, run_name = 'bert'):
    """Train the model, evaluate it on the test set, and log everything to one MLflow run.

    Args:
        trainer: Configured Trainer used for training and prediction.
        model: The model being trained; logged to MLflow after evaluation.
        data_name: Name of the dataset, logged as the 'data_name' parameter.
        artifact_path: Artifact path under which the model is logged.
        train_dataset: Training dataset, forwarded to the model-logging helper.
        test_dataset: Dataset the trained model is evaluated on.
        run_name: Name of the MLflow run and of the registered model.
    """
    # Start an MLflow run under the given name
    with mlflow.start_run(run_name = run_name):

        # Log additional useful parameters
        mlflow.log_param('data_name', data_name)

        # Train the model using the pretrained trainer
        trainer.train()

        # Predict on the test dataset; predictions hold one raw score (logit) per class
        # for every sample, not probabilities
        prediction_output = trainer.predict(test_dataset)

        # Extract predicted labels: the class with the highest score
        preds = np.argmax(prediction_output.predictions, axis=-1)

        # calculate_metrics expects (predicted labels, true labels), in that order;
        # swapping them exchanges precision with recall and fp with fn
        metrics = calculate_metrics(preds, prediction_output.label_ids)

        # Log metrics to mlflow
        log_metrics_to_mlflow(metrics)

        # Log the model to mlflow
        log_model_to_mlflow(model, artifact_path, train_dataset, run_name)


### Set up the variables

In [11]:
# Set the name of the pretrained bert model that will be used
checkpoint = 'bert-base-cased'

# Set parameters needed to connect to mlflow
# tracking_url, experiment_name = 'http://127.0.0.1:8088', 'aleksa_praksa'
tracking_url, experiment_name = 'http://192.168.66.221:20002', 'aleksa_praksa'

# Set folder path to the original dataset
path, file_name, artifact_path = '../data/', 'imdb_dataset.csv', 'artifact'

# load_best_model_at_end requires the evaluation and save strategies to match; with the
# defaults (no evaluation, save every N steps) TrainingArguments raises a ValueError.
# The evaluation keyword was renamed from 'evaluation_strategy' to 'eval_strategy' in newer
# transformers releases, so pick whichever name the installed version defines.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir = 'output',
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 3,
    logging_dir = './logs',
    logging_steps = 100,
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss", 
    **{_eval_strategy_arg: 'epoch'},
)

### Run everything

In [None]:
# Point mlflow at the tracking server and activate the experiment configured above
set_mlflow_experiment(tracking_url, experiment_name)

# Load the CSV, encode the labels and split it into train / val / test datasets
combined_dataset = prepare_and_split_dataset(file_name, path)

# Load the tokenizer matching the checkpoint and tokenize every split
# (by default tokenize_data returns only small shuffled subsets of each split)
tokenizer = BertTokenizer.from_pretrained(checkpoint)
train_dataset, eval_dataset, test_dataset = tokenize_data(combined_dataset, tokenizer)

# Build the BERT classifier and the Trainer that will fine-tune it
trainer, model = initialize_trainer_and_model(train_dataset, eval_dataset, tokenizer, checkpoint, training_args)

# Train the classifier and test it on the test dataset while logging relevant information
# to mlflow; the file name doubles as the logged dataset name
ml_test_and_log(trainer, model, file_name, artifact_path, train_dataset, test_dataset)