In [None]:
%pip install --upgrade transformers Keras-Preprocessing wandb pytorch-lightning sacremoses sentencepiece --quiet
import os
import time
import json
import torch
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import (accuracy_score, f1_score, recall_score, precision_score, confusion_matrix)


In [None]:
import os

import wandb

# SECURITY: the API key must never be hardcoded in the notebook (it ends up in
# version control and shared copies). Read it from the environment instead.
# A key that was previously committed here should be revoked in the W&B settings.
# When WANDB_API_KEY is unset, `key` is None and wandb.login falls back to its
# default credential lookup / interactive prompt.
key = os.environ.get("WANDB_API_KEY")
wandb.login(key=key)


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:
def model_training(model_type, batch_size, model_name, lr, epochs, file_name, MODEL_CLASSES, token_length):
    """Fine-tune a 2-label sequence classifier and log validation metrics per epoch.

    The CSV at ``file_name`` must provide the columns ``split`` (rows tagged
    ``'train'`` / ``'val'`` are used), ``text`` and ``label``.

    Args:
        model_type: Key into ``MODEL_CLASSES``; also part of the output folder path.
        batch_size: Batch size for both the training and validation loaders.
        model_name: Pretrained checkpoint name/path passed to ``from_pretrained``.
        lr: Peak learning rate for the optimizer.
        epochs: Number of passes over the training split.
        file_name: Path of the CSV file to read.
        MODEL_CLASSES: Mapping ``model_type -> (model_class, tokenizer_class, config_class)``.
        token_length: Padded sequence length fed to the model (must be >= 2 so the
            CLS and SEP tokens fit).

    Side effects:
        Logs to the currently active wandb run and finishes it, writes one
        ``epoch{i}_metrics.json`` per epoch and saves the final model and tokenizer
        under ``CJPR/Transformers_GPU/{model_type}/{model_name}_L{lr}_E{epochs}_B{batch_size}``.
        The folder name does not contain ``token_length``, so runs that differ only
        in ``token_length`` write to the same folder.

    Raises:
        ValueError: If ``token_length`` is below 2 or the train/val split is empty.
    """
    if token_length < 2:
        raise ValueError(f"token_length must be at least 2 (CLS + SEP), got {token_length}")

    # Seed before the model is instantiated so that the freshly initialised
    # classification head (and everything after it) is reproducible.
    seed_val = 2212
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    df = pd.read_csv(file_name)
    train_set = df[df['split'] == 'train']
    validation_set = df[df['split'] == 'val']
    if train_set.empty or validation_set.empty:
        raise ValueError(
            f"{file_name} must contain rows for both split == 'train' and split == 'val'"
        )

    model_class, tokenizer_class, _config_class = MODEL_CLASSES[model_type]
    tokenizer = tokenizer_class.from_pretrained(model_name, from_tf=True)

    # Pad with the tokenizer's own pad id. Id 0 is not the pad token for every
    # vocabulary, so a hardcoded 0 can collide with a real token such as CLS.
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    # Content tokens kept per example: at most 510 (as before), and never more than
    # fits next to CLS and SEP, so pad_sequences never has to cut CLS off the front.
    max_content_tokens = min(510, token_length - 2)

    def input_id_maker(dataf, tokenizer):
        """Tokenize ``dataf['text']`` into a padded id matrix plus the unpadded lengths."""
        input_ids = []
        lengths = []

        for text in dataf['text']:
            tokens = tokenizer.tokenize(text)
            if len(tokens) > max_content_tokens:
                # Keep the tail of over-long documents.
                tokens = tokens[len(tokens) - max_content_tokens:]

            tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
            encoded_sent = tokenizer.convert_tokens_to_ids(tokens)
            input_ids.append(encoded_sent)
            lengths.append(len(encoded_sent))

        input_ids = pad_sequences(
            input_ids, maxlen=token_length, value=pad_token_id, dtype="long",
            truncating="pre", padding="post")
        return input_ids, lengths

    def att_masking(lengths):
        """Return a (num_examples, token_length) 0/1 mask that is 1 on real tokens.

        Built from the sequence lengths rather than from ``token_id > 0`` so that
        real tokens whose id happens to be 0 are still attended to.
        """
        positions = np.arange(token_length)
        return (positions[None, :] < np.asarray(lengths)[:, None]).astype('int64')

    def flat_accuracy(preds, labels):
        """Fraction of rows in ``preds`` whose argmax matches ``labels``."""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    start_time = time.time()

    train_input_ids, train_lengths = input_id_maker(train_set, tokenizer)
    validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

    elapsed_time = time.time() - start_time

    print(f"Time taken for input_id_maker: {elapsed_time:.2f} seconds\n")

    print(f"Tokenization of data using {model_name} is done\n")

    train_attention_masks = att_masking(train_lengths)
    validation_attention_masks = att_masking(validation_lengths)

    train_labels = train_set['label'].to_numpy().astype('int')
    validation_labels = validation_set['label'].to_numpy().astype('int')

    print(f"Masking of tokenizers is done\n")

    train_inputs = torch.tensor(train_input_ids)
    train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_attention_masks)
    validation_inputs = torch.tensor(validation_input_ids)
    validation_labels = torch.tensor(validation_labels)
    validation_masks = torch.tensor(validation_attention_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    # Evaluation does not need shuffling; iterate the validation set in order.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model_class.from_pretrained(model_name, num_labels=2, from_tf=True)
    model.to(device)

    max_grad_norm = 1.0
    num_total_steps = len(train_dataloader) * epochs
    # NOTE(review): warm-up is a fixed 1000 steps; if num_total_steps is smaller the
    # learning rate never reaches `lr`. Confirm this is intended for short runs.
    num_warmup_steps = 1000
    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW - consider migrating together with the import cell.
    optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps)

    checkpoint_folder = f"CJPR/Transformers_GPU/{model_type}/{model_name}_L{lr}_E{epochs}_B{batch_size}"
    epochs_folder = f"{checkpoint_folder}/epochs"
    os.makedirs(epochs_folder, exist_ok=True)

    loss_values = []

    print(f"Now Training for {model_name} is Started.......\n")

    for epoch_i in range(0, epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 100 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_dataloader)))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            # token_type_ids is left at the model's default: not every architecture
            # in MODEL_CLASSES accepts that keyword.
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        print("")
        print("Running Validation...")

        model.eval()

        eval_accuracy = 0
        nb_eval_steps = 0
        # Collect predictions from every batch; the epoch metrics must cover the
        # whole validation split, not just the final batch.
        all_preds = []
        all_labels = []

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask)

            logits = outputs[0].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            eval_accuracy += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1

            all_preds.append(np.argmax(logits, axis=1).flatten())
            all_labels.append(label_ids.flatten())

        # Mean of the per-batch accuracies (kept for continuity with earlier logs).
        avg_eval_accuracy = eval_accuracy / nb_eval_steps

        print("  Accuracy: {0:.2f}".format(avg_eval_accuracy))

        pred_flat = np.concatenate(all_preds)
        labels_flat = np.concatenate(all_labels)

        # scikit-learn metrics take (y_true, y_pred); swapping them exchanges
        # precision and recall.
        macro_f1 = f1_score(labels_flat, pred_flat, average='macro')
        micro_f1 = f1_score(labels_flat, pred_flat, average='micro')
        accuracy = accuracy_score(labels_flat, pred_flat)
        precision = precision_score(labels_flat, pred_flat)
        recall = recall_score(labels_flat, pred_flat)
        # labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted.
        confusion = confusion_matrix(labels_flat, pred_flat, labels=[0, 1])
        epoch_metrics = {
            'epoch': epoch_i,
            'macro_f1': macro_f1,
            'micro_f1': micro_f1,
            "flat_accuracy": avg_eval_accuracy,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            "Avg Train Loss": avg_train_loss,
            'confusion_matrix': confusion.tolist()
        }

        print(f"epoch={epoch_i}, macro_f1: {macro_f1}, micro_f1: {micro_f1}, accuracy={accuracy}, precision: {precision}, recall: {recall}, loss={avg_train_loss}")

        # Save the metrics to a JSON file for this epoch
        with open(f'{epochs_folder}/epoch{epoch_i}_metrics.json', 'w') as json_file:
            json.dump(epoch_metrics, json_file, indent=4)

        print(f"epoch_{epoch_i}_metrics.json saved to {epochs_folder}\n")

        wandb.log({"Flat Accuracy": avg_eval_accuracy, "Accuracy": accuracy, "Macro_f1": macro_f1, "Micro_f1": micro_f1, "Precision": precision, "Recall": recall, "Avg Train Loss": avg_train_loss})

        print(f"epoch_{epoch_i} logging is done...\n")

    wandb.finish()

    print(f"Now Training for {model_name} is Completed.......\n")

    print("Saving model to %s\n" % checkpoint_folder)

    # Unwrap a DataParallel-style wrapper (if any) before saving.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(checkpoint_folder)
    tokenizer.save_pretrained(checkpoint_folder)

    print("Model Saved to %s\n" % checkpoint_folder)


In [None]:
import itertools

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig, XLMRobertaTokenizer
from transformers import XLMRobertaForSequenceClassification, XLMRobertaConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, DebertaV2Config

# model_type -> (model class, tokenizer class, config class)
MODEL_CLASSES = {
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    # The 'xlm' entry is used with the XLM-RoBERTa tokenizer and the
    # 'xlm-roberta-base' checkpoint, so the model/config classes must be the
    # XLM-RoBERTa ones; the plain XLM classes are a different architecture.
    'xlm': (XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    'deberta': (DebertaV2ForSequenceClassification, DebertaV2Tokenizer, DebertaV2Config)
}

model_type = 'xlm'
file_name = 'data.csv'
model_name = 'xlm-roberta-base'

# Hyper-parameter grid. The grids use plural names so the loop variables below do
# not overwrite them (reusing one name for the list and its element makes the
# second pass of an inner loop iterate over a scalar and raise TypeError).
# NOTE(review): model_training keeps at most 510 content tokens per example, so
# token lengths above 512 only add padding - confirm 1024/2048 are intended.
token_lengths = [512, 1024, 2048]
batch_sizes = [16, 32]
learning_rates = [1e-6, 1e-5, 1e-7]
epoch_counts = [10, 15, 20]

# itertools.product iterates in the same order as the equivalent nested loops
# (token_length outermost, epochs innermost).
for token_length, batch_size, lr, epochs in itertools.product(
        token_lengths, batch_sizes, learning_rates, epoch_counts):
    wandb.init(
        project=f"{model_type}",
        name=f"{model_name}_L{lr}_B{batch_size}_E{epochs}",
        config={
            "architecture": model_name,
            "dataset": "ILDC",
            "learning_rate": lr,
            "epochs": epochs,
            "batch_size": batch_size,
            "token_length": token_length,
            "model_name": model_name
        }
    )
    try:
        model_training(model_type, batch_size, model_name, lr, epochs, file_name, MODEL_CLASSES, token_length)
    finally:
        # Always close the run, even if training fails, so the next
        # configuration starts with a fresh wandb run.
        wandb.finish()
