In [2]:
# Data analysis
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Modelling
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
import transformers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments, glue_compute_metrics, pipeline,
                            AutoTokenizer, AutoModelForSequenceClassification)


# Supressing warnings
import warnings
warnings.filterwarnings("ignore")


# Metrics
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score, accuracy_score
from transformers.data.metrics import glue_compute_metrics
from scipy.stats import spearmanr


# Other imports
import os
import random
import gc
import pickle


# Logging into weights and biases
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WB_token")
wandb.login(key=secret_value_0)


# Methods for setting random seed
def set_seeds(seed=123):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    

# Printing version of the transformer library
transformers.__version__

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


'4.45.1'

In [None]:
# Global variables
task = "eraser_movie"

In [3]:
# Loading DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)

# Setting up device for DistilBERT training
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device_cpu = torch.device('cpu')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# class used to store datasets in this project (required for object loading from pickle)
class TrainerDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, evidences=None):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.evidences=evidences

        # Tokenize the input
        self.tokenized_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")   

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return InputFeatures(
            input_ids=self.tokenized_inputs['input_ids'][idx],
#             token_type_ids=self.tokenized_inputs['token_type_ids'][idx],
            attention_mask=self.tokenized_inputs['attention_mask'][idx],
            label=self.targets[idx])   

In [6]:
# Set seed for reproducibility
np.random.seed(123)


# Method used to calculate all discriminatory power metrics (AUC, f1, Accuracy, ...) and spearman correlation
def calculate_metric_values(predictions, label_ids) -> Dict:

    # Getting predicted target variable from probas
    preds = np.argmax(predictions, axis=1)

    # Compute GLUE task metrics
    task_metrics = glue_compute_metrics(
        task_name="sst-2",
        preds=preds,
        labels=label_ids
    )
    
    # Compute additional metrics
    accuracy = accuracy_score(label_ids, preds)
    f1 = f1_score(label_ids, preds, average="binary")  # Binary classification
    precision = precision_score(label_ids, preds, average="binary")
    recall = recall_score(label_ids, preds, average="binary")
    mcc = matthews_corrcoef(label_ids, preds)
    
    # Compute AUC
    # For AUC, we need the raw predictions (probabilities). Assuming predictions are logits:
    if len(predictions.shape) > 1:  # Check if predictions are logits
        probs = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
        auc = roc_auc_score(label_ids, probs[:, 1])  # Assuming binary classification
    else:
        auc = None  # AUC cannot be calculated without probabilities


    # Check calibration
    df = pd.DataFrame(data={"pred":np.transpose(predictions)[1], "real":label_ids })
    df["centile"] = 0
    for i in range(1,10):
        less_range = np.quantile(df.pred,i/10)
        df["centile"] = [df.centile[j]+1 if df["pred"][j]>less_range else df.centile[j] for j in range(len(df.centile))]
    stats_for_bucket = df.groupby('centile').mean()
    correlation, p_value = spearmanr(stats_for_bucket["pred"], stats_for_bucket["real"])
    
    # Add custom metrics to the output dictionary
    metrics =  {
        "acc": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "mcc": mcc,
        "auc": auc,
        "spearman": correlation,
        "spearman_pval": p_value
    }
    task_metrics.update(metrics)

    return task_metrics

# Wrapper of the above method for DistilBERT
def compute_metrics(p: EvalPrediction) -> Dict:
    return calculate_metric_values(p.predictions, p.label_ids)
    

# DistilBERT

In [7]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Method used to train model using given train dataset and evaluates it on eval_dataset
def train_model(train_dataset, eval_dataset, output_dir, model_name = "DistilBERT"):
    set_seeds(123)

    if model_name == "DistilBERT":
        # Setting up training arguments
        tokenizer = None
        training_args = TrainingArguments(
            output_dir= output_dir,
            num_train_epochs=2, 
            overwrite_output_dir=True,
            do_train=True,
            do_eval=True,
            per_device_train_batch_size=16,    
            dataloader_drop_last=True,  # Make sure all batches are of equal size
        )

        # Initializing the model
        model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2, ignore_mismatched_sizes=True)
        model.to(device)

        # Setting up trainer with weighted loss
        k, v = np.unique(train_dataset.targets, return_counts=True)
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            class_weights = list((v/np.sum(v)).astype(np.float32)))

        # Model training and evaluation
        trainer.train()
        model_result = trainer.evaluate()
        print(f"{model_name} accuracy: {model_result['eval_acc']}")
        

    
    elif model_name == "LSTM":
        # Hyperparameters 
        vocab_size = 10000
        max_sequence_length = 300
        embedding_dim = 100
        lstm_units = 128
        dropout_rate = 0.5
        num_classes = 2

        # Tokenization
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(train_dataset.inputs)
        sequences = tokenizer.texts_to_sequences(train_dataset.inputs)
        sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

        # Model definition
        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
        model.add(LSTM(units=lstm_units, return_sequences=False))
        model.add(Dropout(rate=dropout_rate))
        model.add(Dense(units=1, activation='sigmoid'))

        # class_weights = compute_class_weight(
        #     class_weight='balanced',
        #     classes=np.unique(np.array(train_dataset.targets)),
        #     y=np.array(train_dataset.targets)
        # )
        # class_weights = {i: class_weights[i] for i in range(len(class_weights))}
        
        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Train the model
        model.fit(sequences, np.array(
            train_dataset.targets),
                  batch_size=32, 
                  epochs=5, 
                  # class_weight=class_weights
                 )

        # Evaluate the model
        tokenized_eval_dataset = pad_sequences(tokenizer.texts_to_sequences(eval_dataset.inputs))
        pred = model.predict(tokenized_eval_dataset)
        pred = np.array([[1-x[0], x[0]] for x in pred])
        model_result = calculate_metric_values(pred, eval_dataset.targets)
        print(f"{model_name} accuracy: {model_result['acc']}")

    elif model_name=="SVM":
        # Tokenization
        tokenizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        X = tokenizer.fit_transform(train_dataset.inputs)

        # Model training
        model = SVC(kernel='rbf', C=1.0, probability=True, class_weight = "balanced")
        model.fit(X, train_dataset.targets)

        # Evaluating the results
        pred = model.predict_proba(tokenizer.transform(eval_dataset.inputs))
        model_result = calculate_metric_values(pred, eval_dataset.targets)
        print(f"{model_name} accuracy: {model_result['acc']}")
    
    return model, model_result, tokenizer





In [9]:
class CustomTrainer(Trainer):
    
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        
        # move weights to correct device
        device = logits.device
        weight = torch.tensor(self.class_weights, device=device)
        
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss




models = {}
set_seeds(123)

# Loading original train and original eval datasets before imbalancing
with open(f"/kaggle/input/balanced-datasets/{task}_train.obj", 'rb') as pickle_file:
    train_dataset = pickle.load(pickle_file)
with open(f"/kaggle/input/balanced-datasets/{task}_eval.obj", 'rb') as pickle_file:
    eval_dataset = pickle.load(pickle_file)
    

# Training baseline models on original data before imbalancing
for model_name in ["DistilBERT"]:
# for model_name in ["LSTM", "SVM"]:
    # Training model
    model, models[f"{task}_100_original_0*{model_name}"], tok = train_model(train_dataset, eval_dataset, f"models/{task}/", model_name)
    # Saving model
    filehandler = open(f"{task}_100_original_0*{model_name}"+".obj","wb")
    pickle.dump(model,filehandler)
    filehandler.close()

    if model_name != "DistilBERT":
        # Saving tokenizer
        filehandler = open("tok_"+f"{task}_100_original_0*{model_name}"+".obj","wb")
        pickle.dump(tok,filehandler)
        filehandler.close()



In [10]:
skip = 0   # Some codes had to be run multiple times this value tells how many datasets should be skipped before model training
model_count = len(os.listdir(f"/kaggle/input/paraphrase-dataset-final/"))-skip

for model_name in ["DistilBERT"]:
# for model_name in ["LSTM", "SVM"]:
    for task in ["eraser_movie"]:
        for file in os.listdir(f"/kaggle/input/paraphrase-dataset-final/")[skip:skip+model_count]:
            # Printing the name of dataset the model is being trained on
            print(file)

            # Loading training data, training the model, and evaluating the results
            with open("/kaggle/input/paraphrase-dataset-final/"+file, 'rb') as pickle_file:
                model, pred, tok = train_model(pickle.load(pickle_file), eval_dataset, f"models/{task}/", model_name)

                # Correcting some file names
                if "paraphrase" in file:
                    parts = file.split("paraphrase")
                    file = parts[0]+"paraphrase"+parts[1][3:]
                    
                # Saving model
                filehandler = open(file.split(".obj")[0]+ "*"+model_name+".obj","wb")
                pickle.dump(model,filehandler)
                filehandler.close()

                if model_name != "DistilBERT":
                    # Saving tokenizer
                    filehandler = open("tok_"+file.split(".obj")[0]+ "*"+model_name+".obj","wb")
                    pickle.dump(tok,filehandler)
                    filehandler.close()
                
                models[file.split(".obj")[0]+ "*"+model_name] = pred
                gc.collect()



eraser_movie_20_paraphrase_10_0.obj
Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.6179 - loss: 0.6894
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7771 - loss: 0.5161
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9087 - loss: 0.2199
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8284 - loss: 0.4595
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9208 - loss: 0.2464
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 83ms/step
LSTM accuracy: 0.55
eraser_movie_10_paraphrase_10_1.obj
Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.6452 - loss: 0.6691
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9473 - loss: 0.2186
Epoch 3/5


In [11]:

# Number of trained models
len(list(models.items()))

30

In [12]:
# Method used to decode train dataset names and extracting balancing method and imbalance percentage
def get_method_and_percentage(name):
    string = "_".join(name.split("_")[:-1])
    percentage = ""
    perc = 0
    for i in range(len(string)):
        if string[i].isdigit():
            perc = i
            percentage += string[i]
    perc+=2 if len(percentage)>0 else 0
    method = string[perc:]
    return [method, percentage]

In [13]:
# GGathering the results and converting them to .cssv file
df = pd.DataFrame(list(models.items()))
res = pd.DataFrame(list(df[1]))
name = pd.DataFrame(list(df[0].map(lambda x:get_method_and_percentage(x)+[x.split("*")[1], x.split("_")[-1].split("*")[0]])))
name.columns = ["Balancing method", "Imbalance %", "Model name", "Model ID"]
df = pd.concat([name, res], axis=1)
df = df.sort_values(["Imbalance %", "Model ID", "Balancing method", "Model name"])
df.columns = [str(col).replace("eval_", "") for col in df.columns]
df.to_csv("nlpaug_paraphrasers_distilbert.csv")
df

Unnamed: 0,Balancing method,Imbalance %,Model name,Model ID,acc,f1,precision,recall,mcc,auc,spearman,spearman_pval
13,,1010,LSTM,0,0.525,0.227642,0.608696,0.14,0.078365,0.6036,0.602671,0.06516811
28,,1010,SVM,0,0.505,0.019802,1.0,0.01,0.070888,0.724,0.90303,0.0003436122
1,,1010,LSTM,1,0.5,0.090909,0.5,0.05,0.0,0.681,0.87395,0.0009460263
16,,1010,SVM,1,0.5,0.019608,0.5,0.01,0.0,0.7324,0.843177,0.002180016
11,,1010,LSTM,2,0.5,0.0,0.0,0.0,0.0,0.6201,0.725623,0.01752806
26,,1010,SVM,2,0.505,0.019802,1.0,0.01,0.070888,0.6483,0.789024,0.006660472
10,,1010,LSTM,3,0.485,0.019048,0.2,0.01,-0.096077,0.5623,0.594567,0.06985135
25,,1010,SVM,3,0.505,0.019802,1.0,0.01,0.070888,0.6707,0.781818,0.007547008
4,,1010,LSTM,4,0.5,0.0,0.0,0.0,0.0,0.6174,0.640043,0.04623588
19,,1010,SVM,4,0.51,0.039216,1.0,0.02,0.100504,0.7057,0.887542,0.000609666
