<a href="https://colab.research.google.com/github/malojan/nlp_nli/blob/main/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install transformers
!pip install datasets==2.6
!pip install optuna==3.0
!pip install sentencepiece
!pip install protobuf



In [13]:
## Load general packages
# some more specialised packages are loaded in each sub section
import pandas as pd
import numpy as np

In [14]:
# set random seed for reproducibility
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)

In [15]:
# import twitter data

df = pd.read_csv('twitter_sentiment_data.csv')

# Recode - 1 into 3
df['sentiment'] = df['sentiment'].replace(-1,3)

# Rename sentiment into label

df = df.rename(columns={'sentiment': 'label'})

# Rename message to text

df = df.rename(columns={'message': 'text'})
# Create a label_text column 

df['label_text'] = df['label'].replace({0: 'Climate: neutral', 1: 'Climate: believe', 2: 'Climate: news', 3: 'Climate: deny'})

# Split into train and test set

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED_GLOBAL)

sample_size = 1000
df_train = df_train.sample(n=min(sample_size, len(df_train)), random_state=SEED_GLOBAL).copy(deep=True)
print("Length of training and test sets after sampling: ", len(df_train), " (train) ", len(df_test), " (test).")

Length of training and test sets after sampling:  1000  (train)  8789  (test).


In [16]:
print("Proportion of each class in train set: ")
print(df_train['label_text'].value_counts())

Proportion of each class in train set: 
label_text
Climate: believe    525
Climate: news       222
Climate: neutral    163
Climate: deny        90
Name: count, dtype: int64


In [17]:
# Creating hypothesis

hypothesis_label_dic = {
    "Climate: news" : "(News): the tweet links to factual news about climate change",
    "Climate: believe": "(Pro): the tweet supports the belief of man-made climate change",
    "Climate: deny": "The tweet does not believe in man-made climate change",
    "Climate: neutral": "Neutral: the tweet neither supports nor refutes the belief of man-made climate change"
}

In [18]:
## function for reformatting the train set
def format_nli_trainset(df_train=None, hypo_label_dic=None, random_seed=42):
  print(f"Length of df_train before formatting step: {len(df_train)}.")
  length_original_data_train = len(df_train)

  df_train_lst = []
  for label_text, hypothesis in hypo_label_dic.items():
    ## entailment
    df_train_step = df_train[df_train.label_text == label_text].copy(deep=True)
    df_train_step["hypothesis"] = [hypothesis] * len(df_train_step)
    df_train_step["label"] = [0] * len(df_train_step)
    ## not_entailment
    df_train_step_not_entail = df_train[df_train.label_text != label_text].copy(deep=True)
    df_train_step_not_entail = df_train_step_not_entail.sample(n=min(len(df_train_step), len(df_train_step_not_entail)), random_state=random_seed)
    df_train_step_not_entail["hypothesis"] = [hypothesis] * len(df_train_step_not_entail)
    df_train_step_not_entail["label"] = [1] * len(df_train_step_not_entail)
    # append
    df_train_lst.append(pd.concat([df_train_step, df_train_step_not_entail]))
  df_train = pd.concat(df_train_lst)
  
  # shuffle
  df_train = df_train.sample(frac=1, random_state=random_seed)
  df_train["label"] = df_train.label.apply(int)
  df_train["label_nli_explicit"] = ["True" if label == 0 else "Not-True" for label in df_train["label"]]  # adding this just to simplify readibility

  print(f"After adding not_entailment training examples, the training data was augmented to {len(df_train)} texts.")
  print(f"Max augmentation could be: len(df_train) * 2 = {length_original_data_train*2}. It can also be lower, if there are more entail examples than not-entail for a majority class.")

  return df_train.copy(deep=True)


df_train_formatted = format_nli_trainset(df_train=df_train, hypo_label_dic=hypothesis_label_dic, random_seed=SEED_GLOBAL)

Length of df_train before formatting step: 1000.
After adding not_entailment training examples, the training data was augmented to 1950 texts.
Max augmentation could be: len(df_train) * 2 = 2000. It can also be lower, if there are more entail examples than not-entail for a majority class.


In [19]:
df_train_formatted

Unnamed: 0,label,text,tweetid,label_text,hypothesis,label_nli_explicit
15840,1,The head of the EPA just made another dangerou...,841739967491645440,Climate: believe,The tweet does not believe in man-made climate...,Not-True
37108,1,RT @NASA_EO: New NOAA study refutes the notion...,608945384510078976,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
4359,1,RT @savetheredwoods: Ecologist Todd Dawson des...,797553946197753856,Climate: neutral,(Pro): the tweet supports the belief of man-ma...,Not-True
23947,1,RT @BraddJaffy: Rex Tillerson in focus as NY A...,882986402681688064,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
7701,1,Yet some say there is no global warming https:...,801990910955388930,Climate: believe,(News): the tweet links to factual news about ...,Not-True
...,...,...,...,...,...,...
17769,1,RT @EcoInternet3: Read President #Trump's exec...,846856160170926081,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
25719,1,The most effective individual steps to tackle ...,901789438237388801,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
5913,0,RT @SenSanders: We have a president-elect who ...,798768783523360769,Climate: believe,(Pro): the tweet supports the belief of man-ma...,True
39551,0,RT @MinnDad: @FredZeppelin12 Man made global w...,671487584091160576,Climate: deny,The tweet does not believe in man-made climate...,True


In [20]:
## function for reformatting the test set
def format_nli_testset(df_test=None, hypo_label_dic=None):
  ## explode test dataset for N hypotheses
  hypothesis_lst = [value for key, value in hypo_label_dic.items()]
  print("Number of hypotheses/classes: ", len(hypothesis_lst))

  # label lists with 0 at alphabetical position of their true hypo, 1 for not-true hypos
  label_text_label_dic_explode = {}
  for key, value in hypo_label_dic.items():
    label_lst = [0 if value == hypo else 1 for hypo in hypothesis_lst]
    label_text_label_dic_explode[key] = label_lst

  df_test["label"] = df_test.label_text.map(label_text_label_dic_explode)
  df_test["hypothesis"] = [hypothesis_lst] * len(df_test)
  print(f"Original test set size: {len(df_test)}")
  
  # explode dataset to have K-1 additional rows with not_entail label and K-1 other hypotheses
  # ! after exploding, cannot sample anymore, because distorts the order to true label values, which needs to be preserved for evaluation code
  df_test = df_test.explode(["hypothesis", "label"])  # multi-column explode requires pd.__version__ >= '1.3.0'
  print(f"Test set size for NLI classification: {len(df_test)}\n")

  df_test["label_nli_explicit"] = ["True" if label == 0 else "Not-True" for label in df_test["label"]]  # adding this just to simplify readibility

  return df_test.copy(deep=True)


df_test_formatted = format_nli_testset(df_test=df_test, hypo_label_dic=hypothesis_label_dic)
df_test_formatted

Number of hypotheses/classes:  4
Original test set size: 8789
Test set size for NLI classification: 35156



Unnamed: 0,label,text,tweetid,label_text,hypothesis,label_nli_explicit
34461,1,RT @vincecable: Nice summary. Add climate chan...,955713180684177408,Climate: neutral,(News): the tweet links to factual news about ...,Not-True
34461,1,RT @vincecable: Nice summary. Add climate chan...,955713180684177408,Climate: neutral,(Pro): the tweet supports the belief of man-ma...,Not-True
34461,1,RT @vincecable: Nice summary. Add climate chan...,955713180684177408,Climate: neutral,The tweet does not believe in man-made climate...,Not-True
34461,0,RT @vincecable: Nice summary. Add climate chan...,955713180684177408,Climate: neutral,Neutral: the tweet neither supports nor refute...,True
20916,0,RT @CNN: Former US President Obama will speak ...,861896636313817089,Climate: news,(News): the tweet links to factual news about ...,True
...,...,...,...,...,...,...
5638,1,RT @StephenSchlegel: she's thinking about how ...,798618059003035653,Climate: believe,Neutral: the tweet neither supports nor refute...,Not-True
17989,0,Exxon to Trump: Don't ditch Paris climate chan...,847226330361937921,Climate: news,(News): the tweet links to factual news about ...,True
17989,1,Exxon to Trump: Don't ditch Paris climate chan...,847226330361937921,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
17989,1,Exxon to Trump: Don't ditch Paris climate chan...,847226330361937921,Climate: news,The tweet does not believe in man-made climate...,Not-True


In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

## load the BERT-NLI model and its tokenizer
# you can choose any of the NLI models here: https://huggingface.co/MoritzLaurer
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"  # English model: "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"; multilingual model: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7" 
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# If torch available, add cuda as device, if not add mps if available, if not add cpu

# check if GPU or MPS is available, else use CPU
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
device = torch.device(device)


print(f"Device: {device}")
model.to(device);




Device: mps


In [22]:
# convert pandas dataframes to Hugging Face dataset object to facilitate pre-processing
import datasets

dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_pandas(df_train_formatted),
    "test": datasets.Dataset.from_pandas(df_test_formatted)
})


In [23]:
# tokenize
def tokenize_nli_format(examples):
  return tokenizer(examples["text"], examples["hypothesis"], truncation=True, max_length=512)  # max_length can be reduced to e.g. 256 to increase speed, but long texts will be cut off
dataset["train"] = dataset["train"].map(tokenize_nli_format, batched=True)  
dataset["test"] = dataset["test"].map(tokenize_nli_format, batched=True) 


100%|██████████| 2/2 [00:00<00:00, 18.17ba/s]
100%|██████████| 36/36 [00:01<00:00, 23.94ba/s]


In [24]:
print("The overall structure of the pre-processed train and test sets:\n")
print(dataset)

print("\n\nAn example for a tokenized hypothesis-context pair:\n")
print(dataset["train"][0])

The overall structure of the pre-processed train and test sets:

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'tweetid', 'label_text', 'hypothesis', 'label_nli_explicit', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1950
    })
    test: Dataset({
        features: ['label', 'text', 'tweetid', 'label_text', 'hypothesis', 'label_nli_explicit', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35156
    })
})


An example for a tokenized hypothesis-context pair:

{'label': 1, 'text': 'The head of the EPA just made another dangerous comment about global warming https://t.co/Q1FahdFe3F', 'tweetid': 841739967491645440, 'label_text': 'Climate: believe', 'hypothesis': 'The tweet does not believe in man-made climate change', 'label_nli_explicit': 'Not-True', '__index_level_0__': 15840, 'input_ids': [1, 279, 761, 265, 262, 9388, 348, 412, 501, 3051, 1714, 314, 1307, 6965, 3597, 294, 320,

In [25]:
from transformers import TrainingArguments, Trainer, logging
import torch
# Set the directory to write the fine-tuned model and training logs to.
# With google colab, this will create a temporary folder, which will be deleted once you disconnect. 
# You can connect to your personal google drive to save models and logs properly.
training_directory = "BERT-nli-demo"

# FP16 is a hyperparameter which can increase training speed and reduce memory consumption, but only on GPU and if batch-size > 8, see here: https://huggingface.co/transformers/performance.html?#fp16
# FP16 does not work on CPU or for multilingual mDeBERTa models
fp16_bool = True if torch.cuda.is_available() else False
if "mdeberta" in model_name.lower(): fp16_bool = False  # multilingual mDeBERTa does not support FP16 yet: https://github.com/microsoft/DeBERTa/issues/77
# in case of hyperparameter search end the end: FP16 has to be set to False. The integrated hyperparameter search with the Hugging Face Trainer can lead to errors otherwise. 
fp16_bool = False

# Hugging Face tipps to increase training speed and decrease out-of-memory (OOM) issues: https://huggingface.co/transformers/performance.html?
# Overview of all training arguments: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
train_args = TrainingArguments(
    output_dir=f'./results/{training_directory}',
    logging_dir=f'./logs/{training_directory}',
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # if you get an out-of-memory error, reduce this value to 8 or 4 and restart the runtime. Higher values increase training speed, but also increase memory requirements. Ideal values here are always a multiple of 8.
    per_device_eval_batch_size=80,  # if you get an out-of-memory error, reduce this value, e.g. to 40 and restart the runtime
    #gradient_accumulation_steps=4, # Can be used in case of memory problems to reduce effective batch size. accumulates gradients over X steps, only then backward/update. decreases memory usage, but also slightly speed. (!adapt/halve batch size accordingly)
    num_train_epochs=3,  # this can be increased, but higher values increase training time. Good values for NLI are between 3 and 20.
    warmup_ratio=0.25,  # a good normal default value is 0.06 for normal BERT-base models, but since we want to reuse prior NLI knowledge and avoid catastrophic forgetting, we set the value higher
    weight_decay=0.1,
    seed=SEED_GLOBAL,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=fp16_bool,  # Can speed up training and reduce memory consumption, but only makes sense at batch-size > 8. loads two copies of model weights, which creates overhead. https://huggingface.co/transformers/performance.html?#fp16
    fp16_full_eval=fp16_bool,
    evaluation_strategy="no", # options: "no"/"steps"/"epoch"
    #eval_steps=10_000,  # evaluate after n steps if evaluation_strategy!='steps'. defaults to logging_steps
    save_strategy = "no",  # options: "no"/"steps"/"epoch"
    #save_steps=10_000,              # Number of updates steps before two checkpoint saves.
    #save_total_limit=10,             # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir
    #logging_strategy="steps",
    report_to="all",  # "all"  # logging
    #push_to_hub=False,
    #push_to_hub_model_id=f"{model_name}-finetuned-{task}",
)

# helper function to clean memory and reduce risk of out-of-memory error
import gc
def clean_memory():
  #del(model)
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
  gc.collect()

clean_memory()

from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report

def compute_metrics_nli_binary(eval_pred, label_text_alphabetical=None):
    predictions, labels = eval_pred

    ### reformat model output to enable calculation of standard metrics
    # split in chunks with predictions for each hypothesis for one unique premise
    def chunks(lst, n):  # Yield successive n-sized chunks from lst. https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    # for each chunk/premise, select the most likely hypothesis
    softmax = torch.nn.Softmax(dim=1)
    prediction_chunks_lst = list(chunks(predictions, len(set(label_text_alphabetical)) ))
    hypo_position_highest_prob = []
    for i, chunk in enumerate(prediction_chunks_lst):
        hypo_position_highest_prob.append(np.argmax(np.array(chunk)[:, 0]))  # only accesses the first column of the array, i.e. the entailment/true prediction logit of all hypos and takes the highest one

    label_chunks_lst = list(chunks(labels, len(set(label_text_alphabetical)) ))
    label_position_gold = []
    for chunk in label_chunks_lst:
        label_position_gold.append(np.argmin(chunk))  # argmin to detect the position of the 0 among the 1s

    print("Highest probability prediction per premise: ", hypo_position_highest_prob)
    print("Correct label per premise: ", label_position_gold)

    ### calculate standard metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(label_position_gold, hypo_position_highest_prob, average='macro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(label_position_gold, hypo_position_highest_prob, average='micro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    acc_balanced = balanced_accuracy_score(label_position_gold, hypo_position_highest_prob)
    acc_not_balanced = accuracy_score(label_position_gold, hypo_position_highest_prob)
    metrics = {'f1_macro': f1_macro,
               'f1_micro': f1_micro,
               'accuracy_balanced': acc_balanced,
               'accuracy_not_b': acc_not_balanced,
               #'precision_macro': precision_macro,
               #'recall_macro': recall_macro,
               #'precision_micro': precision_micro,
               #'recall_micro': recall_micro,
               #'label_gold_raw': label_position_gold,
               #'label_predicted_raw': hypo_position_highest_prob
               }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )  # print metrics but without label lists
    print("Detailed metrics: ", classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, sample_weight=None, digits=2, output_dict=True,
                                zero_division='warn'), "\n")
    return metrics

# Create alphabetically ordered list of the original dataset classes/labels 
# This is necessary to be sure that the ordering of the test set labels and predictions is the same. Otherwise there is a risk that labels and predictions are in a different order and resulting metrics are wrong.
label_text_alphabetical = np.sort(df_train.label_text.unique())  


In [26]:
# training
trainer = Trainer( 
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=dataset["train"],  #.shard(index=1, num_shards=100),  # could shard data for faster testing https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=dataset["test"],  #.shard(index=1, num_shards=100),  
    compute_metrics=lambda eval_pred: compute_metrics_nli_binary(eval_pred, label_text_alphabetical=label_text_alphabetical)  
)

trainer.train()


  0%|          | 0/366 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 14%|█▎        | 50/366 [02:01<17:06,  3.25s/it]

RuntimeError: MPS backend out of memory (MPS allocated: 4.11 GB, other allocations: 14.18 GB, max allowed: 18.13 GB). Tried to allocate 256 bytes on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
## Evaluate the fine-tuned model on the held-out test set
results = trainer.evaluate()## Evaluate the fine-tuned model on the held-out test set


In [None]:
## Evaluate the fine-tuned model on the held-out test set
print(results)
