<a href="https://colab.research.google.com/github/malojan/nlp_nli/blob/main/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[sentencepiece]==4.23
!pip install datasets==2.6
!pip install optuna==3.0

zsh:1: no matches found: transformers[sentencepiece]==4.23


Collecting datasets==2.6
  Downloading datasets-2.6.0-py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m441.5/441.5 kB[0m [31m868.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy>=1.17 (from datasets==2.6)
  Downloading numpy-1.26.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m863.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyarrow>=6.0.0 (from datasets==2.6)
  Downloading pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting dill<0.3.6 (from datasets==2.6)
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.8/95.8 kB[0m [31m794.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas (from datasets==2.6)
  Downloading pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting request

In [3]:
## Load general packages
# some more specialised packages are loaded in each sub section
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [2]:
# set random seed for reproducibility
# SEED_GLOBAL is also passed as random_state to the split, sampling and
# formatting steps in the cells below.
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)  # seeds NumPy's global random number generator

In [38]:
# import twitter data
# The path is relative, so the CSV must sit in the current working directory.
# The cells below rely on a 'sentiment' column in this file.

df = pd.read_csv('twitter_sentiment_data.csv')

In [39]:
from sklearn.model_selection import train_test_split

# Rename sentiment into label.
# This runs first on purpose: rename() ignores columns that do not exist, so
# re-running this cell after 'sentiment' has already been renamed is harmless
# (indexing df['sentiment'] on a second run would raise a KeyError).
df = df.rename(columns={'sentiment': 'label'})

# Recode -1 into 3 (a no-op on re-run, because no -1 values remain)
df['label'] = df['label'].replace(-1, 3)

# Create a label_text column with a readable class name per numeric label
df['label_text'] = df['label'].replace({0: 'Climate: neutral', 1: 'Climate: believe', 2: 'Climate: news', 3: 'Climate: deny'})

# Split into train and test set (80% / 20%, reproducible through SEED_GLOBAL)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED_GLOBAL)

# Down-sample the training split to at most sample_size rows; the test split is kept in full
sample_size = 1000
df_train = df_train.sample(n=min(sample_size, len(df_train)), random_state=SEED_GLOBAL).copy(deep=True)
print("Length of training and test sets after sampling: ", len(df_train), " (train) ", len(df_test), " (test).")

Length of training and test sets after sampling:  1000  (train)  8789  (test).


In [41]:
# value_counts() returns the absolute number of rows per class (not a share),
# so the heading describes the output as counts.
print("Number of examples per class in train set: ")
print(df_train['label_text'].value_counts())

Proportion of each class in train set: 
Climate: believe    525
Climate: news       222
Climate: neutral    163
Climate: deny        90
Name: label_text, dtype: int64


In [20]:
# Creating hypothesis
# Maps each class name (the values used in the label_text column) to the
# hypothesis sentence that the formatting functions below attach to each row.
# Insertion order matters: the test-set formatting builds its per-class
# label lists in the order of this dict.

hypothesis_label_dic = {
    "Climate: news" : "(News): the tweet links to factual news about climate change",
    "Climate: believe": "(Pro): the tweet supports the belief of man-made climate change",
    "Climate: deny": "The tweet does not believe in man-made climate change",
    "Climate: neutral": "Neutral: the tweet neither supports nor refutes the belief of man-made climate change"
}

In [43]:
## function for reformatting the train set
def format_nli_trainset(df_train=None, hypo_label_dic=None, random_seed=42):
  """Turn a labelled training frame into NLI-style (text, hypothesis) pairs.

  For every (class name, hypothesis) entry in ``hypo_label_dic``:
    * rows whose ``label_text`` equals the class name are paired with the
      hypothesis and get ``label`` 0 (entailment);
    * an equally sized random sample of the remaining rows (or all of them,
      if fewer are available) is paired with the same hypothesis and gets
      ``label`` 1 (not entailment).

  Args:
    df_train: DataFrame with a ``label_text`` column. It is not modified.
    hypo_label_dic: dict mapping class name -> hypothesis string.
    random_seed: seed used for the negative sampling and the final shuffle.

  Returns:
    A new, shuffled DataFrame with the added ``hypothesis`` and
    ``label_nli_explicit`` columns and a binary integer ``label`` column.
  """
  n_rows_before = len(df_train)
  print(f"Length of df_train before formatting step: {n_rows_before}.")

  def _tag(frame, hypothesis, nli_label):
    # attach the hypothesis text and the binary NLI label to every row
    frame["hypothesis"] = [hypothesis] * len(frame)
    frame["label"] = [nli_label] * len(frame)
    return frame

  def _pairs_for_class(class_name, hypothesis):
    ## entailment
    positives = _tag(df_train[df_train["label_text"] == class_name].copy(deep=True), hypothesis, 0)
    ## not_entailment: sample at most as many rows as there are entailment rows
    candidates = df_train[df_train["label_text"] != class_name].copy(deep=True)
    n_negatives = min(len(positives), len(candidates))
    negatives = _tag(candidates.sample(n=n_negatives, random_state=random_seed), hypothesis, 1)
    return pd.concat([positives, negatives])

  per_class_frames = [
    _pairs_for_class(class_name, hypothesis)
    for class_name, hypothesis in hypo_label_dic.items()
  ]
  combined = pd.concat(per_class_frames)

  # shuffle
  combined = combined.sample(frac=1, random_state=random_seed)
  combined["label"] = combined["label"].apply(int)
  # human-readable version of the binary label, added only to simplify readability
  combined["label_nli_explicit"] = ["Not-True" if nli_label != 0 else "True" for nli_label in combined["label"]]

  print(f"After adding not_entailment training examples, the training data was augmented to {len(combined)} texts.")
  print(f"Max augmentation could be: len(df_train) * 2 = {n_rows_before * 2}. It can also be lower, if there are more entail examples than not-entail for a majority class.")

  return combined.copy(deep=True)


# Build the NLI-formatted training set from the sampled train split;
# SEED_GLOBAL makes the negative sampling and the shuffle reproducible.
df_train_formatted = format_nli_trainset(df_train=df_train, hypo_label_dic=hypothesis_label_dic, random_seed=SEED_GLOBAL)

Length of df_train before formatting step: 1000.
After adding not_entailment training examples, the training data was augmented to 1506 texts.
Max augmentation could be: len(df_train) * 2 = 2000. It can also be lower, if there are more entail examples than not-entail for a majority class.


In [44]:
# Inspect the NLI-formatted training data (the rendered table is truncated for long frames)
df_train_formatted

Unnamed: 0,label,message,tweetid,label_text,hypothesis,label_nli_explicit
5407,1,RT @RadioPakistan: Pakistan and Iran agree to ...,798478305146114048,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
10770,1,World heat shatters records in 2016 in new sig...,817951192428986368,Climate: news,The tweet does not believe in man-made climate...,Not-True
5257,0,RT @SenSanders: We have a president-elect who ...,798338061662720000,Climate: believe,(Pro): the tweet supports the belief of man-ma...,True
31637,1,RT @floodsg: Important response from @campaign...,959551491756445698,Climate: neutral,(Pro): the tweet supports the belief of man-ma...,Not-True
24481,1,Scientists are getting better at linking extre...,887320023227920384,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
...,...,...,...,...,...,...
27628,1,RT @NOAAResearch: A new study highlights the r...,915584592584826880,Climate: news,The tweet does not believe in man-made climate...,Not-True
21202,0,I thought you'd like to know that pasta in mys...,865740085827862528,Climate: neutral,Neutral: the tweet neither supports nor refute...,True
1601,1,How a new money system could help stop climate...,794888626576429056,Climate: news,(Pro): the tweet supports the belief of man-ma...,Not-True
8911,1,Reading: Guardian On climate change and the ec...,807694118646382592,Climate: believe,Neutral: the tweet neither supports nor refute...,Not-True


In [46]:
## function for reformatting the test set
def format_nli_testset(df_test=None, hypo_label_dic=None):
  """Expand a test frame so that every row is paired with every hypothesis.

  Each input row is repeated once per hypothesis in ``hypo_label_dic`` (in
  the dict's insertion order). The row paired with the hypothesis of its own
  class gets ``label`` 0 (entailment); all other pairs get ``label`` 1.

  Args:
    df_test: DataFrame with a ``label_text`` column. It is not modified.
    hypo_label_dic: dict mapping class name -> hypothesis string.

  Returns:
    A new DataFrame with ``len(df_test) * len(hypo_label_dic)`` rows and the
    columns ``hypothesis``, ``label`` (int) and ``label_nli_explicit``.
    Rows stay grouped per original text and must not be re-sampled or
    shuffled afterwards, because the evaluation code relies on this order.

  Raises:
    ValueError: if ``label_text`` contains a value that is not a key of
      ``hypo_label_dic`` (such rows cannot be exploded consistently).
  """
  ## explode test dataset for N hypotheses
  hypothesis_lst = list(hypo_label_dic.values())
  print("Number of hypotheses/classes: ", len(hypothesis_lst))

  # label lists with 0 at the position of the true hypothesis (dict order), 1 for not-true hypotheses
  label_text_label_dic_explode = {}
  for key, value in hypo_label_dic.items():
    label_text_label_dic_explode[key] = [0 if value == hypo else 1 for hypo in hypothesis_lst]

  # work on a copy so the caller's frame keeps its original 'label' column
  df_test = df_test.copy(deep=True)
  df_test["label"] = df_test["label_text"].map(label_text_label_dic_explode)

  # an unmapped label_text yields NaN instead of a list, which would make explode()
  # fail with the opaque "columns must have matching element counts"
  unmapped = df_test["label"].isna()
  if unmapped.any():
    unknown = sorted({str(value) for value in df_test.loc[unmapped, "label_text"]})
    raise ValueError(f"label_text values without a hypothesis in hypo_label_dic: {unknown}")

  df_test["hypothesis"] = [hypothesis_lst] * len(df_test)
  print(f"Original test set size: {len(df_test)}")

  # explode dataset to have K-1 additional rows with not_entail label and K-1 other hypotheses
  # ! after exploding, cannot sample anymore, because distorts the order to true label values, which needs to be preserved for evaluation code
  df_test = df_test.explode(["hypothesis", "label"])  # multi-column explode requires pd.__version__ >= '1.3.0'
  print(f"Test set size for NLI classification: {len(df_test)}\n")

  # explode() leaves an object column; cast to int like the train-set formatter does
  df_test["label"] = df_test["label"].apply(int)
  df_test["label_nli_explicit"] = ["True" if label == 0 else "Not-True" for label in df_test["label"]]  # adding this just to simplify readability

  return df_test.copy(deep=True)


# Build the NLI-formatted test set from the held-out test split
df_test_formatted = format_nli_testset(df_test=df_test, hypo_label_dic=hypothesis_label_dic)