In [1]:
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs
!pip install sentencepiece
!pip install fugashi
!pip install unidic_lite
!pip install transformers
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

## import libraries

In [2]:
from datasets import Dataset
from datasets import load_dataset
import evaluate
from transformers import AutoModel,AutoTokenizer, Trainer, TrainingArguments ,EvalPrediction,DataCollatorWithPadding
import numpy as np
from itertools import chain
import re
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.model_selection import train_test_split
from transformers import pipeline
import torch
import matplotlib.pyplot as plt
from evaluate import load
import numpy as np
from transformers import EvalPrediction

In [64]:
# Define the task for zero-shot classification
task = "zero-shot-classification"

# Define the specific pre-trained model to be used
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

# Define the directory where the output/results will be saved
output_dir = "/Bert-Contact-NLI"

# Clear the CUDA cache to free up GPU memory
torch.cuda.empty_cache()

# Create a pipeline object for zero-shot classification using the specified model and task
# Device = 0 for GPU, -1 for CPU
pipeline_object_before_training = pipeline(task,model_name,return_dict=True,device=0)

# Retrieve the model component from the pipeline object
model = pipeline_object_before_training.model

# Retrieve the tokenizer component from the pipeline object
tokenizer = pipeline_object_before_training.tokenizer

In [65]:
# Load the 'contractnli_a' configuration
df = load_dataset("kiddothe2b/contract-nli", "contractnli_a", split="train[:1000]")
df

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 1000
})

In [66]:
df = df.to_pandas()

In [67]:
df.head()

Unnamed: 0,premise,hypothesis,label
0,2.3 Provided that the Recipient has a written ...,Receiving Party shall not reverse engineer any...,2
1,5. All Confidential Information in any form an...,Receiving Party shall destroy or return some C...,1
2,4. Nothing in this Agreement is to be construe...,Agreement shall not grant Receiving Party any ...,1
3,11. The Recipient shall not advertise or other...,Receiving Party shall not disclose the fact th...,1
4,"1. “Confidential Information”, whenever used i...",Confidential Information shall only include te...,2


In [68]:
df.isnull().sum()

Unnamed: 0,0
premise,0
hypothesis,0
label,0


In [69]:
df.shape

(1000, 3)

In [70]:
df.drop_duplicates(inplace=True)

In [71]:
df.shape

(942, 3)

## Prepare our new tokens for the tokenizer

This step is crucial to ensure that our tokenizer can effectively tokenize most of the specialized vocabulary in our new domain.

In [72]:
# Convert the 'hypothesis' column in the DataFrame to string type
df['hypothesis'] = df['hypothesis'].astype("str")

# Convert the 'premise' column in DataFrame to string type
df['premise'] = df['premise'].astype("str")

In [73]:
# Get the 'hypothesis' column values as a list and concatenate with 'premise' column values as a list
sentences = df['hypothesis'].to_list() + df['premise'].to_list()

## Retrieve complete sentences from our dataset

## Find the words that are not in the tokenizer’s vocabulary

In [74]:
# Retrieve the keys (tokens/words) from the tokenizer's vocabulary
vocabulary = tokenizer.get_vocab().keys()

def get_new_tokens(sentences, vocabulary):
    # Create a set for faster lookup
    vocab_set = set(vocabulary)

    # Generator expression to yield words without ".", whitespace characters, or newline characters along with the pattern " 's "
    # You may use different cleaning methods in various languages
    cleaned_words = (re.sub(r"[.'\s\n]+|('\s)", "", word).lower().strip() for sentence in sentences for word in sentence.split())

    # Yield words that are not in the vocabulary set and are not empty
    return [word for word in cleaned_words if word not in vocab_set and word]

## Find the frequency of each word

In [75]:
# Define a function to count the occurrences of words in a list
def word_count(word_list):
  return Counter(word_list)

## Determine which new tokens to include

In [76]:
# Function to get new tokens from sentences that are not in the existing vocabulary
tokens_to_add = get_new_tokens(sentences,vocabulary)

# Count the occurrences of these new tokens
words = word_count(tokens_to_add)

# Initialize an empty list to store new tokens
new_tokens = []

# Loop through the words and their counts
for key,value in words.items():
  # Check if the count of the word is greater than 10 and if the length of the word is greater than 2
  if value > 10 and len(key)>2:
    # If conditions are met, add the word to the list of new tokens
    new_tokens.append(key)

In [77]:
new_tokens

['receiving',
 'engineer',
 'embody',
 'disclosing',
 'partys',
 'confidential',
 'disclose',
 'agreed',
 'negotiated',
 'expressly',
 'identified',
 'obligations',
 'survive',
 'independently',
 'retain',
 'destruction',
 'verbally',
 'conveyed',
 'solicit',
 'representatives',
 'third-parties',
 '(including',
 'consultants,',
 'advisors)',
 'circumstances',
 'notify',
 'law,',
 'acquire',
 'purposes',
 'stated',
 'persons',
 'them',
 'accordance',
 'agreement,',
 'to:',
 'discloser’s',
 'prior',
 'recipient’s',
 'employees,',
 'need',
 'information,',
 'it,',
 'that,',
 'means:',
 'copies',
 'thereof,',
 'disclosed',
 'returned',
 '(a)',
 'relationship',
 'months',
 'agreement;',
 'construed',
 'granting',
 'recipient,',
 'otherwise,',
 'whatsoever',
 'thereof',
 'otherwise',
 '“confidential',
 'mean',
 'data,',
 'delivered',
 'whatsoever,',
 'whether',
 'orally,',
 'visually',
 'disclosure',
 'designated',
 'terminate',
 'contemplated',
 'other,',
 'provided,',
 'however,',
 'restri

## Incorporate new tokens into the tokenizer and adjust the token embeddings accordingly

In [78]:
# Add new_tokens to the tokenizer's vocabulary
tokenizer.add_tokens(new_tokens)

# Resize the model's token embeddings to match the updated tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(250511, 768, padding_idx=0)

## Generate contradiction hypothesis

## First Option

Returning the hypothesis with high similarity to the premise, Generates hypothesis with shared words.

In [79]:
# Function to get a contradictory hypothesis
def get_contradictory_hypothesis_by_high_sim(premise, vectorizer, df, threshold=0.7, max_trials=100):
    premise_tfidf = vectorizer.transform([premise])
    closest_hypothesis = None
    closest_similarity = 1.0  # Set initial value to maximum similarity (opposite)

    # Try to find a suitable hypothesis within max_trials
    for _ in range(max_trials):
        random_index = random.randint(0, len(df) - 1)
        random_hypothesis = df.iloc[random_index]['hypothesis']
        random_tfidf = vectorizer.transform([random_hypothesis])

        similarity_with_premise = cosine_similarity(premise_tfidf, random_tfidf)[0][0]

        # Ensure the hypothesis contradicts the premise
        if similarity_with_premise < threshold:
            return random_hypothesis

        # Track the hypothesis with the closest similarity to the premise
        if similarity_with_premise < closest_similarity:
            closest_similarity = similarity_with_premise
            closest_hypothesis = random_hypothesis

    return closest_hypothesis  # Return the hypothesis with the lowest similarity to the premise

# Function to generate contradictory hypotheses
def generate_contradiction_hypothesis_by_high_sim(df):
    class_0_data = df[df['label'] == 0].copy()
    new_rows = []

    # Initialize a single vectorizer (fit once on both premise and hypothesis)
    vectorizer = TfidfVectorizer()
    combined_text = pd.concat([df['premise'], df['hypothesis']]).values  # Combine text data
    vectorizer.fit(combined_text)  # Fit on all text

    for index, row in class_0_data.iterrows():
        premise = row['premise']

        # Generate a contradiction hypothesis that has low similarity to the premise
        contradiction_hypothesis = get_contradictory_hypothesis_by_high_sim(
            premise, vectorizer, df
        )

        # Create a new row for class 2 and the contradiction hypothesis
        new_rows.append({'premise': premise, 'hypothesis': contradiction_hypothesis, 'label': 2})

    # Concatenate the new rows to the DataFrame
    new_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return new_df

## Second Option

Returning the hypothesis with low similarity to the premise, Generates only hard negative hypothesis (without any shared words).

In [80]:
def get_contradictory_hypothesis_by_low_sim(negative_hypothesis, vectorizer, tfidf_matrix, data_frame, threshold=0.9, max_trials=100):
    # Calculate TF-IDF for the negative hypothesis
    negative_tfidf = vectorizer.transform([negative_hypothesis])

    # Try to find a suitable hypothesis within max_trials
    for _ in range(max_trials):
        random_index = random.randint(0, len(data_frame) - 1)
        random_tfidf = vectorizer.transform([data_frame.iloc[random_index]['hypothesis']])
        random_similarity = cosine_similarity(negative_tfidf, random_tfidf)[0][0]
        if random_similarity < threshold:
            return data_frame.iloc[random_index]['hypothesis']
    return data_frame.iloc[random_index]['hypothesis']

def generate_contradiction_hypothesis_low_sim(df):
    class_0_data = df[df['label'] == 0].copy()
    new_rows = []
    for index, row in class_0_data.iterrows():
        premise = row['premise']
        negative_hypothesis = row['hypothesis']

        # Calculate TF-IDF vector for the negative hypothesis
        vectorizer = TfidfVectorizer()
        negative_tfidf = vectorizer.fit_transform([negative_hypothesis])

        # Generate a random hypothesis that doesn't share the same words as the negative hypothesis
        random_hypothesis = get_contradictory_hypothesis_by_low_sim(negative_hypothesis, vectorizer, negative_tfidf, df)

        # Create a new row for class 2 and the random hypothesis
        new_rows.append({'premise': premise, 'hypothesis': random_hypothesis, 'label': 2})

    # Concatenate the new rows to the DataFrame
    new_df = pd.concat([df,pd.DataFrame(new_rows)], ignore_index=True)
    return new_df

## Apply the random hypothesis to our dataset

In [81]:
# Generate contradiction hypothesis for every premise
df = generate_contradiction_hypothesis_by_high_sim(df)

## Select random records during training

This prevents our model from memorizing the dataset’s order, which is crucial for robust learning.

In [82]:
# Perform shuffling
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    np.random.seed(42)  # Set a random seed for reproducibility
    for i in range(cycles):
        # Shuffle the rows of the DataFrame
        new_df = old_df.sample(frac=1).reset_index(drop=True)
        return new_df

## Generate the input sequence for the model

In [83]:
def create_input_sequence(sample):
  # Get text from the 'premise' column
  text = sample["premise"]

  # Get hypothesis from the 'hypothesis' column
  hypothesis = sample['hypothesis']

  # Get label from the 'class' column
  label = sample['label']

  # Encoding the sequence using the tokenizer
  encoded_sequence = tokenizer(text, hypothesis, truncation=True, padding='max_length')

  # Assign label to the encoded sequence
  encoded_sequence['labels'] = label

  # Decode the input_ids
  encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)

  return encoded_sequence

## Split the data into training and testing sets

In [84]:
# Splitting the DataFrame 'df' into training and testing subsets
# Specifies that 30% of the data will be allocated to the test set
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

## Prepare the dataset for training purposes

In [85]:
# Shuffle the train_data DataFrame and create a new DataFrame with shuffled rows
train_shuffle_df = shuffle_df(train_data)

# Shuffle the test_data DataFrame and create a new DataFrame with shuffled rows
test_shuffle_df = shuffle_df(test_data)

# Create a Dataset object from the shuffled train DataFrame
train = Dataset.from_pandas(train_shuffle_df)

# Create a Dataset object from the shuffled test DataFrame
test = Dataset.from_pandas(test_shuffle_df)

# Map the create_input_sequence function to the train and test datasets
# This function encodes the data, adds labels, and generates input sentences
train_dataset = train.map(create_input_sequence,batched=True,batch_size=1,remove_columns=["label","premise"])
test_dataset = test.map(create_input_sequence,batched=True,batch_size=1,remove_columns=["label","premise"])

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

In [86]:
train_dataset

Dataset({
    features: ['hypothesis', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'input_sentence'],
    num_rows: 753
})

In [87]:
test_dataset

Dataset({
    features: ['hypothesis', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'input_sentence'],
    num_rows: 324
})

## Login to Huggingface

In [88]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training

In [98]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Create function to evaluate the model performance

In [89]:
def compute_metrics(p: EvalPrediction):
    # Extracting predictions from EvalPrediction object
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Obtaining the predicted classes
    preds = np.argmax(preds, axis=1)

    # Calculating the ratio of predictions equal to 2 (assumed label)
    ratio = np.mean(preds == 2)

    # Dictionary to store computed metrics
    result = {}

    # Loading evaluation metrics
    metric_f1 = load("f1")
    metric_precision = load("precision")
    metric_recall = load("recall")
    metric_acc = load("accuracy")

    # Computing various metrics
    result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    result["precision"] = metric_precision.compute(predictions=preds, references=p.label_ids, average='macro')["precision"]
    result["recall"] = metric_recall.compute(predictions=preds, references=p.label_ids, average='macro')["recall"]
    result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average='macro')["f1"]
    result["ratio"] = ratio

    return result

### Training Arguments

In [125]:
import torch
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,  # Reduced batch size
    num_train_epochs=10,
    weight_decay=0.01,
    # gradient_accumulation_steps=2,  # Gradient accumulation
    # fp16=True,  # Mixed precision training
    push_to_hub=True,
    report_to="none",
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

In [126]:
trainer = Trainer(
  model=model,                     # The instantiated model to be trained
  args=training_args,              # Training arguments, defined above
  compute_metrics=compute_metrics, # A function to compute the metrics
  train_dataset=train_dataset,     # Training dataset
  eval_dataset=test_dataset,        # Evaluation dataset
  data_collator=data_collator,     # Data collator
)

In [127]:
trainer.evaluate()

{'eval_loss': 0.9601437449455261,
 'eval_model_preparation_time': 0.0061,
 'eval_accuracy': 0.6358024691358025,
 'eval_precision': 0.6154449951551401,
 'eval_recall': 0.6253985244140848,
 'eval_f1': 0.6160571376586012,
 'eval_ratio': 0.49691358024691357,
 'eval_runtime': 9.4627,
 'eval_samples_per_second': 34.24,
 'eval_steps_per_second': 4.333}

In [128]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Precision,Recall,F1,Ratio
1,No log,1.005981,0.0061,0.675926,0.669363,0.717533,0.670498,0.283951
2,No log,0.831333,0.0061,0.728395,0.732509,0.739211,0.726572,0.324074
3,No log,0.949371,0.0061,0.716049,0.701751,0.704672,0.702027,0.398148
4,No log,0.944479,0.0061,0.70679,0.68732,0.753248,0.699346,0.364198
5,No log,1.125282,0.0061,0.694444,0.680048,0.693273,0.680938,0.348765
6,0.471100,1.342149,0.0061,0.728395,0.709094,0.754811,0.725048,0.419753
7,0.471100,1.441786,0.0061,0.731481,0.732322,0.716797,0.721811,0.388889
8,0.471100,1.479547,0.0061,0.737654,0.718458,0.757826,0.733078,0.407407
9,0.471100,1.469518,0.0061,0.728395,0.729115,0.71907,0.721176,0.37963
10,0.471100,1.491513,0.0061,0.731481,0.7092,0.73935,0.720859,0.404321


TrainOutput(global_step=950, training_loss=0.35364036158511514, metrics={'train_runtime': 1018.2954, 'train_samples_per_second': 7.395, 'train_steps_per_second': 0.933, 'total_flos': 1981279566489600.0, 'train_loss': 0.35364036158511514, 'epoch': 10.0})

In [129]:
trainer.evaluate()

{'eval_loss': 1.4915131330490112,
 'eval_model_preparation_time': 0.0061,
 'eval_accuracy': 0.7314814814814815,
 'eval_precision': 0.7092004719577168,
 'eval_recall': 0.7393500434148259,
 'eval_f1': 0.7208589335377923,
 'eval_ratio': 0.404320987654321,
 'eval_runtime': 10.8596,
 'eval_samples_per_second': 29.835,
 'eval_steps_per_second': 3.775,
 'epoch': 10.0}

In [130]:
# Save and push tokenizer along with the model
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

In [131]:
model.push_to_hub("osmanh/Bert-Contact-NLI")
tokenizer.push_to_hub("osmanh/Bert-Contact-NLI")

No files have been modified since last commit. Skipping to prevent empty commit.


tokenizer.json:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/osmanh/Bert-Contact-NLI/commit/1638739bba2bc64db97f5acd6590957fd6b34c37', commit_message='Upload tokenizer', commit_description='', oid='1638739bba2bc64db97f5acd6590957fd6b34c37', pr_url=None, repo_url=RepoUrl('https://huggingface.co/osmanh/Bert-Contact-NLI', endpoint='https://huggingface.co', repo_type='model', repo_id='osmanh/Bert-Contact-NLI'), pr_revision=None, pr_num=None)

## Switches the model to evaluation mode

In [39]:
model.eval()

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(250511, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

## prediction

In [132]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("osmanh/Bert-Contact-NLI")
model = AutoModelForSequenceClassification.from_pretrained("osmanh/Bert-Contact-NLI")

tokenizer_config.json:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [133]:
# Create new pipeline object with our finetuned model and tokenizer
pipeline_object_after_training = pipeline('zero-shot-classification',model=model,tokenizer=tokenizer,device=0)

In [134]:
# Text and label for evaluation before and after training
text = 'Shares of Hyundai Motor  jumped nearly 8% on Wednesday, a day after South Korea announced a "green new deal" to spur use of environmentally friendly vehicles.'
label = 'Shares that rise due to the "green new deal"'

# Evaluating pipeline performance before and after training
print("Results before training: ", pipeline_object_before_training(text,label))
print("Results after training: ", pipeline_object_after_training(text,label))

Results before training:  {'sequence': 'Shares of Hyundai Motor  jumped nearly 8% on Wednesday, a day after South Korea announced a "green new deal" to spur use of environmentally friendly vehicles.', 'labels': ['Shares that rise due to the "green new deal"'], 'scores': [0.987616240978241]}
Results after training:  {'sequence': 'Shares of Hyundai Motor  jumped nearly 8% on Wednesday, a day after South Korea announced a "green new deal" to spur use of environmentally friendly vehicles.', 'labels': ['Shares that rise due to the "green new deal"'], 'scores': [0.9239142537117004]}
