In [13]:
# Environment setup (presumably Google Colab, given the /content paths used later).
# NOTE(review): none of these installs are version-pinned, and peft is installed from the
# GitHub default branch, so a fresh run may pull different library versions than the ones
# that produced the saved outputs — consider pinning.
!pip install datasets
!pip install transformers
!pip install -q -U trl accelerate git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes einops sentencepiece
# NOTE(review): the reason for removing apex is not recorded in this notebook — confirm it is still needed.
!pip uninstall -y apex

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

In [14]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, pipeline, TextClassificationPipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import shap
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import combinations
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from tqdm import tqdm
from sklearn.metrics import classification_report
import re
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from transformers import set_seed
from sklearn.model_selection import KFold
import os
import json


In [15]:
# Model selector (1-7); each value maps to a checkpoint in the if/elif chain below
# (7 = google/bigbird-roberta-large).
flag = 7
# Run mode: 1 trains every fold; any other value trains only the fold given by fold_index.
flag2 = 1
# Zero-based fold number used when flag2 != 1 (compared against enumerate() over kf.split).
fold_index = 1
# Shared seed: passed to set_seed() and to KFold(random_state=...).
seed = 42
# Learning rate handed to TrainingArguments.
learning_rate = 2e-5
# Per-device batch size, used for both training and evaluation.
batch_size = 2
# Number of training epochs per fold.
epochs = 1
# Passed to TrainingArguments as metric_for_best_model; multi_label_metrics reports a
# "macro_f1" entry under this name.
metric_name = "macro_f1"


In [16]:
# Seed once, up front, with the `seed` value from the config cell (the same value is reused
# as KFold's random_state). transformers.set_seed presumably seeds the Python, NumPy and
# torch RNGs — see the library documentation.
set_seed(seed)

In [17]:
# Supported backbones: flag -> (checkpoint name, output folder, max sequence length).
# Flags 6 and 7 tokenise to 4096 tokens; every other model uses 512.
MODEL_REGISTRY = {
    1: ("bert-base-uncased", "/content/bert/", 512),
    2: ("nlpaueb/legal-bert-base-uncased", "/content/legal-bert/", 512),
    3: ("law-ai/InCaseLawBERT", "/content/case-bert/", 512),
    4: ("FacebookAI/roberta-base", "/content/roberta/", 512),
    5: ("microsoft/deberta-base", "/content/deberta/", 512),
    6: ("allenai/longformer-base-4096", "/content/longformer/", 4096),
    7: ("google/bigbird-roberta-large", "/content/bigbird/", 4096),
}

# Fail fast on an unsupported flag. Previously such a value left `model_name` and
# `folder` undefined (or, worse, silently reused values from an earlier kernel run).
if flag not in MODEL_REGISTRY:
    raise ValueError(
        f"Unsupported flag {flag!r}; expected one of {sorted(MODEL_REGISTRY)}"
    )

# model_name: checkpoint to load; folder: per-model output root (one sub-folder per fold);
# max_length: padding/truncation length used by preprocess_data.
model_name, folder, max_length = MODEL_REGISTRY[flag]

# NOTE(review): the saved BigBird output reports "sequence_length: 512" although flag 7
# maps to max_length = 4096 — the stored outputs may come from a stale kernel state;
# re-run from a fresh kernel to confirm.







In [18]:
# Target classes of the multi-label task; the list order fixes the column order of the
# label matrix and the class indices used by the model config.
labels_names = [
    'Conditional Clauses',
    'Cross-Dependent Clauses',
    'Legal Terminology',
    'Ambiguity in Expression',
    'Interpretable',
]

# Two-way lookup between class index and label name, following labels_names order.
id2label = dict(enumerate(labels_names))
label2id = {name: index for index, name in id2label.items()}

In [19]:
# Tokenizer for the checkpoint selected via `flag`; it is used by preprocess_data and is
# also handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

In [20]:
def preprocess_data(examples):
    """Tokenise a batch of (text, Context) pairs and attach multi-hot float labels.

    Args:
        examples: a batch in column form (this function is mapped with batched=True);
            it must provide 'text', 'Context' and one column per name in labels_names.

    Returns:
        The tokenizer output with an extra "labels" entry: one list of floats per
        example, ordered like labels_names.
    """
    # Sentence-pair encoding, padded/truncated to the module-level max_length.
    encoding = tokenizer(
        examples['text'],
        examples['Context'],
        add_special_tokens=True,  # Includes [CLS] and [SEP]
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

    # Float matrix of shape (batch_size, num_labels), filled one label column at a time.
    n_rows = len(examples['text'])
    targets = np.zeros((n_rows, len(labels_names)))
    for column, name in enumerate(labels_names):
        targets[:, column] = examples[name]

    encoding["labels"] = targets.tolist()
    return encoding


In [21]:
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single level.

    Keys of nested dicts are prefixed with their parent's key, joined by `sep`
    (e.g. {"a": {"b": 1}} -> {"a_b": 1}). Empty nested dicts contribute nothing.

    Args:
        d: the (possibly nested) dictionary to flatten.
        parent_key: prefix for every key in `d`; a falsy value means "no prefix".
        sep: separator placed between a prefix and a key.

    Returns:
        A new flat dict; `d` itself is not modified.
    """
    flat = {}

    for key, value in d.items():
        full_key = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            # Descend, carrying the accumulated prefix along.
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value

    return flat


def drop_precision_recall(d):
    """Return a copy of `d` without any 'precision' or 'recall' keys, at every depth.

    Only dict values are descended into; anything that is not a dict (numbers,
    lists, ...) is returned unchanged. The input is not modified.
    """
    if not isinstance(d, dict):
        return d

    # Build a fresh dict, skipping the unwanted keys and recursing into the rest.
    return {
        key: drop_precision_recall(value)
        for key, value in d.items()
        if key not in ('precision', 'recall')
    }




def save_results(base_path):
    """Aggregate per-fold evaluation metrics into <base_path>/summary_metrics.json.

    Every direct sub-folder of `base_path` that contains an "all_results.json" file
    is read; for each tracked metric present in that file the value is collected.
    The summary holds the mean and np.std (default settings) of each metric that was
    found at least once. Note that every matching sub-folder on disk is included,
    not only those written by the current run.

    Args:
        base_path: directory holding one sub-folder per fold.
    """
    tracked_keys = [
        "eval_Ambiguity in Expression_f1-score",
        "eval_Conditional Clauses_f1-score",
        "eval_Cross-Dependent Clauses_f1-score",
        "eval_Interpretable_f1-score",
        "eval_Legal Terminology_f1-score",
        "eval_accuracy",
        "eval_loss",
        "eval_macro avg_f1-score",
        "eval_macro_f1",
        "eval_micro avg_f1-score",
        "eval_micro_f1",
        "eval_samples avg_f1-score",
        "eval_weighted avg_f1-score",
    ]
    collected = {key: [] for key in tracked_keys}

    for entry in os.listdir(base_path):
        fold_dir = os.path.join(base_path, entry)
        if not os.path.isdir(fold_dir):
            continue

        results_file = os.path.join(fold_dir, "all_results.json")
        if not os.path.exists(results_file):
            continue

        with open(results_file, "r") as f:
            data = json.load(f)

        for key in tracked_keys:
            if key in data:
                collected[key].append(data[key])

    # Metrics that never appeared in any fold are left out of the summary.
    summary_metrics = {
        key: {"mean": np.mean(values), "std": np.std(values)}
        for key, values in collected.items()
        if values
    }

    # Save the summary metrics to a JSON file
    output_file = os.path.join(base_path, "summary_metrics.json")
    with open(output_file, "w") as f:
        json.dump(summary_metrics, f, indent=4)

    print(f"Summary metrics saved to {output_file}")



def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute multi-label classification metrics from raw logits.

    Args:
        predictions: array-like of logits, one row per example and one column per label.
        labels: ground-truth multi-hot matrix with the same layout.
        threshold: a label is predicted when its sigmoid probability is >= this value.

    Returns:
        A flat dict containing "micro_f1", "macro_f1", "accuracy" and the entries of
        sklearn's classification report (precision/recall removed), with nested keys
        joined by "_" (e.g. "<label>_f1-score", "<label>_support").
    """
    # Logits -> independent per-label probabilities (float32, as before).
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32))

    # Binarise with the threshold; a float 0/1 matrix, same layout as `labels`.
    y_pred = (probs >= threshold).numpy().astype(float)
    y_true = labels

    metrics = {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0),
        "accuracy": accuracy_score(y_true, y_pred),
    }

    # zero_division=0 mirrors the f1_score calls above. Without it the report still
    # scores undefined cases as 0 but emits an UndefinedMetricWarning for every label
    # that has no predicted or no true samples, flooding the training log.
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        target_names=labels_names,
        zero_division=0,
    )

    metrics.update(drop_precision_recall(report))

    return flatten_dict(metrics)

def compute_metrics(p: EvalPrediction):
    """Trainer metrics hook: unwrap the logits and delegate to multi_label_metrics.

    When `p.predictions` is a tuple, only its first element is used as the logits;
    otherwise `p.predictions` is used as-is. Labels come from `p.label_ids`.
    """
    raw_predictions = p.predictions
    logits = raw_predictions[0] if isinstance(raw_predictions, tuple) else raw_predictions
    return multi_label_metrics(predictions=logits, labels=p.label_ids)




In [22]:
def train(train_df,test_df,fold):
    """Fine-tune `model_name` on one cross-validation fold and save its metrics.

    A fresh model is loaded for the fold, both frames are tokenised with
    preprocess_data, and a Trainer is run with the module-level hyper-parameters.
    Train and eval metrics are logged and saved under f"{folder}{fold}".

    Args:
        train_df: pandas DataFrame with the training rows of this fold.
        test_df: pandas DataFrame with the held-out rows of this fold (used as the
            Trainer's eval dataset).
        fold: fold identifier; used as the name of the output sub-folder.

    Returns:
        dict: the metrics returned by the final trainer.evaluate() call.
    """
    import gc  # local import: only needed for the end-of-fold cleanup below

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        problem_type="multi_label_classification",
        num_labels=len(labels_names),
        id2label=id2label,
        label2id=label2id,
    )

    train_ds = Dataset.from_pandas(train_df)
    val_ds = Dataset.from_pandas(test_df)

    # Drop every original column so only the tokenizer output and "labels" remain.
    encoded_dataset_train = train_ds.map(preprocess_data, batched=True, remove_columns=train_ds.column_names)
    encoded_dataset_val = val_ds.map(preprocess_data, batched=True, remove_columns=val_ds.column_names)

    encoded_dataset_train.set_format("torch")
    encoded_dataset_val.set_format("torch")

    # NOTE(review): `evaluation_strategy` (and `tokenizer=` on Trainer below) have been
    # deprecated in newer transformers releases in favour of `eval_strategy` /
    # `processing_class`; kept as-is for compatibility with the version this notebook
    # was run on — confirm against the installed version.
    # NOTE(review): load_best_model_at_end is not set, so the final evaluate() presumably
    # scores the last-epoch weights rather than the best checkpoint — confirm intended.
    args = TrainingArguments(
        f"{folder}{str(fold)}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        save_total_limit=2,
        metric_for_best_model=metric_name,
        report_to="none",
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=encoded_dataset_train,
        eval_dataset=encoded_dataset_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    train_output = trainer.train()
    eval_metrics = trainer.evaluate()

    trainer.log_metrics("train", train_output.metrics)
    trainer.save_metrics("train", train_output.metrics)

    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)

    # Release GPU memory before the next fold. The trainer holds its own reference to
    # the model (and optimizer state), so deleting only `model` would leave everything
    # alive and empty_cache() would have nothing to give back.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

    return eval_metrics


In [23]:
# Load the annotated samples. preprocess_data reads the 'text' and 'Context' columns plus
# one column per entry in labels_names.
# NOTE(review): hard-coded absolute path — the file must be present at this location
# before this cell is run.
df = pd.read_csv("/content/sample-data-final.csv")

In [None]:
# 5-fold cross-validation over the rows of `df`, shuffled reproducibly with `seed`.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

# flag2 == 1 -> train every fold; any other value -> train only fold `fold_index`.
run_all_folds = (flag2 == 1)

# Fail loudly on a fold number that does not exist; previously such a value made the
# single-fold mode finish without training or saving anything.
if not run_all_folds and fold_index not in range(N_SPLITS):
    raise ValueError(
        f"fold_index must be between 0 and {N_SPLITS - 1}, got {fold_index!r}"
    )

for fold, (train_index, test_index) in enumerate(kf.split(df)):
    if not run_all_folds and fold != fold_index:
        continue

    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    print()
    print(f"Fold {fold}:")
    print()
    train(train_df, test_df, fold)

# Aggregate whatever per-fold results exist under `folder`. In single-fold mode this
# also picks up results left on disk by earlier runs of other folds.
save_results(folder)


Fold 0:



pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

  trainer = Trainer(
Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Accuracy,Conditional clauses F1-score,Conditional clauses Support,Cross-dependent clauses F1-score,Cross-dependent clauses Support,Legal terminology F1-score,Legal terminology Support,Ambiguity in expression F1-score,Ambiguity in expression Support,Interpretable F1-score,Interpretable Support,Micro avg F1-score,Micro avg Support,Macro avg F1-score,Macro avg Support,Weighted avg F1-score,Weighted avg Support,Samples avg F1-score,Samples avg Support
1,No log,0.413024,0.666667,0.177778,0.4,0.888889,12.0,0.0,2.0,0.0,5.0,0.0,2.0,0.0,0.0,0.666667,21.0,0.177778,21.0,0.507937,21.0,0.666667,21.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


***** train metrics *****
  epoch                    =        1.0
  total_flos               =    49643GF
  train_loss               =     0.4875
  train_runtime            = 0:03:05.66
  train_samples_per_second =      0.307
  train_steps_per_second   =      0.156
***** eval metrics *****
  epoch                                 =        1.0
  eval_Ambiguity in Expression_f1-score =        0.0
  eval_Ambiguity in Expression_support  =        2.0
  eval_Conditional Clauses_f1-score     =     0.8889
  eval_Conditional Clauses_support      =       12.0
  eval_Cross-Dependent Clauses_f1-score =        0.0
  eval_Cross-Dependent Clauses_support  =        2.0
  eval_Interpretable_f1-score           =        0.0
  eval_Interpretable_support            =        0.0
  eval_Legal Terminology_f1-score       =        0.0
  eval_Legal Terminology_support        =        5.0
  eval_accuracy                         =        0.4
  eval_loss                             =      0.413
  eval_macro avg_f1-

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

  trainer = Trainer(
Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Accuracy,Conditional clauses F1-score,Conditional clauses Support,Cross-dependent clauses F1-score,Cross-dependent clauses Support,Legal terminology F1-score,Legal terminology Support,Ambiguity in expression F1-score,Ambiguity in expression Support,Interpretable F1-score,Interpretable Support,Micro avg F1-score,Micro avg Support,Macro avg F1-score,Macro avg Support,Weighted avg F1-score,Weighted avg Support,Samples avg F1-score,Samples avg Support
1,No log,0.36913,0.717949,0.193103,0.466667,0.965517,14.0,0.0,1.0,0.0,8.0,0.0,1.0,0.0,0.0,0.717949,24.0,0.193103,24.0,0.563218,24.0,0.76,24.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


***** train metrics *****
  epoch                    =        1.0
  total_flos               =    49643GF
  train_loss               =     0.5211
  train_runtime            = 0:03:09.09
  train_samples_per_second =      0.301
  train_steps_per_second   =      0.153
***** eval metrics *****
  epoch                                 =        1.0
  eval_Ambiguity in Expression_f1-score =        0.0
  eval_Ambiguity in Expression_support  =        1.0
  eval_Conditional Clauses_f1-score     =     0.9655
  eval_Conditional Clauses_support      =       14.0
  eval_Cross-Dependent Clauses_f1-score =        0.0
  eval_Cross-Dependent Clauses_support  =        1.0
  eval_Interpretable_f1-score           =        0.0
  eval_Interpretable_support            =        0.0
  eval_Legal Terminology_f1-score       =        0.0
  eval_Legal Terminology_support        =        8.0
  eval_accuracy                         =     0.4667
  eval_loss                             =     0.3691
  eval_macro avg_f1-

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

  trainer = Trainer(
Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Epoch,Training Loss,Validation Loss
