# Import Libraries

In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
import random
import nltk
nltk.download('punkt')

from IPython.display import display, HTML
import torch
import datasets
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import LEDTokenizer, LEDForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing Model

In [2]:
# Hugging Face Hub identifier of the pretrained checkpoint; the same name is
# used again further down when the model weights are loaded.
model_name = "nsi319/legal-led-base-16384"
# Tokenizer loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
# CUDA-related environment switches, written in one update call:
#   CUDA_VISIBLE_DEVICES  -> "1"
#   CUDA_LAUNCH_BLOCKING  -> "1"
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [4]:
# Placeholder for the validation files to be used; starts out empty.
val_files = list()

In [5]:
def getData(dataPath):
    """Load paired judgement/summary text files into a DataFrame.

    ``dataPath`` must contain two sub-directories, ``judgement`` and
    ``summary``; a document and its summary share the same file name.

    Each file is read as UTF-8, every line is stripped, and the lines are
    joined with single spaces. A pair is skipped when either side is missing,
    is not a regular file, is zero bytes long, or yields no lines.

    Parameters
    ----------
    dataPath : str
        Directory holding the ``judgement`` and ``summary`` sub-directories.

    Returns
    -------
    pandas.DataFrame
        Columns ``document`` and ``summary``, one row per usable pair, ordered
        by file name.
    """
    documentPath = f'{dataPath}/judgement'
    summaryPath = f'{dataPath}/summary'
    dataset = {'document': [], 'summary': []}

    def _read_stripped_lines(path):
        # The context manager guarantees the handle is closed even on error.
        with open(path, 'r', encoding='utf8') as handle:
            return [line.strip() for line in handle]

    # os.listdir returns entries in arbitrary order; sort so that row order is
    # reproducible across machines and runs.
    for file in sorted(os.listdir(documentPath)):
        doc_file = f'{documentPath}/{file}'
        summ_file = f'{summaryPath}/{file}'

        # A judgement without a matching summary (or a stray sub-directory)
        # cannot form a training pair, so skip it instead of crashing.
        if not (os.path.isfile(doc_file) and os.path.isfile(summ_file)):
            continue
        if os.path.getsize(doc_file) == 0 or os.path.getsize(summ_file) == 0:
            continue

        doc_lines = _read_stripped_lines(doc_file)
        summ_lines = _read_stripped_lines(summ_file)
        if not doc_lines or not summ_lines:
            continue

        dataset['document'].append(' '.join(doc_lines))
        dataset['summary'].append(' '.join(summ_lines))

    return pd.DataFrame(dataset)

In [6]:
# Experiment tag; it is interpolated into the output and logging directories.
exp = 'exp1'

# Maximum token lengths for tokenizing documents (encoder side) and
# summaries (decoder side).
encoder_max_length = 16 * 1024
decoder_max_length = 1024

# Batch size (shared by dataset mapping and the trainer) and epoch count.
batch_size, n_epochs = 1, 3

IN-Abs: Indian Supreme Court case documents and their "abstractive" summaries

Training Dataset: 7030

Test Dataset: 100

In [12]:
dataPath = "dataset/IN-Abs"

# Read both splits from disk into DataFrames first ...
train_df = getData(dataPath + '/train-data')
test_df = getData(dataPath + '/test-data')

# ... then wrap each DataFrame as a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [13]:
# Summary of the raw training split, followed by its first five records.
print("Train Dataset\n")
print(train_dataset, "\n")
for idx in range(5):
    print(train_dataset[idx])

Train Dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 7028
}) 

{'document': 'Appeal No. LXVI of 1949. Appeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022. K.M. Munshi (N. P. Nathvani, with him), for the appel lant. \' M.C. Setalvad, Attorney General for India (H. J. Umrigar, with him), for the respondent. 1950. May 26. The judgment of the Court was delivered by MEHR CHAND MAHAJAN J. This is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 (1) (iv) of the Indian Income tax Act. The assessee company is an investment company deriving its income from properties in the city of Bombay. For the assessment year 1940 41 the net income of the assessee under the head "property" was computed by

In [14]:
# Summary of the raw test split, followed by its first five records.
print("Test Dataset\n")
print(test_dataset, "\n")
for idx in range(5):
    print(test_dataset[idx])

Test Dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 100
}) 

{'document': 'Appeal No. 101 of 1959. Appeal by special leave from the judgment and order dated November 8, 1957, of the Deputy Custodian General, Evacuee Property, Now Delhi Revision Petition No. 17 R/55 of 1955. Achhru Ram and K. L. Mehta for the appellants. B.K., Khanna and, T. M. Sen, for the respondent No. 1. N.S. Bindra and A. G. Ratnaparkhi, for the respondents Nos. March 15. The Judgment of the Court was delivered by MUDHOLKAR J. The appellants who are admittedly displaced persons from West Pakistan were granted quasi permanent allotment of 24 standard acres and 15 3/4 units in the village of Raikot in Ludhiana District in 1949. Their father Sardar Nand Singh who was 42 330 found entitled to quasi permanent allotment of 40 standard acres and 5 1/4 units of land was given quasipermanent allotment in another village named Humbran in the same district. The two villages are, however, 25 miles or s

### Preprocess Data

In [15]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of document/summary pairs into model inputs.

    Reads ``batch["document"]`` and ``batch["summary"]`` and relies on the
    notebook-level ``tokenizer``, ``encoder_max_length`` and
    ``decoder_max_length``. Both sides are padded to their maximum length and
    truncated.

    Adds the following keys to ``batch`` and returns it:

    * ``input_ids`` / ``attention_mask`` - tokenized documents.
    * ``global_attention_mask`` - all zeros except a 1 at position 0 of every
      sample.
    * ``labels`` - tokenized summaries with every pad token id replaced by
      -100 so that the PAD token is ignored.
    """
    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["document"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Build one independent mask per sample with global attention on the first
    # token only. Rows are separate lists, so the result does not depend on
    # list aliasing, and an empty batch simply yields an empty list.
    global_attention_mask = []
    for sample_ids in batch["input_ids"]:
        sample_mask = [0] * len(sample_ids)
        if sample_mask:
            sample_mask[0] = 1
        global_attention_mask.append(sample_mask)
    batch["global_attention_mask"] = global_attention_mask

    # We have to make sure that the PAD token is ignored
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch


In [16]:
# Tokenize the training split.
# The raw text columns are dropped, and `train_dataset` is rebound to the
# tokenized result.
train_dataset = train_dataset.map(
    function=process_data_to_model_inputs,
    remove_columns=["document", "summary"],
    batched=True,
    batch_size=batch_size,
)

Map: 100%|██████████| 7028/7028 [08:25<00:00, 13.89 examples/s]


In [17]:
# Tokenize the test split.
# The raw text columns are dropped, and `test_dataset` is rebound to the
# tokenized result.
test_dataset = test_dataset.map(
    function=process_data_to_model_inputs,
    remove_columns=["document", "summary"],
    batched=True,
    batch_size=batch_size,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:08<00:00, 12.01 examples/s]


In [18]:
# Summary of the tokenized training split, followed by its first two records.
print("Train Dataset\n")
print(train_dataset, "\n")
for idx in range(2):
    print(train_dataset[idx])

Train Dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 7028
}) 

{'input_ids': [0, 19186, 18696, 440, 4, 43908, 15176, 9, 25345, 4, 16049, 31, 5, 755, 837, 9, 21392, 636, 18830, 6, 28197, 6, 11, 10, 5135, 223, 2810, 5138, 9, 5, 1362, 9628, 629, 1783, 6, 158, 2036, 4, 229, 4, 448, 4, 6760, 24441, 36, 487, 4, 221, 4, 22634, 705, 1543, 6, 19, 123, 238, 13, 5, 1553, 523, 784, 927, 4, 128, 256, 4, 347, 4, 8504, 337, 705, 625, 6, 2745, 1292, 13, 666, 36, 725, 4, 344, 4, 12698, 7638, 271, 6, 19, 123, 238, 13, 5, 33802, 4, 9323, 4, 392, 973, 4, 20, 7579, 9, 5, 837, 21, 2781, 30, 12341, 16271, 3858, 5945, 8981, 6826, 863, 1889, 344, 4, 152, 16, 41, 2868, 136, 10, 7579, 9, 5, 755, 837, 9, 19691, 636, 18830, 23, 28197, 11, 41, 1425, 629, 948, 8, 24, 7700, 5, 864, 549, 32868, 13850, 8750, 1038, 629, 8, 4879, 15192, 7067, 868, 1038, 629, 21467, 223, 5, 4249, 28197, 37219, 32, 41741, 23091, 223, 2810, 361, 36, 134, 43, 36, 1879, 43, 9

In [19]:
# Summary of the tokenized test split, followed by its first two records.
print("Test Dataset\n")
print(test_dataset, "\n")
for idx in range(2):
    print(test_dataset[idx])

Test Dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 100
}) 

{'input_ids': [0, 19186, 18696, 440, 4, 6560, 9, 23342, 4, 16049, 30, 780, 989, 31, 5, 7579, 8, 645, 7000, 759, 290, 6, 23778, 6, 9, 5, 4269, 37513, 1630, 811, 1292, 6, 7652, 1043, 1780, 242, 10491, 6, 978, 3534, 45323, 40505, 440, 4, 601, 248, 73, 3118, 9, 24436, 4, 23790, 298, 2070, 3513, 8, 229, 4, 226, 4, 1464, 6083, 102, 13, 5, 44361, 3277, 4, 163, 4, 530, 482, 2218, 4057, 8, 6, 255, 4, 256, 4, 2211, 6, 13, 5, 33802, 440, 4, 112, 4, 234, 4, 104, 4, 37685, 763, 8, 83, 4, 272, 4, 12041, 282, 1115, 3994, 3592, 6, 13, 5, 10011, 29603, 4, 494, 379, 4, 20, 44319, 9, 5, 837, 21, 2781, 30, 256, 13083, 725, 3384, 530, 2747, 344, 4, 20, 44361, 3277, 54, 32, 30889, 9871, 5151, 31, 580, 1752, 58, 4159, 32064, 4398, 29887, 1757, 9, 706, 2526, 6419, 8, 379, 155, 73, 306, 2833, 11, 5, 3375, 9, 4833, 967, 1242, 11, 21024, 298, 8878, 1384, 11, 25345, 4, 2667, 1150, 23892

In [20]:
# Have both splits return PyTorch tensors (instead of Python lists) for the
# four columns that are fed to the model.
for split in (train_dataset, test_dataset):
    split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
    )

## Training Model

### Evaluation Metrics: ROUGE

In [10]:
# Loading Rouge Metric
# The resulting object is used by compute_metrics below, which reads
# `.mid.fmeasure` from every score it returns.
# NOTE(review): the captured output of this cell warns that `load_metric` will
# require `trust_remote_code=True` from the next major `datasets` release.
rouge = load_metric("rouge")

def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Every string is stripped of surrounding whitespace and re-joined with one
    sentence per line (sentences come from ``nltk.sent_tokenize``), because
    rougeLSum expects a newline after each sentence.

    Returns the processed ``(preds, labels)`` pair, each as a list of strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels


def compute_metrics(pred):
    """Compute ROUGE F-measures (as percentages) and mean generated length.

    Parameters
    ----------
    pred
        Object exposing ``predictions`` (generated token ids, or a tuple whose
        first element holds them) and ``label_ids`` (reference token ids, with
        -100 marking ignored positions).

    Returns
    -------
    dict
        One entry per ROUGE key returned by ``rouge.compute`` (mid F-measure
        multiplied by 100) plus ``gen_len``, the mean number of non-pad tokens
        per prediction. All values are rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is not a valid token id. It can appear in predictions as well as in
    # labels, so map it to the pad id on both sides before decoding. np.where
    # builds new arrays, leaving the caller's arrays untouched.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Some simple post-processing
    pred_str, label_str = postprocess_text(pred_str, label_str)

    result = rouge.compute(
        predictions=pred_str, references=label_str, use_stemmer=True
    )

    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(row != pad_id) for row in pred_ids]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Define Model

In [9]:
training_args = Seq2SeqTrainingArguments(
    # Output and logging locations, both keyed by the experiment tag.
    output_dir=f"results/led/final/{exp}",
    logging_dir=f"led_logs/final/{exp}",
    logging_steps=50,
    # Schedule and batch shape.
    num_train_epochs=n_epochs,
    warmup_steps=200,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # fp16=True,
    # Evaluate and checkpoint once per epoch; the best checkpoint is selected
    # by the highest eval_rouge2 and reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # save only the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_rouge2",
    greater_is_better=True,
    # Generate sequences during evaluation.
    predict_with_generate=True,
)

In [21]:
# Load the pretrained LED checkpoint; gradient checkpointing is requested and
# the cache is disabled for checkpointing.
led = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)

# led.resize_token_embeddings(len(tokenizer))

# Generation hyperparameters, stored on the model config.
led.config.num_beams = 2
led.config.max_length = decoder_max_length
led.config.min_length = 256
# led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 4

# Instantiate the trainer; the test split doubles as the evaluation set.
trainer = Seq2SeqTrainer(
    model=led,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [22]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# Save the fine-tuned model.
# (An earlier variant saved to f"results/led/{exp}/best_model" instead.)
trainer.save_model(output_dir="./final_model/IN_model")

  0%|          | 0/5271 [00:00<?, ?it/s]

KeyboardInterrupt: 