# Fine Tuning

In this notebook, we fine-tune a model to generate good-quality headnotes for Indian court judgements.

## Imports and Configs

In [None]:
%pip install datasets
%pip install evaluate
%pip install bitsandbytes
%pip install rouge_score
%pip install bert_score
%pip install bleu_score
%pip install hf_xet

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia

In [None]:
from datasets import Dataset, DatasetDict
from google.colab import userdata
from huggingface_hub import login
from peft import AutoPeftModelForCausalLM, get_peft_model, LoraConfig, prepare_model_for_kbit_training
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer
import ast
import csv
import evaluate
import nltk
import numpy as np
import os
import pandas as pd
import re
import torch
import wandb

In [None]:
# i have trouble installing this sometimes so just check that it works separately
import bitsandbytes
print(bitsandbytes.__version__)

0.46.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Task instruction shared by every example: it is stored in the "instruction" column of the
# dataset and rendered into the "### Instruction:" section of each prompt by `tokenize_entry`.
# NOTE: this text is part of the prompt the model is trained on, so any edit to it changes
# what the model sees at both training and generation time.
INSTRUCTION = """\
Case judgements represent rulings issued by judges in judicial proceedings, often comprising extensive transcripts spanning dozens of pages.
Your task is to compose the headnote for the given case judgement. An excellent headnote captures the core of the judgment properly, succinctly, and completely.

Here are the core elements of a headnote:
- PAST TENSE: The headnote is written in the past tense.
- METADATA: The headnote includes the case name, judgement number, court, judge(s), and date of judgement.
- INTRODUCTION: The headnote introduction describes the field of law that the case deals with, and does not directly jump into the case details.
- KEY FACTS: The headnote includes who filed the case, why the case was filed, and what remedy the filer wants.
- KEY ARGUMENTS: The headnote includes both sides' submissions and reasons to support their position.
- RELEVANT LAWS: The headnote references relevant legislation and concepts to support the case judgement.
- CONCLUSION: The headnote includes the case's conclusion and procedural disposition (ex: permitted, dismissed, reversed, remanded, affirmed, etc.).

VERY IMPORTANT: Do not generate multiple paragraphs or sections. Write a single paragraph that does not exceed 800 words, or a single page.
"""

In [None]:
# Examples whose tokenized prompt has this many tokens or more are dropped before the split.
TOKENIZER_CUTOFF = 10000
# Seed shared by the dataset splits and the Trainer.
SEED = 8

In [None]:
# Authenticate with the Hugging Face Hub and Weights & Biases; both tokens are read from
# Colab's `userdata` store so that no secret is written into the notebook itself.
login(userdata.get('HF_TOKEN'))
wandb.login(key=userdata.get('WANDB_TOKEN'))

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmihikabairathi[0m ([33mmihikabairathi-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Allocator setting for PyTorch's CUDA memory manager.
# NOTE(review): presumably meant to reduce fragmentation with very variable sequence lengths, and
# presumably only effective if set before CUDA is first initialised in this kernel — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Load the three text-generation metrics used in the evaluation section, in a single pass.
rouge_score, bert_score, bleu_score = (
    evaluate.load(metric_name) for metric_name in ("rouge", "bertscore", "bleu")
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
# English vocabulary used by `clean_headnote` to spot words that were split in two.
nltk.download('words')
# Stored as a set because the vocabulary is only ever used for membership tests, several per
# word of every generated headnote; looking a word up in the raw list means a linear scan of
# the whole vocabulary each time.
correct_words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


## Things to Configure

In [None]:
# Hugging Face Hub id of the base checkpoint to fine-tune.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# Google Drive folder holding the graded-headnotes CSV and everything this notebook writes.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/RC"
# Trainer checkpoints and the final saved model go here.
OUTPUT_DIR = BASE_OUTPUT_DIR + "/trainer_output"
# NOTE: despite the `_DIR` suffix, the next two values are CSV *file* paths.
# Held-out test split, written once so generation can be resumed on the same rows.
TEST_FILE_DIR = f'{BASE_OUTPUT_DIR}/test_dataset.csv'
# Judgement / reference headnote / generated headnote rows, appended as they are generated.
HEADNOTES_OUTPUT_DIR = f'{BASE_OUTPUT_DIR}/test_dataset_results.csv'
# Run name passed to the Trainer (reported to Weights & Biases).
RUN_NAME = "final_run_meta"
# LoRA rank and alpha passed to LoraConfig.
LORA_R = 32
LORA_ALPHA = 64
# Optimisation hyperparameters passed to TrainingArguments.
GRADIENT_ACCUMULATION_STEPS = 16
NUM_TRAIN_EPOCHS = 5
LEARNING_RATE = 1e-4

## Data Selection

In [None]:
# Load the judgement/headnote pairs with their quality grades, keeping a single row per
# distinct headnote and discarding every row that has a missing value.
df = (
    pd.read_csv(f'{BASE_OUTPUT_DIR}/judgements_and_graded_headnotes.csv')
    .drop_duplicates(subset=['headnote'])
    .dropna()
)

# how many headnotes received each grade
df["cleaned_response"].value_counts()

Unnamed: 0_level_0,count
cleaned_response,Unnamed: 1_level_1
MEDIUM,5618
GREAT,1180
POOR,216
INVALID RESPONSE,62


In [None]:
# Discard the POOR / INVALID RESPONSE rows; what is left is graded GREAT or MEDIUM.
df = df[~df["cleaned_response"].isin(["POOR", "INVALID RESPONSE"])]

# Oversample: every GREAT row is appended a second time.
good_df = df[df["cleaned_response"] == "GREAT"]

# Drop the grading columns and reshape into the Alpaca layout
# (instruction / input / output) expected by the prompt builder.
df = (
    pd.concat([df, good_df])
    .drop(columns=["cleaned_response", "response"])
    .rename(columns={"judgement": "input", "headnote": "output"})
    .assign(instruction=INSTRUCTION)
)
df.describe()

Unnamed: 0,input,output,instruction
count,7978,7978,7978
unique,6798,6798,1
top,No. LIX of 1949.\nAppeal from the judgment of ...,In construing a document whether in English or...,Case judgements represent rulings issued by ju...
freq,2,2,7978


## Data Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fall back to the EOS token for padding when the checkpoint does not define a pad token.
# NOTE(review): when pad == EOS, a collator that masks pad positions out of the labels would
# also mask genuine EOS tokens — confirm whether that matters for learning when to stop.
if not tokenizer.pad_token:
  tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def tokenize_entry(data_entries, use_output=True):
    """Render a batch of entries into Alpaca-style prompts and tokenize them.

    Args:
        data_entries: a column-indexable batch (anything supporting
            ``data_entries['instruction']``, ``['input']`` and ``['output']``, each iterable).
        use_output: when True the reference headnote is written into the
            "### Response:" section (training); when False that section is left
            empty so the model has to produce it (generation).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of prompts
        (no padding is requested here).

    NOTE(review): the template lines below are indented *inside* the string literal, so
    every line of the rendered prompt starts with that indentation. Nothing here appends
    an explicit EOS token after the response either. Both are kept as-is because changing
    the prompt format would no longer match a model already trained with it.
    """
    columns = zip(data_entries['instruction'], data_entries['input'], data_entries['output'])

    prompts = []
    for instruction, judgement, headnote in columns:
      # blank response section when the model is expected to write the headnote itself
      response = headnote if use_output else ""
      prompts.append(f"""\
        Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

        ### Instruction:
        {instruction}

        ### Input:
        {judgement}

        ### Response:
        {response}
      """
    )

    # padding is deliberately left to the data collator (dynamic padding per batch),
    # which suits inputs whose lengths vary this much
    return tokenizer(prompts)

In [None]:
# preserve_index=False keeps the pandas index out of the dataset altogether. The previous
# approach removed the auto-generated "__index_level_0__" column afterwards, but that column
# only exists when the DataFrame index is not a default RangeIndex, so the removal would
# raise as soon as the upstream filtering stopped producing a custom index.
ds = Dataset.from_pandas(df, preserve_index=False)
ds = ds.map(tokenize_entry, batched=True)

# due to excessive GPU usage, we will not consider inputs with too many tokens
ds = ds.filter(lambda example: len(example['input_ids']) < TOKENIZER_CUTOFF)

Map:   0%|          | 0/7978 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7978 [00:00<?, ? examples/s]

In [None]:
# The GREAT rows were duplicated upstream to oversample them. Splitting the oversampled
# dataset directly lets copies of the very same judgement/headnote pair land in both the
# training split and the eval/test splits, which leaks held-out examples into training and
# inflates the reported metrics. So: split the *distinct* examples, then give the training
# split every copy (original + oversampled) of the examples assigned to it.
example_keys = list(zip(ds["input"], ds["output"]))
first_seen = {}
for row_idx, key in enumerate(example_keys):
  first_seen.setdefault(key, row_idx)
unique_ds = ds.select(list(first_seen.values()))

# 80% train, 10% eval, 10% test — all measured on distinct examples
train_split = unique_ds.train_test_split(test_size=0.2, shuffle=True, seed=SEED)
test_split = train_split["test"].train_test_split(test_size=0.5, shuffle=True, seed=SEED)

# rows of the full (oversampled) dataset whose example was assigned to train;
# eval and test therefore never contain an example that also appears in train
train_keys = set(zip(train_split["train"]["input"], train_split["train"]["output"]))
train_rows = [row_idx for row_idx, key in enumerate(example_keys) if key in train_keys]

split_ds = DatasetDict({"train": ds.select(train_rows), "eval": test_split["train"], "test": test_split["test"]})
# the test prompts are rebuilt later without the reference headnote, so drop the training-time tokens
split_ds['test'] = split_ds['test'].remove_columns(['input_ids', 'attention_mask'])
split_ds

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'input_ids', 'attention_mask'],
        num_rows: 5574
    })
    eval: Dataset({
        features: ['input', 'output', 'instruction', 'input_ids', 'attention_mask'],
        num_rows: 697
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 697
    })
})

## Train the Model

In [None]:
# QLoRA: the frozen base model is loaded in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    # these next two parameters are being set since we are fine-tuning
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# LoRA adapters are attached to every linear layer; only these weights are trained
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    bias="none",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# the generation cache is switched off on the model that is going to be trained
model.config.use_cache = False

model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

model.print_trainable_parameters()

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338


In [None]:
# Every optimisation / logging / checkpointing setting of the run lives in one object.
training_args = TrainingArguments(
    # bookkeeping
    output_dir=OUTPUT_DIR,
    run_name=RUN_NAME,
    report_to="wandb",
    seed=SEED,

    # what to run
    do_train=True,
    do_eval=True,
    eval_on_start=True,

    # memory and precision
    bf16=True,
    gradient_checkpointing=True,
    group_by_length=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim="adamw_bnb_8bit",

    # optimisation schedule
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.01,
    neftune_noise_alpha=5,

    # log, evaluate and checkpoint on the same 100-step cadence
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=5,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["eval"],
    # causal-LM collator (mlm=False); it pads each batch dynamically
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Resume only when OUTPUT_DIR really contains a Trainer checkpoint folder ("checkpoint-<step>").
# Checking merely that the directory is non-empty is not enough: if it holds other files but
# no checkpoint, asking the Trainer to resume fails instead of starting a fresh run.
resume_from_checkpoint = False
if os.path.isdir(OUTPUT_DIR):
  # the context manager closes the directory iterator even though any() stops early
  with os.scandir(OUTPUT_DIR) as output_entries:
    resume_from_checkpoint = any(
        entry.is_dir() and re.fullmatch(r'checkpoint-\d+', entry.name) is not None
        for entry in output_entries
    )

trainer.train(resume_from_checkpoint=resume_from_checkpoint)
trainer.save_model(OUTPUT_DIR)

Step,Training Loss,Validation Loss
1745,1.0954,1.276749


Could not locate the best model at /content/drive/MyDrive/RC/meta/trainer_output/checkpoint-1700/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


## Generate Test Headnotes

In [None]:
# Persist the held-out split once, so that generation can be resumed across sessions on
# exactly the same rows. Only the text columns are written: the prompts are tokenized right
# after loading (below), so tokenizing before saving would just stringify long token lists
# into the CSV that get overwritten on load anyway.
if not os.path.isfile(TEST_FILE_DIR):
  split_ds['test'].to_csv(TEST_FILE_DIR)

# build the generation prompts (response section left empty)
test_df = pd.read_csv(TEST_FILE_DIR)
tokenized_test_entries = tokenize_entry(test_df, use_output=False)
test_df['input_ids'] = tokenized_test_entries['input_ids']
test_df['attention_mask'] = tokenized_test_entries['attention_mask']

# one row is appended to the results file per processed record, so its length tells us
# how many leading records can be skipped when resuming
if os.path.exists(HEADNOTES_OUTPUT_DIR):
  num_processed = len(pd.read_csv(HEADNOTES_OUTPUT_DIR))
else:
  num_processed = 0

test_df = test_df.iloc[num_processed:]
print(f'Number of records left to process: {len(test_df)}')
test_df.head()

Number of records left to process: 130


Unnamed: 0,input,output,instruction,input_ids,attention_mask
567,ivil Appeal No. 165 of 1974 etc.\nFrom the Jud...,"Section 7D of the Kerala Land Reforms Act, 196...",Case judgements represent rulings issued by ju...,"[128000, 286, 21883, 374, 459, 7754, 430, 1696...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
568,ivil Appeal No. 1043 of 1990.\nFrom the Judgme...,One Motilal who owned Goyal Talkies entered in...,Case judgements represent rulings issued by ju...,"[128000, 286, 21883, 374, 459, 7754, 430, 1696...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
569,l Appeals Nos. 1494 to 1498 of 1971.\nAppeals ...,In pursuance of an agreement between the asses...,Case judgements represent rulings issued by ju...,"[128000, 286, 21883, 374, 459, 7754, 430, 1696...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
570,: Criminal Appeal No. 37 of 1991.\nFrom the Ju...,Respondent No. 1 is a firm dealing in medicine...,Case judgements represent rulings issued by ju...,"[128000, 286, 21883, 374, 459, 7754, 430, 1696...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
571,Civil Appeal Nos. 95 and 96 of 1971.\nFrom the...,"By an order dated 31st December, 1968, the sal...",Case judgements represent rulings issued by ju...,"[128000, 286, 21883, 374, 459, 7754, 430, 1696...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Reload the base model together with the adapter that was saved to OUTPUT_DIR.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def append_to_csv(entries, path=None):
    """Append rows to the generated-headnotes CSV, writing the header first if needed.

    Args:
        entries: list of rows, each ``[judgement, test_headnote, generated_headnote]``.
        path: CSV file to append to; defaults to ``HEADNOTES_OUTPUT_DIR``.

    The header row is written when the file does not exist yet or is empty.
    """
    if path is None:
        path = HEADNOTES_OUTPUT_DIR

    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    # newline='' is what the csv module requires so that it controls line endings itself
    # (the fields here contain embedded newlines); the explicit encoding keeps the file
    # independent of the platform default
    with open(path, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['Judgement', 'Test Headnote', 'Generated Headnote'])
        writer.writerows(entries)

In [None]:
def clean_headnote(headnote, vocabulary=None):
  """Turn the decoded model output into a clean, single headnote string.

  Steps, in order:
    1. keep only the text after the first '### Response:' marker (if present) and before
       the next '###' heading;
    2. cut everything after the last period, dropping a trailing unfinished sentence;
    3. collapse runs of whitespace and join lines that were broken mid-sentence;
    4. lower-case stray 'HELD' words;
    5. re-join words that were split in two (e.g. 'judg ment' -> 'judgment').

  Args:
    headnote: decoded text, normally the prompt followed by the generated response.
    vocabulary: collection of known lower-case words used for step 5; defaults to the
      module-level ``correct_words``.

  Returns:
    The cleaned headnote, stripped of leading/trailing whitespace.
  """
  if vocabulary is None:
    vocabulary = correct_words
  if not isinstance(vocabulary, (set, frozenset)):
    # one conversion per call is far cheaper than a linear scan for every lookup below
    vocabulary = set(vocabulary)

  # remove prompt and extra headings, as eos token doesn't always work;
  # text without the marker is treated as being the response already instead of raising
  _, marker, response = headnote.partition('### Response:')
  if marker:
    headnote = response
  headnote = headnote.split('###')[0]

  # remove last sentence if its not complete - this is a naive approach but works in most cases;
  # when there is no period at all the text is kept rather than erased
  last_seen_period = headnote.rfind('.')
  if last_seen_period != -1:
    headnote = headnote[:last_seen_period+1]

  # double spacing issues
  headnote = re.sub(r'\s{2,}', ' ', headnote)

  # handle new lines - a \n is kept only when it directly follows "<non-digit>." and is
  # followed by a capital letter; every other \n is replaced with a space
  headnote = re.sub(r'(?<![^0-9]\.)\n|(?<=\.)\n(?![A-Z])', ' ', headnote)

  # random HELD words - whole words only, so that e.g. 'UPHELD' is left untouched
  headnote = re.sub(r'\bHELD\b', 'held', headnote)

  # fix spelling mistakes: two neighbouring non-words that form a known word when joined
  headnote_words = headnote.split()
  for left_word, right_word in zip(headnote_words, headnote_words[1:]):
    left_lower = left_word.lower()
    right_lower = right_word.lower()
    if left_lower in vocabulary or right_lower in vocabulary:
      continue
    if left_lower + right_lower not in vocabulary:
      continue
    # the words are matched literally (escaped) and only as whole whitespace-delimited
    # tokens, so neither regex metacharacters nor parts of longer words can interfere
    split_word = rf'(?<!\S){re.escape(left_word)} {re.escape(right_word)}(?!\S)'
    joined_word = left_word + right_word
    headnote = re.sub(split_word, lambda _match: joined_word, headnote)

  # last-minute cleaning
  return headnote.strip()

In [None]:
# Generation stops at EOS or as soon as the model starts a new "###" heading. The heading id
# is only used when '###' really is a single token of this tokenizer: for a token that is
# not in the vocabulary the lookup yields None or the unknown-token id, neither of which
# must end up in the stop list.
stop_token_ids = [tokenizer.eos_token_id]
heading_token_id = tokenizer.convert_tokens_to_ids('###')
if heading_token_id is not None and heading_token_id != tokenizer.unk_token_id:
  stop_token_ids.append(heading_token_id)

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
  # a batch of one prompt, placed wherever the model's (first) weights live
  input_ids = torch.tensor(row['input_ids']).unsqueeze(0).to(trained_model.device)
  attention_mask = torch.tensor(row['attention_mask']).unsqueeze(0).to(trained_model.device)

  # use beam v/s greedy or sampling for generation technique
  outputs = trained_model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=1000,
      pad_token_id=tokenizer.pad_token_id,
      eos_token_id=stop_token_ids,
      repetition_penalty=1.5,
      no_repeat_ngram_size=4,
      num_beams=4,
      length_penalty=1.2
  )

  # the decoded text still contains the prompt; `clean_headnote` cuts it off at '### Response:'
  generated_headnote = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
  generated_headnote = clean_headnote(generated_headnote)
  # written row by row so that an interrupted run can be resumed from the results file
  append_to_csv([[row['input'], row['output'], generated_headnote]])

  0%|          | 0/130 [00:00<?, ?it/s]

## Evaluate the Model

In [None]:
test_df_results = pd.read_csv(HEADNOTES_OUTPUT_DIR)
if test_df_results.empty:
  raise ValueError(f'No generated headnotes found in {HEADNOTES_OUTPUT_DIR}')

# An empty generated headnote is stored as an empty CSV field and comes back as NaN,
# which the metrics cannot handle - turn every cell back into a (possibly empty) string.
references = test_df_results['Test Headnote'].fillna('').astype(str).tolist()
predictions = test_df_results['Generated Headnote'].fillna('').astype(str).tolist()

# ROUGE and BERTScore are scored per example in one batched call each and then averaged,
# which yields the same per-example mean as calling the metric once per row.
rouge_per_example = rouge_score.compute(
    predictions=predictions, references=references, use_aggregator=False
)['rougeLsum']
bert_f1_per_example = bert_score.compute(
    predictions=predictions, references=references, lang="en", batch_size=8
)['f1']

# BLEU stays a mean of sentence-level scores (a single corpus-level call would be a
# different statistic). An empty prediction is scored 0 without calling the metric,
# since BLEU is not defined for it.
bleu_per_example = [
    bleu_score.compute(predictions=[prediction], references=[reference])['bleu']
    if prediction.strip() else 0.0
    for prediction, reference in zip(predictions, references)
]

avg_rougeLsum_score = float(np.mean(rouge_per_example))
avg_bert_f1_score = float(np.mean(bert_f1_per_example))
avg_bleu_score = float(np.mean(bleu_per_example))

print(avg_rougeLsum_score, avg_bert_f1_score, avg_bleu_score)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.4947482873001264 0.8563605178377378 0.10598238431802125
