<a href="https://colab.research.google.com/github/nitajadav8/Legal_TextSumm/blob/main/TextProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
%cd /content/drive/MyDrive/justnlp/summarization

/content/drive/MyDrive/justnlp/summarization


# Train Data

In [7]:
import json
import pandas as pd
def load_jsonl(path):
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            records.append(json.loads(line))
    return pd.DataFrame(records)

train_path='./dataset/train/train_judg.jsonl'
judg=load_jsonl(train_path)

In [8]:
judg.head()

Unnamed: 0,ID,Judgment
0,id_10,Case :- WRIT - C No. - 11383 of 2023\nPetition...
1,id_1000,Versus\nAppearance:\nand\nDate : 22/09/2023\n1...
2,id_1001,Non-Reportable\nCriminal Appeal No._________ o...
3,id_1002,---- Appellant\nVersus\nUmesh Sharma S/o Late...
4,id_1003,1Whether Reporters of Local Papers may be allo...


In [9]:
judg.shape

(1200, 2)

In [10]:
import nltk
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:

judg["sentence_count"] = judg["Judgment"].apply(lambda x: len(nltk.sent_tokenize(x)))
judg["unique_token_count"] = judg["Judgment"].apply(
    lambda x: len(set(nltk.word_tokenize(x.lower())))
)
print(judg[["ID", "sentence_count", "unique_token_count"]])

           ID  sentence_count  unique_token_count
0       id_10             258                 725
1     id_1000              51                 459
2     id_1001              56                 515
3     id_1002              79                 680
4     id_1003              60                 585
...       ...             ...                 ...
1195   id_995             309                1563
1196   id_996             346                1508
1197   id_997              33                 390
1198   id_998              80                 482
1199   id_999             100                 572

[1200 rows x 3 columns]


In [12]:
judg['Judgment'].isnull().sum()

np.int64(0)

In [13]:
duplicates = judg[judg.duplicated(subset=["ID", "Judgment"], keep=False)]
len(duplicates)

0

In [14]:
print(judg['sentence_count'].min())
print(judg['sentence_count'].max())
print(judg['sentence_count'].mean())

1
5690
284.4691666666667


In [15]:
all_tokens = judg["Judgment"].apply(lambda x: nltk.word_tokenize(x.lower()))
judg_vocab = set([token for row in all_tokens for token in row])


In [16]:
len(judg_vocab)

214204

In [17]:
train_ref_path='./dataset/train/train_ref_summ.jsonl'
ref=load_jsonl(train_ref_path)

In [18]:
ref.head()

Unnamed: 0,ID,Summary
0,id_10,The Allahabad High Court on Thursday stayed th...
1,id_1000,A convict in Gujarat who had secured bail in 2...
2,id_1001,A police officer failing in their fundamental ...
3,id_1002,The Chhattisgarh High Court recently observed ...
4,id_1003,The Gujarat High Court recently quashed a Firs...


In [19]:
ref.shape

(1200, 2)

In [20]:
ref['Summary'].isnull().sum()

np.int64(0)

In [21]:
duplicates = ref[ref.duplicated(subset=["ID", "Summary"], keep=False)]
len(duplicates)

0

In [22]:
ref['word']=ref['Summary'].apply(lambda x: len(nltk.word_tokenize(x)))
print(ref['word'].mean())
print(ref['word'].min())
print(ref['word'].max())

609.9591666666666
29
2412


In [23]:

ref["s_sentence_count"] = ref["Summary"].apply(lambda x: len(nltk.sent_tokenize(x)))
ref["s_unique_token_count"] = ref["Summary"].apply(
    lambda x: len(set(nltk.word_tokenize(x.lower())))
)
print(ref[["ID", "s_sentence_count", "s_unique_token_count"]])

           ID  s_sentence_count  s_unique_token_count
0       id_10                22                   305
1     id_1000                17                   215
2     id_1001                17                   239
3     id_1002                17                   191
4     id_1003                16                   234
...       ...               ...                   ...
1195   id_995                26                   284
1196   id_996                20                   293
1197   id_997                18                   216
1198   id_998                14                   213
1199   id_999                23                   214

[1200 rows x 3 columns]


In [24]:
print(ref['s_unique_token_count'].min())
print(ref['s_unique_token_count'].max())
print(ref['s_unique_token_count'].mean())

22
549
243.5975


In [25]:
print(ref['s_sentence_count'].min())
print(ref['s_sentence_count'].max())
print(ref['s_sentence_count'].mean())

1
73
19.948333333333334


In [26]:
s_all_tokens = ref["Summary"].apply(lambda x: nltk.word_tokenize(x.lower()))
sum_vocab = set([token for row in s_all_tokens for token in row])
len(sum_vocab)

24913

In [27]:
#sum_vocab

In [28]:
# Common words
common_vocab = judg_vocab & sum_vocab

# Words only in judgments (less important for summaries)
extra_judg_vocab = judg_vocab - sum_vocab


In [29]:
len(common_vocab)

22358

In [30]:
len(extra_judg_vocab)

191846

In [31]:
#extra_judg_vocab

In [32]:
extra_sum_vocab = sum_vocab - judg_vocab

In [33]:
#extra_sum_vocab

In [34]:
#reliability check of legal pegasus tokenizer for the dataset
from transformers import PegasusTokenizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

tokenizer = PegasusTokenizer.from_pretrained("nsi319/legal-pegasus")


dataset_vocab = set()
for text in list(judg["Judgment"]) + list(ref["Summary"]):
    dataset_vocab.update(word_tokenize(text.lower()))

# Stats
single_token = []
split_token = []

for word in tqdm(dataset_vocab, desc="Checking coverage"):
    tokens = tokenizer.tokenize(word)
    if len(tokens) == 1:
        single_token.append(word)
    else:
        split_token.append((word, tokens))

print("\n Coverage Report")
print(f"Total unique words in dataset vocab: {len(dataset_vocab)}")
print(f"Words covered as single token: {len(single_token)} ({len(single_token)/len(dataset_vocab)*100:.2f}%)")
print(f"Words split into multiple tokens: {len(split_token)} ({len(split_token)/len(dataset_vocab)*100:.2f}%)")

print("\nExamples of words split into multiple tokens:")
for w, t in split_token[:20]:
    print(f"{w} -> {t}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Checking coverage: 100%|██████████| 216759/216759 [00:03<00:00, 69562.61it/s]


 Coverage Report
Total unique words in dataset vocab: 216759
Words covered as single token: 23125 (10.67%)
Words split into multiple tokens: 193634 (89.33%)

Examples of words split into multiple tokens:
8216-8217. -> ['▁', '821', '6-8', '217', '.']
besomo -> ['▁be', 's', 'omo']
40page -> ['▁40', 'page']
3157 -> ['▁3', '157']
jigarbhai -> ['▁jig', 'ar', 'bhai']
ummary -> ['▁um', 'mary']
non-arraigning -> ['▁non', '-', 'arra', 'ign', 'ing']
25.04.2002. -> ['▁25', '.', '04.', '2002', '.']
11.12.1996. -> ['▁11.1', '2.', '1996', '.']
181. -> ['▁181', '.']
18.02.2014 -> ['▁18', '.02.201', '4']
prosecntion -> ['▁prose', 'cn', 'tion']
madhav -> ['▁mad', 'hav']
153. -> ['▁153', '.']
baijal -> ['▁', 'bai', 'jal']
creae66m49en -> ['▁cre', 'ae', '66', 'm', '49', 'en']
136. -> ['▁136', '.']
theogrcencnt -> ['▁the', 'og', 'rc', 'enc', 'nt']
1.5.2009152a -> ['▁1.5', '.', '2009', '152', 'a']
guideli -> ['▁guide', 'li']





In [35]:
#cleaning data
import re
def clean_text(df, column):
  #df[column]=df[column].apply(lambda x: x.lower())
  df[column] = df[column].apply(lambda x: re.sub(r'http|https:\S+|www\.\S+', '', x))
  #df[column] = df[column].apply(lambda x:re.sub(r'[^\x00-\x7F]+', ' ', x))
  #df[column] = df[column].apply(lambda x: re.sub(r'[^a-zA-Z0-9.?\s]', ' ', x))
  df[column] = df[column].apply(lambda x: re.sub(r'-\s*\n\s*', '', x))
  df[column] = df[column].apply(lambda x: re.sub(r'-{2,}', ' ', x))
  df[column] = df[column].apply(lambda x: re.sub(r'_+', ' ', x))
  df[column] = df[column].apply(lambda x: re.sub(r'\n{2,}', '\n\n', x))
  df[column]=df[column].apply(lambda x: re.sub(r'([^\n])\n([^\n])', r'\1 \2',x))
  df[column] = df[column].apply(lambda x:  re.sub(r'[^\x09\x0A\x0D\x20-\x7E]', ' ', x))
  df[column] = df[column].apply(lambda x: re.sub(r"\s+", " ", x))
  print("after cleaning")
  df["word_count"] = df[column].apply(lambda x: len(nltk.word_tokenize(x)))
  df["sentence_count"] = df[column].apply(lambda x: len(nltk.sent_tokenize(x)))
  print(df[["ID", "word_count","sentence_count"]])
  print(df['word_count'].min())
  print(df['word_count'].max())
  print(df['word_count'].mean())
  return df

In [36]:
judg=clean_text(judg,'Judgment')

after cleaning
           ID  word_count  sentence_count
0       id_10        3948             258
1     id_1000        1935              52
2     id_1001        1569              58
3     id_1002        2379              79
4     id_1003        1999              62
...       ...         ...             ...
1195   id_995        8717             311
1196   id_996        9043             348
1197   id_997        1133              33
1198   id_998        2174              82
1199   id_999        2831             102

[1200 rows x 3 columns]
169
149087
8294.020833333334


In [37]:
ref=clean_text(ref,'Summary')

after cleaning
           ID  word_count  sentence_count
0       id_10         749              23
1     id_1000         589              17
2     id_1001         557              17
3     id_1002         444              17
4     id_1003         485              16
...       ...         ...             ...
1195   id_995         773              26
1196   id_996         776              22
1197   id_997         580              18
1198   id_998         463              14
1199   id_999         665              23

[1200 rows x 3 columns]
29
2404
606.7308333333333


In [38]:
all_tokens = judg["Judgment"].apply(lambda x: nltk.word_tokenize(x.lower()))
judg_vocab = set([token for row in all_tokens for token in row])
s_all_tokens = ref["Summary"].apply(lambda x: nltk.word_tokenize(x.lower()))
sum_vocab = set([token for row in s_all_tokens for token in row])

In [39]:
print(len(judg_vocab))
print(len(sum_vocab))

209010
24612


In [40]:
#reliability check of legal pegasus tokenizer for the dataset after cleaning
from transformers import PegasusTokenizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

tokenizer = PegasusTokenizer.from_pretrained("nsi319/legal-pegasus")


dataset_vocab = set()
for text in list(judg["Judgment"]) + list(ref["Summary"]):
    dataset_vocab.update(word_tokenize(text.lower()))

# Stats
single_token = []
split_token = []

for word in tqdm(dataset_vocab, desc="Checking coverage"):
    tokens = tokenizer.tokenize(word)
    if len(tokens) == 1:
        single_token.append(word)
    else:
        split_token.append((word, tokens))

print("\n Coverage Report after cleaning")
print(f"Total unique words in dataset vocab: {len(dataset_vocab)}")
print(f"Words covered as single token: {len(single_token)} ({len(single_token)/len(dataset_vocab)*100:.2f}%)")
print(f"Words split into multiple tokens: {len(split_token)} ({len(split_token)/len(dataset_vocab)*100:.2f}%)")

print("\nExamples of words split into multiple tokens:")
for w, t in split_token[:20]:
    print(f"{w} -> {t}")

Checking coverage: 100%|██████████| 211315/211315 [00:02<00:00, 85137.99it/s]


 Coverage Report after cleaning
Total unique words in dataset vocab: 211315
Words covered as single token: 23049 (10.91%)
Words split into multiple tokens: 188266 (89.09%)

Examples of words split into multiple tokens:
8216-8217. -> ['▁', '821', '6-8', '217', '.']
besomo -> ['▁be', 's', 'omo']
40page -> ['▁40', 'page']
statedefining -> ['▁state', 'defining']
3157 -> ['▁3', '157']
jigarbhai -> ['▁jig', 'ar', 'bhai']
ummary -> ['▁um', 'mary']
non-arraigning -> ['▁non', '-', 'arra', 'ign', 'ing']
25.04.2002. -> ['▁25', '.', '04.', '2002', '.']
11.12.1996. -> ['▁11.1', '2.', '1996', '.']
181. -> ['▁181', '.']
18.02.2014 -> ['▁18', '.02.201', '4']
:101.no -> ['▁', ':', '101', '.', 'no']
prosecntion -> ['▁prose', 'cn', 'tion']
madhav -> ['▁mad', 'hav']
153. -> ['▁153', '.']
baijal -> ['▁', 'bai', 'jal']
creae66m49en -> ['▁cre', 'ae', '66', 'm', '49', 'en']
136. -> ['▁136', '.']
theogrcencnt -> ['▁the', 'og', 'rc', 'enc', 'nt']





In [41]:
#compression for summary
def compression_fraction(document, summary):
    doc_len = len(word_tokenize(document))
    sum_len = len(word_tokenize(summary))

    if doc_len == 0:
        return None  # avoid division by zero

    compression = 1 - (sum_len / doc_len)
    return compression


In [42]:

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Define preprocessing
stop_words = set(stopwords.words("english"))

def get_word_freq(text, top_n=10):
    tokens = word_tokenize(text.lower())
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    return Counter(filtered).most_common(top_n)

# Plot per-document
# for idx, row in judg.iterrows():
#     word_freq = get_word_freq(row["Judgment"], top_n=10)
#     words, freqs = zip(*word_freq) if word_freq else ([], [])

#     plt.figure(figsize=(8,5))
#     plt.bar(words, freqs, color="skyblue")
#     plt.title(f"Top Words in Document {row['ID']}")
#     plt.xlabel("Words")
#     plt.ylabel("Frequency")
#     plt.xticks(rotation=45)
#     plt.show()


In [44]:
from collections import Counter
def extract_sentences(docs):
    all_sents = []
    one_sents=[]
    for doc in docs:
        sentences = nltk.sent_tokenize(doc)

        one_sent=[s for s in sentences if len(s.split()) == 1]
        #print(one_sent)
        one_sents.extend(one_sent)
        all_sents.extend(sentences)
    return all_sents,one_sents

# Collect sentences from all judgments
all_sents,one_sents= extract_sentences(judg["Judgment"])

# Count most common sentences
sent_counter = Counter(all_sents)

# Show top 20
print(sent_counter.most_common(100))


[('2.', 1003), ('3.', 965), ('No.', 864), ('4.', 859), ('5.', 815), ('6.', 731), ('7.', 698), ('8.', 619), ('9.', 604), ('10.', 584), ('.', 560), ('12.', 554), ('11.', 547), ('13.', 507), ('14.', 494), ('15.', 456), ('16.', 439), ('17.', 406), ('18.', 398), ('20.', 374), ('19.', 357), ('22.', 332), ('21.', 325), ('1.', 304), ('23.', 297), ('24.', 287), ('25.', 287), ('28.', 277), ('26.', 276), ('27.', 252), ('29.', 239), ('30.', 228), ('31.', 217), ('32.', 216), ('33.', 214), ('34.', 202), ('35.', 192), ('Explanation.', 189), ('36.', 176), ('37.', 171), ('39.', 168), ('38.', 165), ('ii.', 165), ('42.', 155), ('Sh.', 152), ('41.', 151), ('Mr.', 150), ('40.', 145), ('44.', 140), ('45.', 134), ('43.', 132), ('48.', 127), ('46.', 125), ('47.', 125), ('Nos.', 124), ('iii.', 122), ('50.', 120), ('b.', 118), ('154 of 20 20 Vinod Dua vs. Union of India & Ors.', 117), ('Addl.', 114), ('Case No.', 113), ('49.', 112), ('52.', 111), ('51.', 109), ('53.', 106), ('The Ld.', 102), ('Ld.', 102), ('58.

In [45]:
one_sents

['Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'U.P.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'U.P.',
 'Thru.',
 'Addl.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Chief/Prin.',
 'Secy.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Thru.',
 'Addl.',
 'Chief/Prin.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Thru.',
 'Addl.',
 'Thru.',
 'Addl.',
 'Secy.',
 'Thru.',
 'Addl.',
 'U.P.',
 'Lko.',
 'Thru.',
 'Addl.',
 'Secy.',
 'Thru.',
 'Addl.',
 'Chief/Prin.Secy.',
 'Thru.',
 'Addl.',
 'Thru.',
 'Prin./Addl.',
 'Lko.',
 'Thru.',
 'Addl.',
 '/Prin.',
 '

In [46]:
sent_counter=sent_counter.most_common(100)
sent_noise=[s for s in sent_counter if len(s[0].split())<6]

In [47]:
all_sent_noise_items = [s for s, _ in sent_noise]
print(all_sent_noise_items)

['2.', '3.', 'No.', '4.', '5.', '6.', '7.', '8.', '9.', '10.', '.', '12.', '11.', '13.', '14.', '15.', '16.', '17.', '18.', '20.', '19.', '22.', '21.', '1.', '23.', '24.', '25.', '28.', '26.', '27.', '29.', '30.', '31.', '32.', '33.', '34.', '35.', 'Explanation.', '36.', '37.', '39.', '38.', 'ii.', '42.', 'Sh.', '41.', 'Mr.', '40.', '44.', '45.', '43.', '48.', '46.', '47.', 'Nos.', 'iii.', '50.', 'b.', 'Addl.', 'Case No.', '49.', '52.', '51.', '53.', 'The Ld.', 'Ld.', '58.', '57.', '54.', '55.', '64.', 'W.P.', '56.', 'iv.', '61.', '59.', '60.', '(C) No.', '68.', '07/2019 CNR No.', 'page no.', '62.', '69.', '74.', '66.', 'I.A.', '65.', 'II.', '73.', '72.', '78.', '67.', '75.', 'B.', '70.', '63.', '71.', '76.']


In [48]:
all_sent_noise_items.extend(one_sents)


In [50]:
all_sent_noise_items=list(set(all_sent_noise_items))

In [51]:
all_sent_noise_items

['(M:8Ms.',
 '17.7.',
 '10,2023).',
 '475.',
 '9.20.',
 '260.',
 '?Yes.',
 'ol.',
 '275.',
 'Kaleem.',
 '181.',
 '67/2002.',
 '31.15.',
 '4.4.2.',
 '184.3.',
 '153.',
 '11.2.',
 'Stud.',
 '105.',
 '159.Mr.',
 '9.7.',
 '(Smt.)',
 '136.',
 '(9.)',
 '16.)',
 'Exemptions.',
 '111-112.',
 'agriculture.',
 '42.4.',
 '31.14.',
 '32.2.',
 '6.3.Mr.',
 '292.',
 '4.2.3.',
 'u.',
 '257.',
 'W.J.C.',
 '[9].',
 '12.',
 '86.',
 '493/2022].',
 '104.1.',
 '12.17.',
 '64.3.',
 'C.R.A.',
 '8.7.',
 '13(2).',
 '231.',
 '123.',
 '412.',
 '2(b).',
 '(II).',
 'Ajithkumar.',
 '225.',
 '102.]',
 'iv.',
 '20.11.',
 '3.24.',
 'U.D.',
 '284.13.',
 '17.10.',
 'No.159/20218.',
 '10.12.',
 'MH-31/ER-5226.',
 '467.',
 '06.2020.',
 '2746(E).',
 'ii.)',
 'RW1/10.',
 '74.',
 '24.Mr.',
 '7.3.',
 '59.',
 '1196.',
 'Katju.',
 '23(xiv).',
 '1149.].',
 '2.3.2.',
 'Masud.',
 'TT.',
 'Nagpur.',
 '18.5.2.',
 'Estt.',
 '3.25.',
 'viii).',
 '211.',
 '130.Dr.',
 '23.5.',
 'L-28822/2007.',
 'Cr.',
 '86.Mr.',
 '6).',
 '5a.',
 '(15)Co

In [52]:
from collections import Counter
import re

def tokenize_words(text):
    text = text.lower()
    words = re.findall(r'\b[a-z\.]+\b', text)
    return words

all_tokens = []
for doc in judg["Judgment"]:
    all_tokens.extend(tokenize_words(str(doc)))

word_freq = Counter(all_tokens)
print(word_freq.most_common(50))


[('the', 770518), ('of', 443647), ('to', 242120), ('and', 201560), ('in', 189358), ('a', 133255), ('that', 123288), ('is', 123255), ('be', 84235), ('by', 84029), ('for', 77441), ('on', 74268), ('as', 71354), ('or', 65660), ('it', 60262), ('not', 58109), ('was', 55853), ('court', 54963), ('.', 51550), ('this', 46305), ('with', 46083), ('has', 43005), ('which', 42219), ('under', 37371), ('section', 34477), ('any', 33179), ('an', 33145), ('are', 32578), ('act', 32565), ('no', 29299), ('from', 28708), ('state', 27722), ('have', 26799), ('at', 26387), ('case', 25946), ('s', 25907), ('been', 25433), ('such', 25315), ('he', 23983), ('also', 21147), ('would', 19770), ('order', 19602), ('his', 18663), ('may', 17839), ('other', 17480), ('law', 17263), ('shall', 17247), ('there', 17232), ('had', 17020), ('if', 15906)]


In [56]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(judg["Judgment"].astype(str))

avg_tfidf = np.asarray(X.mean(axis=0)).ravel()
terms = vectorizer.get_feature_names_out()

# Sort by lowest average tf-idf → most likely boilerplate
low_tfidf_terms = [t for t, score in sorted(zip(terms, avg_tfidf), key=lambda x: x[1])[:50]]
print("\n🔹 Candidate boilerplate terms (low TF-IDF):")
print(low_tfidf_terms)



🔹 Candidate boilerplate terms (low TF-IDF):
['chakravarthi', 'naroda', 'maharaja', 'fron', '1e', 'accession', 'abstraction', 'recharge', 'ofhis', 'ebix', 'nathuram', 'karkare', 'godse', 'dominion', '124f', 'lgbtq', 'sreekumar', 'informational', 'hin', 'rpa', 'fma', 'dahod', 'spv', 'axykno', 'wheat', '051', 'invalidity', 'nundy', 'thot', 'atthe', 'comparative', 'importer', '342a', 'gulberg', 'irrevocable', 'badge', 'bythe', 'apte', 'philip', 'chronicle', 'shipping', 'stakes', 'oems', 'dargah', 'ond', 'estates', 'pooling', 'signatories', 'jv', 'deccan']


In [58]:
noise_dict = {
    "sentences": all_sent_noise_items,
    "terms": ['fron', 'ebix', 'hin', 'rpa', 'fma', 'spv', 'axykno', 'thot', 'apte','ond', 'jv', 'deccan']
}

import json
with open("noise_candidates.json", "w", encoding="utf-8") as f:
    json.dump(noise_dict, f, indent=2, ensure_ascii=False)

print("\n saved noise_candidates.json")


 saved noise_candidates.json


In [59]:
judg

Unnamed: 0,ID,Judgment,sentence_count,unique_token_count,word_count
0,id_10,Case :- WRIT - C No. - 11383 of 2023 Petitione...,258,725,3948
1,id_1000,Versus Appearance: and Date : 22/09/2023 1.The...,52,459,1935
2,id_1001,Non-Reportable Criminal Appeal No. of 2024 (@S...,58,515,1569
3,id_1002,Appellant Versus Umesh Sharma S/o Late Omprak...,79,680,2379
4,id_1003,1Whether Reporters of Local Papers may be allo...,62,585,1999
...,...,...,...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...,311,1563,8717
1196,id_996,% Reserved on: 15.03.2023 Pronounced on: 16.05...,348,1508,9043
1197,id_997,Reportable Writ Petition (C) No 961 of 2021 Ne...,33,390,1133
1198,id_998,M/s. Biovet Private Limited .Applicants In the...,82,482,2174


In [60]:
ref=clean_text(ref,'Summary')

after cleaning
           ID  word_count  sentence_count
0       id_10         749              23
1     id_1000         589              17
2     id_1001         557              17
3     id_1002         444              17
4     id_1003         485              16
...       ...         ...             ...
1195   id_995         773              26
1196   id_996         776              22
1197   id_997         580              18
1198   id_998         463              14
1199   id_999         665              23

[1200 rows x 3 columns]
29
2404
606.7308333333333


In [None]:
ref

In [61]:
merged_df = judg.merge(ref, on="ID", how="inner")

In [62]:
merged_df = merged_df[['ID','Judgment','Summary']]
merged_df

Unnamed: 0,ID,Judgment,Summary
0,id_10,Case :- WRIT - C No. - 11383 of 2023 Petitione...,The Allahabad High Court on Thursday stayed th...
1,id_1000,Versus Appearance: and Date : 22/09/2023 1.The...,A convict in Gujarat who had secured bail in 2...
2,id_1001,Non-Reportable Criminal Appeal No. of 2024 (@S...,A police officer failing in their fundamental ...
3,id_1002,Appellant Versus Umesh Sharma S/o Late Omprak...,The Chhattisgarh High Court recently observed ...
4,id_1003,1Whether Reporters of Local Papers may be allo...,The Gujarat High Court recently quashed a Firs...
...,...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...,Facts sourced from a statement made by accused...
1196,id_996,% Reserved on: 15.03.2023 Pronounced on: 16.05...,The Delhi High Court recently directed mediati...
1197,id_997,Reportable Writ Petition (C) No 961 of 2021 Ne...,The Supreme Court on Friday passed an interim ...
1198,id_998,M/s. Biovet Private Limited .Applicants In the...,The Bombay High Court on Thursday permitted Bi...


In [63]:
merged_df["compression"] = merged_df.apply(lambda x: compression_fraction(x["Judgment"], x["Summary"]), axis=1)
merged_df

Unnamed: 0,ID,Judgment,Summary,compression
0,id_10,Case :- WRIT - C No. - 11383 of 2023 Petitione...,The Allahabad High Court on Thursday stayed th...,0.810284
1,id_1000,Versus Appearance: and Date : 22/09/2023 1.The...,A convict in Gujarat who had secured bail in 2...,0.695607
2,id_1001,Non-Reportable Criminal Appeal No. of 2024 (@S...,A police officer failing in their fundamental ...,0.644997
3,id_1002,Appellant Versus Umesh Sharma S/o Late Omprak...,The Chhattisgarh High Court recently observed ...,0.813367
4,id_1003,1Whether Reporters of Local Papers may be allo...,The Gujarat High Court recently quashed a Firs...,0.757379
...,...,...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...,Facts sourced from a statement made by accused...,0.911323
1196,id_996,% Reserved on: 15.03.2023 Pronounced on: 16.05...,The Delhi High Court recently directed mediati...,0.914188
1197,id_997,Reportable Writ Petition (C) No 961 of 2021 Ne...,The Supreme Court on Friday passed an interim ...,0.488085
1198,id_998,M/s. Biovet Private Limited .Applicants In the...,The Bombay High Court on Thursday permitted Bi...,0.787029


In [64]:
merged_df['compression'].mean() #for abstractive measure

np.float64(0.7403362091785615)

In [65]:
#for extractive measure
def get_extractive_fragments(doc_tokens, sum_tokens):

    fragments = []
    i = 0
    doc_set = set(doc_tokens)

    while i < len(sum_tokens):
        if sum_tokens[i] in doc_set:
            start = i
            length = 0
            while i + length < len(sum_tokens) and sum_tokens[i + length] in doc_set:
                length += 1
            fragments.append(sum_tokens[start:start+length])
            i += length
        else:
            i += 1
    return fragments

def summary_density(doc, summary):
    doc_tokens = word_tokenize(str(doc).lower())
    sum_tokens = word_tokenize(str(summary).lower())
    sum_len = len(sum_tokens)

    if sum_len == 0:
        return 0

    fragments = get_extractive_fragments(doc_tokens, sum_tokens)
    sum_sq = sum(len(f)**2 for f in fragments)

    density = sum_sq / sum_len
    return density



In [66]:
merged_df["density"] = merged_df.apply(lambda x: summary_density(x["Judgment"], x["Summary"]), axis=1)
merged_df

Unnamed: 0,ID,Judgment,Summary,compression,density
0,id_10,Case :- WRIT - C No. - 11383 of 2023 Petitione...,The Allahabad High Court on Thursday stayed th...,0.810284,25.554813
1,id_1000,Versus Appearance: and Date : 22/09/2023 1.The...,A convict in Gujarat who had secured bail in 2...,0.695607,36.826531
2,id_1001,Non-Reportable Criminal Appeal No. of 2024 (@S...,A police officer failing in their fundamental ...,0.644997,21.707361
3,id_1002,Appellant Versus Umesh Sharma S/o Late Omprak...,The Chhattisgarh High Court recently observed ...,0.813367,22.813063
4,id_1003,1Whether Reporters of Local Papers may be allo...,The Gujarat High Court recently quashed a Firs...,0.757379,18.465979
...,...,...,...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...,Facts sourced from a statement made by accused...,0.911323,31.455369
1196,id_996,% Reserved on: 15.03.2023 Pronounced on: 16.05...,The Delhi High Court recently directed mediati...,0.914188,115.256443
1197,id_997,Reportable Writ Petition (C) No 961 of 2021 Ne...,The Supreme Court on Friday passed an interim ...,0.488085,6.731034
1198,id_998,M/s. Biovet Private Limited .Applicants In the...,The Bombay High Court on Thursday permitted Bi...,0.787029,5.867965


In [67]:
merged_df['density'].mean()

np.float64(34.03351145675436)

In [68]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42, shuffle=True)

In [69]:
import json

def save_jsonl(df, path):
    with open(path, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            f.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")

# Save to JSONL
# save_jsonl(train_df, "./dataset/train/train_split.jsonl")
# save_jsonl(val_df, "./dataset/train/val_split.jsonl")

# print("Train/Test JSONL files saved!")


In [71]:
print(train_df.shape)
print(val_df.shape)

(960, 5)
(240, 5)


In [72]:
train_df

Unnamed: 0,ID,Judgment,Summary,compression,density
331,id_1498,and W.M.P.(MD)No.21615 of 2022 S.Lawrence Vima...,Designation of a lawyer as a Senior Advocate i...,0.941866,39.807947
409,id_16,This file relates to reconsideration of the pr...,The Union Law Minister told the Supreme Court ...,-0.004491,21.285075
76,id_1106,1PA./RG/Misc./2021 Date-24.06.202L Considering...,The Rajasthan High Court will start regular he...,0.882838,46.866197
868,id_539,IN THE HIGH COURT OF MADHYA PRADESHAT JABALPUR...,The Madhya Pradesh High Court recently quashed...,0.705325,21.982673
138,id_1205,Crl. A. No.1200/2023 Criminal Appeal No.1200 o...,While society should be protected from drug-re...,0.825460,34.735069
...,...,...,...,...,...
1044,id_797,Haji Abdul Gani Khan & Anr. Petitioners v. Uni...,The Supreme Court on Monday dismissed the peti...,0.918290,38.205009
1095,id_863,Khalil Abbas Fakir .. Applicant Tabbasum Khali...,The Bombay High Court recently held that a div...,0.911834,32.074016
1130,id_911,(Criminal Revisional Jurisdiction) Appellate S...,The Calcutta High Court recently quashed a che...,0.845276,17.838710
860,id_527,1 CC.No.30751/2021 Dated this the 18th day of ...,An Special Court in Bengaluru recently convict...,0.920177,26.129758


In [73]:
val_df=load_jsonl('./dataset/train/val_split.jsonl')

In [74]:
val_path='./dataset/validation/val_judg.jsonl'
val=load_jsonl(train_path)

In [75]:
nltk.sent_tokenize(val['Judgment'].iloc[1])

['Versus\nAppearance:\nand\nDate : 22/09/2023\n1.The present case is an eye opener.',
 'The convict-\nChandanji @ Gato Chhanaji Thakor has filed the\npresent application seeking regular bail through\njail.',
 'Such application was filed by him on\n05.08.2023, which is forwarded to the Registry of\nthis Court vide communication dated 11.08.2023\nwritten by the Deputy Superintendent of Ahmedabad\nCentral Jail.',
 '2.When the matter was listed yesterday, learned\nadvocate Mr.Soni appearing for the applicant-convict\nhas invited attention of this Court to the order\ndated 29.09.2020 passed in Criminal Misc.',
 'Application\n(for suspension of sentence) No.1 of 2020 in the\ncaptioned appeal and has submitted that this Court,\nafter passing a comprehensive order, had already\nreleased the applicant on regular bail by suspending\nhis sentence under the provision of Section 389 of\nthe Code of Criminal Procedure, 1973 (for short "the\n3.The matter was ordered to be listed today and\nlearned AP

In [76]:

val_df["word_count"] = val_df["Judgment"].apply(lambda x: len(nltk.word_tokenize(x)))
val_df["sentence_count"] = val_df["Judgment"].apply(lambda x: len(nltk.sent_tokenize(x)))

print(val_df[["ID", "word_count", "sentence_count"]])

          ID  word_count  sentence_count
0     id_972        1224              50
1     id_533        3106             102
2    id_1145         551              27
3    id_1640       12832             483
4    id_1079         803              28
..       ...         ...             ...
235  id_1568        2595             154
236   id_538        3547             127
237  id_1791       39291            1255
238   id_993        7728             278
239   id_546        6075             246

[240 rows x 3 columns]


In [77]:
val_df['Judgment'].isnull().sum()

np.int64(0)

In [78]:
val_df=clean_text(val_df,'Judgment')

after cleaning
          ID  word_count  sentence_count
0     id_972        1224              50
1     id_533        3106             102
2    id_1145         551              27
3    id_1640       12832             483
4    id_1079         803              28
..       ...         ...             ...
235  id_1568        2595             154
236   id_538        3547             127
237  id_1791       39291            1255
238   id_993        7728             278
239   id_546        6075             246

[240 rows x 3 columns]
167
133288
7369.070833333333


In [79]:

val_df

Unnamed: 0,ID,Judgment,Summary,word_count,sentence_count
0,id_972,Through Mr. Hemant Daswani Ms. Sauyma Bajpai A...,The Delhi High Court has upheld a single judge...,1224,50
1,id_533,ORDER Per the Hon ble the Chief Justice Alok A...,The Telangana High Court recently upheld the C...,3106,102
2,id_1145,Vasundhara d o Praful Bhojane Union of India t...,The Bombay High Court today issued notice in a...,551,27
3,id_1640,Reportable High Court Bar Association Allahaba...,A Constitution bench of the Supreme Court on T...,12832,483
4,id_1079,Applicant Phoolchandra Yadav And 2 Others Oppo...,The Allahabad High Court has granted bail to t...,803,28
...,...,...,...,...,...
235,id_1568,Ms. Kangana Ranaut ...Petitioner Versus 1. The...,The Bombay High Court on Friday rejected a pet...,2595,154
236,id_538,Versus Appearance MS URVASHI K MEHTA 11469 for...,A rape is a rape even if it is committed by th...,3547,127
237,id_1791,Reportable Civil Appeal No. 8129 of 2022 Madhy...,The Central government s refusal to renew broa...,39291,1255
238,id_993,Writ Petition Nos.16650 of 2020 and 144 48 of ...,The Madras High Court on Monday upheld the con...,7728,278


In [80]:
val_df = val_df[['ID','Judgment','Summary']]
save_jsonl(val_df, "./dataset/train/val_split2.jsonl")

In [81]:
val_df

Unnamed: 0,ID,Judgment,Summary
0,id_972,Through Mr. Hemant Daswani Ms. Sauyma Bajpai A...,The Delhi High Court has upheld a single judge...
1,id_533,ORDER Per the Hon ble the Chief Justice Alok A...,The Telangana High Court recently upheld the C...
2,id_1145,Vasundhara d o Praful Bhojane Union of India t...,The Bombay High Court today issued notice in a...
3,id_1640,Reportable High Court Bar Association Allahaba...,A Constitution bench of the Supreme Court on T...
4,id_1079,Applicant Phoolchandra Yadav And 2 Others Oppo...,The Allahabad High Court has granted bail to t...
...,...,...,...
235,id_1568,Ms. Kangana Ranaut ...Petitioner Versus 1. The...,The Bombay High Court on Friday rejected a pet...
236,id_538,Versus Appearance MS URVASHI K MEHTA 11469 for...,A rape is a rape even if it is committed by th...
237,id_1791,Reportable Civil Appeal No. 8129 of 2022 Madhy...,The Central government s refusal to renew broa...
238,id_993,Writ Petition Nos.16650 of 2020 and 144 48 of ...,The Madras High Court on Monday upheld the con...


In [84]:
# def split_to_sentences_summ(para):
#     sents = nltk.tokenize.sent_tokenize(para)
#     return sents

# def get_chunks_data_from_docV2(doc, summ):
#     doc_sents = split_to_sentences_summ(doc)
#     summ_sents = split_to_sentences_summ(summ)
#     return len(max(doc_sents)), len(max(summ_sents))
# mlj=0
# mls=0
# for _, row in tqdm(df.iterrows(), total=len(df)):
#          doc_id = str(row["ID"])
#          judgment = row["Judgment"]
#          reference = row["Summary"]
#          dl, sl = get_chunks_data_from_docV2(judgment,reference)
#          print(doc_id, dl)
#         #  if dl>mlj:
#         #    mlj=dl
#         #  if sl>mls:
#         #    mls=sl
