In [3]:
%cd /content/drive/MyDrive/justnlp/summarization

/content/drive/MyDrive/justnlp/summarization


In [1]:
# Attach Google Drive to the Colab filesystem at /content/drive; the project
# directory used by this notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Train Data

In [42]:
import json
import pandas as pd
def load_jsonl(path):
    """Read a JSON Lines file into a DataFrame, one row per record.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON value per line.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (for example an extra newline at the end of the
            # file) carry no record and would make json.loads fail.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Training judgments, read with load_jsonl (one JSON record per line).
# The path is relative, so it depends on the current working directory.
train_path='./dataset/train/train_judg.jsonl'
judg=load_jsonl(train_path)

In [5]:
# Preview the first rows of the judgments frame.
judg.head()

Unnamed: 0,ID,Judgment
0,id_10,Case :- WRIT - C No. - 11383 of 2023\nPetition...
1,id_1000,Versus\nAppearance:\nand\nDate : 22/09/2023\n1...
2,id_1001,Non-Reportable\nCriminal Appeal No._________ o...
3,id_1002,---- Appellant\nVersus\nUmesh Sharma S/o Late...
4,id_1003,1Whether Reporters of Local Papers may be allo...


In [6]:
# (rows, columns) of the judgments frame.
judg.shape

(1200, 2)

In [7]:
import nltk
# Download the "punkt_tab" tokenizer package into the local NLTK data folder.
# NOTE(review): presumably required by the nltk.word_tokenize / sent_tokenize
# calls used in the cells below — confirm against the installed NLTK version.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
# Length of every raw judgment, measured in NLTK word tokens and in sentences.
judg["word_count"] = [len(nltk.word_tokenize(text)) for text in judg["Judgment"]]
judg["sentence_count"] = [len(nltk.sent_tokenize(text)) for text in judg["Judgment"]]
print(judg[["ID", "word_count", "sentence_count"]])

           ID  word_count  sentence_count
0       id_10        3949             258
1     id_1000        1938              51
2     id_1001        1574              56
3     id_1002        2386              79
4     id_1003        2008              60
...       ...         ...             ...
1195   id_995        8844             309
1196   id_996        9094             346
1197   id_997        1135              33
1198   id_998        2194              80
1199   id_999        2854             100

[1200 rows x 3 columns]


In [9]:
# Smallest per-judgment word-token count.
judg['word_count'].min()

169

In [10]:
# Largest per-judgment word-token count.
judg['word_count'].max()

149745

In [11]:
# Average per-judgment word-token count.
judg['word_count'].mean()

np.float64(8368.1275)

In [12]:
# Number of rows whose Judgment text is missing.
judg['Judgment'].isnull().sum()

np.int64(0)

In [22]:
# Min, max and mean sentence count per judgment, one value per output line.
print(
    judg['sentence_count'].min(),
    judg['sentence_count'].max(),
    judg['sentence_count'].mean(),
    sep="\n",
)

1
5690
284.4691666666667


In [37]:
import re
# Normalise each judgment: lowercase it, keep only lowercase letters, digits,
# '.', '?' and whitespace, then collapse every whitespace run to one space.
# The character class used to contain `\\n`; inside a raw string that is a
# literal backslash followed by "n", so backslashes survived the cleaning.
# Newlines are already matched by `\s`, so that entry has been removed.
# NOTE(review): unwanted characters are deleted rather than replaced by a
# space, so "22/09/2023" becomes "22092023" — confirm this is intended.
judg["clean_text"] = (
    judg["Judgment"]
    .str.lower()
    .map(lambda text: re.sub(r"[^a-z0-9.?\s]", "", text))
    .map(lambda text: re.sub(r"\s+", " ", text))
)
print(judg[["ID", "clean_text"]])
print("after cleaning")
# These two columns are overwritten here: from this point on they describe
# clean_text, not the raw Judgment text.
judg["word_count"] = judg["clean_text"].apply(lambda x: len(nltk.word_tokenize(x)))
judg["sentence_count"] = judg["clean_text"].apply(lambda x: len(nltk.sent_tokenize(x)))
print(judg[["ID", "word_count","sentence_count"]])
print(judg['word_count'].min())
print(judg['word_count'].max())
print(judg['word_count'].mean())

           ID                                         clean_text
0       id_10  case writ c no. 11383 of 2023 petitioner syed ...
1     id_1000  versus appearance and date 22092023 1.the pres...
2     id_1001  nonreportable criminal appeal no. of 2024 spec...
3     id_1002   appellant versus umesh sharma so late ompraka...
4     id_1003  1whether reporters of local papers may be allo...
...       ...                                                ...
1195   id_995  arising out of special leave petition criminal...
1196   id_996   reserved on 15.03.2023 pronounced on 16.05.20...
1197   id_997  reportable writ petition c no 961 of 2021 neil...
1198   id_998  ms. biovet private limited .applicants in the ...
1199   id_999  nc 2023khcd13177db wa no. 100406 of 2023 corre...

[1200 rows x 2 columns]
after cleaning
           ID  word_count  sentence_count
0       id_10        3374             261
1     id_1000        1829              52
2     id_1001        1449              56
3     id_100

In [39]:
# Inspect how the sentence tokenizer splits the first cleaned judgment.
# NOTE(review): the captured output shows abbreviations such as 'no.', 'thru.'
# and 'addl.' being emitted as separate sentences, so sentence counts computed
# on clean_text are likely inflated.
nltk.sent_tokenize(judg['clean_text'].iloc[0])

['case writ c no.',
 '11383 of 2023 petitioner syed hamidul bari respondent state of u.p.',
 'thru.',
 'addl.',
 'chiefprin.',
 'secy.',
 'housing and urban planning deptt.',
 'lko.',
 'and 4 others counsel for petitioner kazim ibrahim amrit khare counsel for respondent c.s.c.',
 'ratnesh chandra case writ c no.',
 '11360 of 2023 petitioner mohd.',
 'naushad respondent state of u.p.',
 'thru.',
 'addl.',
 'chief secy.prin.',
 'secy.',
 'housing and urban planning deptt.',
 'and 4 others counsel for petitioner kazim ibrahimamrit khare counsel for respondent c.s.c.ratnesh chandra case writ c no.',
 '11362 of 2023 petitioner mohammad abrar respondent state of u.p.',
 'thru.',
 'addl.',
 'chiefprin.',
 'secy.',
 'housing urban planning deptt.',
 'lko.',
 'and 4 others counsel for petitioner kazim ibrahimamrit khare counsel for respondent c.s.c.ratnesh chandra case writ c no.',
 '11368 of 2023 petitioner mohammad saif khan respondent state of u.p.',
 'thru.',
 'addl.',
 'chiefprin.',
 'secy

In [43]:
# Reference summaries for the training judgments, read with load_jsonl.
# The path is relative, so it depends on the current working directory.
train_ref_path='./dataset/train/train_ref_summ.jsonl'
ref=load_jsonl(train_ref_path)

In [15]:
# Preview the first rows of the reference-summary frame.
ref.head()

Unnamed: 0,ID,Summary
0,id_10,The Allahabad High Court on Thursday stayed th...
1,id_1000,A convict in Gujarat who had secured bail in 2...
2,id_1001,A police officer failing in their fundamental ...
3,id_1002,The Chhattisgarh High Court recently observed ...
4,id_1003,The Gujarat High Court recently quashed a Firs...


In [17]:
# (rows, columns) of the reference-summary frame.
ref.shape

(1200, 2)

In [41]:
# Length of every reference summary, in NLTK word tokens and in sentences.
ref["word_count"] = [len(nltk.word_tokenize(summary)) for summary in ref["Summary"]]
ref["sentence_count"] = [len(nltk.sent_tokenize(summary)) for summary in ref["Summary"]]

print(ref[["ID", "word_count", "sentence_count"]])

           ID  word_count  sentence_count
0       id_10         759              22
1     id_1000         595              17
2     id_1001         557              17
3     id_1002         444              17
4     id_1003         486              16
...       ...         ...             ...
1195   id_995         775              26
1196   id_996         790              20
1197   id_997         580              18
1198   id_998         463              14
1199   id_999         679              23

[1200 rows x 3 columns]


In [20]:
# Min, max and mean word-token count per summary, one value per output line.
print(
    ref['word_count'].min(),
    ref['word_count'].max(),
    ref['word_count'].mean(),
    sep="\n",
)

29
2412
609.9591666666666


In [21]:
# Min, max and mean sentence count per summary, one value per output line.
print(
    ref['sentence_count'].min(),
    ref['sentence_count'].max(),
    ref['sentence_count'].mean(),
    sep="\n",
)

1
73
19.948333333333334


In [23]:
# Number of rows whose Summary text is missing.
ref['Summary'].isnull().sum()

np.int64(0)

In [24]:
import re
# Lowercase each summary and squeeze every whitespace run into one space.
# Punctuation is left untouched here.
ref["clean_text"] = (
    ref["Summary"]
    .str.lower()
    .map(lambda summary: re.sub(r"\s+", " ", summary))
)
print(ref[["ID", "clean_text"]])

           ID                                         clean_text
0       id_10  the allahabad high court on thursday stayed th...
1     id_1000  a convict in gujarat who had secured bail in 2...
2     id_1001  a police officer failing in their fundamental ...
3     id_1002  the chhattisgarh high court recently observed ...
4     id_1003  the gujarat high court recently quashed a firs...
...       ...                                                ...
1195   id_995  facts sourced from a statement made by accused...
1196   id_996  the delhi high court recently directed mediati...
1197   id_997  the supreme court on friday passed an interim ...
1198   id_998  the bombay high court on thursday permitted bi...
1199   id_999  the karnataka high court recently set aside a ...

[1200 rows x 2 columns]


In [44]:
# Display the judgments frame ahead of the merge.
judg

Unnamed: 0,ID,Judgment
0,id_10,Case :- WRIT - C No. - 11383 of 2023\nPetition...
1,id_1000,Versus\nAppearance:\nand\nDate : 22/09/2023\n1...
2,id_1001,Non-Reportable\nCriminal Appeal No._________ o...
3,id_1002,---- Appellant\nVersus\nUmesh Sharma S/o Late...
4,id_1003,1Whether Reporters of Local Papers may be allo...
...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...
1196,id_996,% Reserved on: 15.03.2023\nPronounced on: 16.0...
1197,id_997,Reportable\nWrit Petition (C) No 961 of 2021\n...
1198,id_998,M/s. Biovet Private Limited ….Applicants\nIn t...


In [45]:
# Display the reference-summary frame ahead of the merge.
ref

Unnamed: 0,ID,Summary
0,id_10,The Allahabad High Court on Thursday stayed th...
1,id_1000,A convict in Gujarat who had secured bail in 2...
2,id_1001,A police officer failing in their fundamental ...
3,id_1002,The Chhattisgarh High Court recently observed ...
4,id_1003,The Gujarat High Court recently quashed a Firs...
...,...,...
1195,id_995,Facts sourced from a statement made by accused...
1196,id_996,The Delhi High Court recently directed mediati...
1197,id_997,The Supreme Court on Friday passed an interim ...
1198,id_998,The Bombay High Court on Thursday permitted Bi...


In [46]:
# Pair each judgment with its reference summary by ID (inner join).
# Only the raw text columns are carried over. The exploration cells above add
# word_count / sentence_count / clean_text to both frames, and merging those
# would produce suffixed duplicates (word_count_x, word_count_y, ...) that end
# up in the saved splits whenever the notebook is run top to bottom.
merged_df = judg[["ID", "Judgment"]].merge(ref[["ID", "Summary"]], on="ID", how="inner")

In [47]:
# Display the merged judgment/summary frame.
merged_df

Unnamed: 0,ID,Judgment,Summary
0,id_10,Case :- WRIT - C No. - 11383 of 2023\nPetition...,The Allahabad High Court on Thursday stayed th...
1,id_1000,Versus\nAppearance:\nand\nDate : 22/09/2023\n1...,A convict in Gujarat who had secured bail in 2...
2,id_1001,Non-Reportable\nCriminal Appeal No._________ o...,A police officer failing in their fundamental ...
3,id_1002,---- Appellant\nVersus\nUmesh Sharma S/o Late...,The Chhattisgarh High Court recently observed ...
4,id_1003,1Whether Reporters of Local Papers may be allo...,The Gujarat High Court recently quashed a Firs...
...,...,...,...
1195,id_995,(arising out of Special Leave Petition ( Crimi...,Facts sourced from a statement made by accused...
1196,id_996,% Reserved on: 15.03.2023\nPronounced on: 16.0...,The Delhi High Court recently directed mediati...
1197,id_997,Reportable\nWrit Petition (C) No 961 of 2021\n...,The Supreme Court on Friday passed an interim ...
1198,id_998,M/s. Biovet Private Limited ….Applicants\nIn t...,The Bombay High Court on Thursday permitted Bi...


In [48]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the merged pairs for validation; the fixed seed keeps the
# shuffled partition reproducible between runs.
train_df, val_df = train_test_split(
    merged_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [50]:
import json

def save_jsonl(df, path):
    """Write a DataFrame to `path` as JSON Lines (one row object per line).

    The file is created or truncated, encoded as UTF-8, and non-ASCII
    characters are written as-is rather than as \\u escapes.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record.to_dict(), ensure_ascii=False) + "\n"
            for _, record in df.iterrows()
        )

# Save both partitions to JSONL next to the original training files.
save_jsonl(train_df, "./dataset/train/train_split.jsonl")
save_jsonl(val_df, "./dataset/train/val_split.jsonl")

# The second file is the validation split (val_df -> val_split.jsonl), so the
# confirmation message names it as such.
print("Train/Val JSONL files saved!")


Train/Test JSONL files saved!


In [51]:
# Shapes of the training and validation partitions, one per output line.
print(train_df.shape, val_df.shape, sep="\n")

(960, 3)
(240, 3)


In [52]:
# Display the training partition.
train_df

Unnamed: 0,ID,Judgment,Summary
331,id_1498,and\nW.M.P.(MD)No.21615 of 2022\nS.Lawrence Vi...,Designation of a lawyer as a Senior Advocate i...
409,id_16,This file relates to reconsideration of the pr...,The Union Law Minister told the Supreme Court ...
76,id_1106,1PA./RG/Misc./2021 Date-24.06.202L\nConsiderin...,The Rajasthan High Court will start regular he...
868,id_539,IN THE HIGH COURT OF MADHYA PRADESHAT JABALPUR...,The Madhya Pradesh High Court recently quashed...
138,id_1205,Crl. A. No.1200/2023\nCriminal Appeal No.1200 ...,While society should be protected from drug-re...
...,...,...,...
1044,id_797,Haji Abdul Gani Khan & Anr. …Petitioners\nv.\n...,The Supreme Court on Monday dismissed the peti...
1095,id_863,Khalil Abbas Fakir ….. Applicant\nTabbasum Kha...,The Bombay High Court recently held that a div...
1130,id_911,(Criminal Revisional Jurisdiction)\nAppellate ...,The Calcutta High Court recently quashed a che...
860,id_527,1 CC.No.30751/2021\nDated this the 18th day of...,An Special Court in Bengaluru recently convict...


In [53]:
# Display the validation partition.
val_df

Unnamed: 0,ID,Judgment,Summary
1178,id_972,"Through: Mr. Hemant Daswani, Ms.\nSauyma Bajpa...",The Delhi High Court has upheld a single-judge...
865,id_533,ORDER : (Per the Hon’ble the Chief Justice Alo...,The Telangana High Court recently upheld the C...
101,id_1145,"Vasundhara d/o Praful Bhojane\nUnion of India,...",The Bombay High Court today issued notice in a...
439,id_1640,"Reportable\nHigh Court Bar Association, Allaha...",A Constitution bench of the Supreme Court on T...
58,id_1079,Applicant :- Phoolchandra Yadav And 2 Others\n...,The Allahabad High Court has granted bail to t...
...,...,...,...
382,id_1568,Ms. Kangana Ranaut ...Petitioner\nVersus\n1. T...,The Bombay High Court on Friday rejected a pet...
867,id_538,Versus\nAppearance:\nMS URVASHI K MEHTA(11469)...,A rape is a rape even if it is committed by th...
542,id_1791,Reportable\nCivil Appeal No. 8129 of 2022\nMad...,The Central government's refusal to renew broa...
1193,id_993,Writ Petition Nos.16650 of 2020\nand 144 48 of...,The Madras High Court on Monday upheld the con...
