In [1]:
import json
import pandas as pd
def load_jsonl(path):
    """
    Read a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result, in file order.

    :param path: Path of the UTF-8 encoded JSON Lines file to read.
    :return: DataFrame built from the parsed records.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines (typically padding at the end of
            # the file) hold no record, and json.loads would raise on them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)


def save_jsonl(df, path):
    """
    Write a DataFrame to a JSON Lines file, one JSON object per row.

    Rows come from ``df.to_dict(orient="records")`` instead of
    ``df.iterrows()``. ``iterrows`` builds one Series per row and so coerces
    all values of that row to a common dtype (an integer sitting next to a
    float column would be written as ``1.0``); the records view keeps each
    column's own type.

    :param df: DataFrame to serialise; cell values must be JSON-serialisable.
    :param path: Destination file path. The file is overwritten and written
        as UTF-8 with non-ASCII characters kept verbatim.
    """
    with open(path, "w", encoding="utf-8") as f:
        for record in df.to_dict(orient="records"):
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [31]:

# Load the training judgements (one JSON record per line) into a DataFrame.
# The preview below shows one row per (ID, para_id) pair.
train_path='train/updated_train_gpt.jsonl'
judg=load_jsonl(train_path)

In [32]:
judg

Unnamed: 0,ID,para_id,newJudgement
0,id_10,0,"Case: Writ Petition No. 11383 of 2023, Petitio..."
1,id_1000,0,"Versus Appearance: and Date : 22/09/2023"", ..."
2,id_1001,0,Non-Reportable Criminal Appeal No. of 2024 (Sp...
3,id_1002,0,"Appellant: Umesh Sharma, son of Late Omprakash..."
4,id_1003,0,This application is filed under Section 482 of...
...,...,...,...
2498,id_996,0,By way of the present petition filed under Sec...
2499,id_996,1,"However, before parting with this case, the De..."
2500,id_997,0,Reportable Writ Petition (Civil) No 961 of 202...
2501,id_998,0,"Messrs. Biovet Private Limited, Applicants, in..."


In [33]:
# Collapse the per-paragraph rows into one row per ID: the paragraphs of a
# judgement are concatenated in their existing row order, separated by "\n ".
# NOTE(review): this relies on the rows already being ordered by para_id within
# each ID; confirm, or sort before grouping.
para_judg = judg.groupby("ID", as_index=False)["newJudgement"].apply( lambda x: "\n ".join(x))


In [38]:
# Remove every double-quote character from the merged judgement text
# (literal replacement, not a regex). Overwrites the column in place.
para_judg["newJudgement"] = para_judg["newJudgement"].astype(str).str.replace('"', '', regex=False)

In [39]:
para_judg

Unnamed: 0,ID,newJudgement
0,id_10,"Case: Writ Petition No. 11383 of 2023, Petitio..."
1,id_1000,"Versus Appearance: and Date : 22/09/2023, T..."
2,id_1001,Non-Reportable Criminal Appeal No. of 2024 (Sp...
3,id_1002,"Appellant: Umesh Sharma, son of Late Omprakash..."
4,id_1003,This application is filed under Section 482 of...
...,...,...
1195,id_995,Arising out of Special Leave Petition (Crimina...
1196,id_996,By way of the present petition filed under Sec...
1197,id_997,Reportable Writ Petition (Civil) No 961 of 202...
1198,id_998,"Messrs. Biovet Private Limited, Applicants, in..."


In [40]:
para_judg['newJudgement'].iloc[0]

"Case: Writ Petition No. 11383 of 2023, Petitioner: Syed Hamidul Bari, Respondent: State of Uttar Pradesh through Additional Chief/Principal Secretary, Housing and Urban Planning Department, Lucknow, and four others. Counsel for Petitioner: Kazim Ibrahim, Amrit Khare. Counsel for Respondent: Chief Standing Counsel, Ratnesh Chandra.,  Case: Writ Petition No. 11360 of 2023, Petitioner: Mohd. Naushad, Respondent: State of Uttar Pradesh through Additional Chief Secretary/Principal Secretary, Housing and Urban Planning Department, and four others. Counsel for Petitioner: Kazim Ibrahim, Amrit Khare. Counsel for Respondent: Chief Standing Counsel, Ratnesh Chandra.,  Case: Writ Petition No. 11362 of 2023, Petitioner: Mohammad Abrar, Respondent: State of Uttar Pradesh through Additional Chief/Principal Secretary, Housing and Urban Planning Department, Lucknow, and four others. Counsel for Petitioner: Kazim Ibrahim, Amrit Khare. Counsel for Respondent: Chief Standing Counsel, Ratnesh Chandra.,  

In [6]:
from transformers import BertTokenizer
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm





In [7]:
# `tokenizer` is used below to count tokens per sentence and to decode tokens
# back to text; `nlp` is the spaCy pipeline used for sentence splitting.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [8]:
# Cache of tokenizer output, keyed by the exact input string.
tokens_for_sentence = {}


def tokenize(text):
    """
    Return the tokens for ``text``, tokenizing each distinct string only once.

    Results are memoised in the module-level ``tokens_for_sentence`` dict, so
    repeated calls with the same string return the same cached list object.
    """
    if text in tokens_for_sentence:
        return tokens_for_sentence[text]

    pieces = tokenizer.tokenize(text)
    tokens_for_sentence[text] = pieces
    return pieces

In [9]:
def split_into_sentences(text):
    """
    Break ``text`` into sentences with the spaCy pipeline ``nlp``.

    Returns a list with the text of every sentence span spaCy reports, each
    stripped of leading and trailing whitespace.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())
    return sentences

In [20]:
def estimate_parts(tokens, max_length=600):
    """
    Return how many chunks of at most ``max_length`` tokens ``tokens`` needs.

    This is the ceiling of ``len(tokens) / max_length`` and never less than 1,
    so an empty token list still counts as one part.
    """
    whole_chunks, leftover = divmod(len(tokens), max_length)
    needed = whole_chunks + 1 if leftover else whole_chunks
    return needed if needed > 1 else 1

In [21]:
def get_sentence_from_token(sentence_tokens):
    """
    Turn a list of tokens back into a string.

    The tokens are mapped to vocabulary ids and decoded with the module-level
    ``tokenizer``. The result is the tokenizer's rendering of the tokens, so it
    is not guaranteed to match the original text character for character.
    """
    return tokenizer.decode(tokenizer.convert_tokens_to_ids(sentence_tokens))

In [22]:
def split_paragraph(paragraph, max_length=600, last=False, min_length=256):
    """
    Split a paragraph into parts of roughly equal token length, breaking only
    at sentence boundaries.

    The paragraph is split into sentences, each sentence is tokenized, and the
    number of parts is estimated from the total token count and ``max_length``.
    Sentences are then packed greedily into parts whose token budget is
    ``total_tokens // num_parts``. A single sentence longer than the budget
    gets a part of its own.

    :param paragraph: Paragraph text to split.
    :param max_length: Upper bound on tokens per part, used to estimate how
        many parts are needed.
    :param last: True when this is the final paragraph of the document; a short
        final paragraph is still split instead of being deferred.
    :param min_length: Paragraphs with fewer tokens than this (and ``last``
        false) are not split. Defaults to 256, the value previously hard-coded.
    :return: ``None`` when the paragraph is shorter than ``min_length`` tokens
        and ``last`` is false, signalling the caller to merge it with the next
        paragraph. Otherwise a list of parts, each a non-empty list of sentence
        strings as rendered by ``get_sentence_from_token``.
    """
    sentences = split_into_sentences(paragraph)
    per_sentence_tokens = [tokenize(sentence) for sentence in sentences]
    all_tokens = [token for tokens in per_sentence_tokens for token in tokens]
    if len(all_tokens) < min_length and not last:
        return None

    num_parts = estimate_parts(all_tokens, max_length)
    tokens_per_part = len(all_tokens) // num_parts

    parts = []
    current_part = []
    current_token_count = 0

    for sentence_tokens in per_sentence_tokens:
        # Close the running part when this sentence would overflow the budget.
        # Requiring a non-empty part avoids emitting an empty part when the
        # very first sentence alone is over budget; an empty part cannot be
        # embedded and compared downstream.
        if current_part and current_token_count + len(sentence_tokens) > tokens_per_part:
            parts.append(current_part)
            current_part = []
            current_token_count = 0

        current_part.append(get_sentence_from_token(sentence_tokens))
        current_token_count += len(sentence_tokens)

    # Always keep the trailing part. Comparing it by value with the previous
    # part would silently drop text when two consecutive parts happen to
    # contain identical sentences.
    if current_part:
        parts.append(current_part)

    return parts

In [28]:
def process_text(text, max_length=600):
    """
    Split a whole document into passages of sentences.

    The text is cut at newline characters and every non-blank paragraph is
    handed to ``split_paragraph``. When that returns ``None`` (the paragraph
    was deferred), the paragraph is prepended to the following one so the two
    are chunked together on the next iteration.

    :param text: Full document text with paragraphs separated by newlines.
    :param max_length: Token budget forwarded to ``split_paragraph``. Defaults
        to 600, the value that was previously hard-coded.
    :return: Flat list of passages in document order; each passage is one of
        the parts returned by ``split_paragraph``.
    """
    paragraphs = text.split('\n')
    last_index = len(paragraphs) - 1
    processed_paragraphs = []
    for index, paragraph in enumerate(paragraphs):
        if not paragraph.strip():
            continue
        is_last = index == last_index
        processed_paragraph = split_paragraph(paragraph, max_length, is_last)
        if processed_paragraph is None:
            # Carry the paragraph forward into the next slot. A deferral is
            # only expected when is_last is false, so index + 1 exists.
            paragraphs[index + 1] = paragraph + "\n" + paragraphs[index + 1]
            continue
        processed_paragraphs.extend(processed_paragraph)
    return processed_paragraphs

In [60]:
# Chunk the first merged judgement (row 0, id_10 in the preview above) into passages.
p=process_text(para_judg['newJudgement'].iloc[0])

In [59]:
def clean_processed_text(p_list):
    """
    Tidy every sentence of every passage.

    Each item is converted to ``str``, every ``'.,'`` is replaced by ``'.'``,
    and surrounding whitespace is stripped. The nesting of ``p_list`` (a list
    of passages, each a list of sentences) is preserved in the returned lists.
    """
    return [
        [str(sentence).replace('.,', '.').strip() for sentence in passage]
        for passage in p_list
    ]

In [61]:
# Clean the passages of the first judgement ('.,' -> '.', whitespace stripped).
p_list=clean_processed_text(p)

In [64]:
p_list[0]

['case : writ petition no. 11383 of 2023, petitioner : syed hamidul bari, respondent : state of uttar pradesh through additional chief / principal secretary, housing and urban planning department, lucknow, and four others.',
 'counsel for petitioner : kazim ibrahim, amrit khare.',
 'counsel for respondent : chief standing counsel, ratnesh chandra.',
 'case : writ petition no. 11360 of 2023, petitioner : mohd.',
 'naushad, respondent : state of uttar pradesh through additional chief secretary / principal secretary, housing and urban planning department, and four others.',
 'counsel for petitioner : kazim ibrahim, amrit khare.',
 'counsel for respondent : chief standing counsel, ratnesh chandra.',
 'case : writ petition no. 11362 of 2023, petitioner : mohammad abrar, respondent : state of uttar pradesh through additional chief / principal secretary, housing and urban planning department, lucknow, and four others.',
 'counsel for petitioner : kazim ibrahim, amrit khare.',
 'counsel for re

In [81]:
len(p_list)

9

In [75]:
# Load the reference summaries; the preview below shows one row per ID.
summ=load_jsonl('train/train_ref_summ.jsonl')

In [76]:
summ

Unnamed: 0,ID,Summary
0,id_10,The Allahabad High Court on Thursday stayed th...
1,id_1000,A convict in Gujarat who had secured bail in 2...
2,id_1001,A police officer failing in their fundamental ...
3,id_1002,The Chhattisgarh High Court recently observed ...
4,id_1003,The Gujarat High Court recently quashed a Firs...
...,...,...
1195,id_995,Facts sourced from a statement made by accused...
1196,id_996,The Delhi High Court recently directed mediati...
1197,id_997,The Supreme Court on Friday passed an interim ...
1198,id_998,The Bombay High Court on Thursday permitted Bi...


In [77]:
# Reference summary of the first row (id_10 in the preview above).
s=summ['Summary'].iloc[0]

In [78]:
# Sentences of that summary; each one is matched against the passages below.
sent=split_into_sentences(s)

In [80]:
len(sent)

23

In [83]:


import torch

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Sentence-embedding model used for the similarity search; the last
# expression moves it to the chosen device and displays the model.
model = SentenceTransformer('all-MiniLM-L6-v2')
model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [84]:
def encode_texts(model, texts):
    """
    Embed ``texts`` with ``model``.

    :param model: Object exposing an ``encode(texts)`` method.
    :param texts: List of texts (sentences or passages) to embed.
    :return: Whatever ``model.encode`` returns for ``texts``.
    """
    embeddings = model.encode(texts)
    return embeddings

In [85]:
def find_most_relevant_passage_sentence_level(model, input_sentence, passages):
    """
    Rank passages by how similar their best-matching sentence is to
    ``input_sentence`` and return the top three.

    Every distinct, non-empty passage is embedded sentence by sentence; the
    passage's score is the maximum cosine similarity between the input sentence
    and any of its sentences. Repeated passages are reported on stdout and
    scored only once. Ties are resolved in favour of the earlier passage.

    :param model: Loaded embedding model, passed through to ``encode_texts``.
    :param input_sentence: Sentence to find relevant passages for.
    :param passages: List of passages, each being a list of sentence strings.
    :return: Dict with keys ``highest_similarity``,
        ``second_highest_similarity``, ``third_highest_similarity`` (scores)
        and ``most_relevant_passage``, ``second_most_relevant_passage``,
        ``third_most_relevant_passage`` (sentences joined with spaces). When
        fewer than three distinct passages are available, the unfilled ranks
        get a score of ``-1`` and an empty string as passage.
    """
    sentence_embedding = encode_texts(model, [input_sentence])

    seen_passages = set()
    scored = []  # (best similarity in passage, passage index), in passage order

    for i, passage in enumerate(passages):
        # An empty passage has nothing to embed, and cosine_similarity rejects
        # an empty array, so it can never be a match.
        if not passage:
            continue

        passage_key = tuple(passage)
        if passage_key in seen_passages:
            print("------------------duplicate-----------------")
            continue
        seen_passages.add(passage_key)

        passage_embeddings = encode_texts(model, passage)
        similarities = cosine_similarity(sentence_embedding, passage_embeddings)
        scored.append((similarities.max(), i))

    # sorted() is stable even with reverse=True, so passages with equal scores
    # keep their original order and the earlier one ranks higher.
    top = sorted(scored, key=lambda item: item[0], reverse=True)[:3]

    # Pad missing ranks explicitly. Indexing `passages` with a -1 sentinel
    # would silently return the last passage instead of "no match".
    top += [(-1, None)] * (3 - len(top))
    (first_sim, first_idx), (second_sim, second_idx), (third_sim, third_idx) = top

    def _passage_text(index):
        # Join the sentences of the passage at `index`; empty for a missing rank.
        return " ".join(passages[index]) if index is not None else ""

    similarity_dict = {
        'highest_similarity': first_sim,
        'second_highest_similarity': second_sim,
        'third_highest_similarity': third_sim,
        'most_relevant_passage': _passage_text(first_idx),
        'second_most_relevant_passage': _passage_text(second_idx),
        'third_most_relevant_passage': _passage_text(third_idx),
    }
    return similarity_dict

In [89]:
# For every sentence of the first reference summary, record the three passages
# of the first judgement that match it best, together with their scores.
# NOTE(review): paper_id is hard-coded to 10, which matches the id_10 row chosen
# with .iloc[0] above; it must change if another document is processed.
final=[]
for x in sent:
    sim_dic=find_most_relevant_passage_sentence_level(model, x, p_list)
    tosave = {"paper_id":10, "summary_sentence": x}
    tosave.update(sim_dic)
    final.append(tosave)

In [90]:
len(final)

23

In [91]:
final

[{'paper_id': 10,
  'summary_sentence': 'The Allahabad High Court on Thursday stayed the Lucknow Development Authority (LDA)’s decision to carry out large scale demolitions in Akbar Nagar-I and II areas and directed LDA to first initiate the resettlement process for those going to be affected [Syed Hamidul Bari v. State of UP].',
  'highest_similarity': np.float32(0.64394534),
  'second_highest_similarity': np.float32(0.5752126),
  'third_highest_similarity': np.float32(0.5037377),
  'most_relevant_passage': 'at this stage, it is not clear as to what is the tearing hurry in which huge occupations by the relatively poor class of persons are being proposed to be demolished forthwith without even waiting for the scheme of relocating the adversely affected persons being implemented in letter and spirit and also exposing the poorest of the poor to the ensuing harsh winters. as prima facie, the rights flowing from article 21 of the constitution of india, which includes the right to earn live