In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch
import tqdm

from sentence_transformers import SentenceTransformer, util
# Load the sentence-embedding model once at notebook level; `process()` below
# reads it through the global name `model` and calls `model.encode(...)` on it.
# NOTE(review): the checkpoint name suggests a multilingual paraphrase model —
# confirm it covers the language of the input data.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [2]:
# Directory that `process()` reads the input ``*.json`` files from.
base_path = "./data/raw/"
# Directory that `process()` writes the ranked ``*.csv`` files to.
write_path = "./data/ranked/"

def process(file, sampleCount=None):
    """Rank each record's fact/argument sentences by similarity to its judge opinion.

    Reads ``base_path/file`` (JSON), builds one DataFrame row per top-level
    JSON key, and for every row:

    1. splits each paragraph in ``segments['facts-and-arguments']`` into
       sentences on the danda character ('।'), dropping blank fragments;
    2. joins ``segments['judge-opinion']`` into a single string;
    3. embeds both with the global ``model`` and sorts the sentences by
       descending cosine similarity to the opinion.

    The ranked lists are stored in a new ``'ranked-sentences'`` column and the
    frame is written to ``write_path`` under the same file name with a
    ``.csv`` extension.

    Args:
        file: Name of the JSON file inside ``base_path``.
        sampleCount: Number of leading rows to process. ``None`` (the default)
            processes every row.

    Returns:
        None. The result is written to disk.

    Note:
        Each record is expected to carry a ``'segments'`` mapping with the
        keys ``'facts-and-arguments'`` and ``'judge-opinion'``, both iterables
        of strings — this is what the code indexes; the schema itself is not
        validated here.
    """
    # `import tqdm` alone does not load the `tqdm.notebook` submodule; it was
    # only reachable when another package happened to import it first.
    from tqdm.notebook import tqdm as notebook_tqdm

    print(f"Current file:{file}")
    # The text is split on a Devanagari character below, so decode explicitly
    # as UTF-8 instead of relying on the platform's default encoding.
    with open(os.path.join(base_path, file), encoding="utf-8") as f:
        data = json.load(f)
    df = pd.DataFrame(data).transpose()
    if sampleCount is not None:
        df = df.head(sampleCount)

    ranked_sentences = []
    for _, row in notebook_tqdm(df.iterrows(), total=len(df)):
        segments = row["segments"]

        src_sents = []
        for para in segments['facts-and-arguments']:
            # Keep the sentence text untouched, but skip fragments that are
            # empty or whitespace-only (e.g. what follows a trailing danda).
            src_sents.extend(s for s in para.split('।') if s.strip())

        if not src_sents:
            # Nothing to rank; an empty embedding batch cannot be compared
            # against the opinion vector, so record an empty ranking instead.
            ranked_sentences.append([])
            continue

        opinion = [" ".join(segments['judge-opinion'])]
        sent_emb = model.encode(src_sents)
        opinion_emb = model.encode(opinion)
        # Shape (1, len(src_sents)): similarity of the opinion to each sentence.
        cos_sim = util.cos_sim(opinion_emb, sent_emb)
        # Negate so that argsort yields most-similar-first.
        order = np.argsort(-1 * cos_sim[0].numpy())
        ranked_sentences.append([src_sents[j] for j in order])

    # `assign` returns a new frame, so the column is never written onto a
    # `head()` slice of the original.
    df = df.assign(**{'ranked-sentences': ranked_sentences})
    csv_name = os.path.splitext(file)[0] + ".csv"
    df.to_csv(os.path.join(write_path, csv_name))

# Create the output directory if needed. `exist_ok=True` tolerates an existing
# directory while still surfacing real failures (e.g. permission errors),
# which a bare `except: pass` would hide.
os.makedirs(write_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order, but the sample counts below
# are paired with the files by position. Sorting makes the pairing
# deterministic: "test.json" < "train.json" < "val.json".
files = sorted(f for f in os.listdir(base_path) if f.endswith(".json"))

# Test, Train, Val split
sampleCount = [100, 1000, 100]
for file, count in zip(files, sampleCount):
    process(file, count)


Current file:test.json


  0%|          | 0/100 [00:00<?, ?it/s]

Current file:train.json


  0%|          | 0/1000 [00:00<?, ?it/s]

Current file:val.json


  0%|          | 0/100 [00:00<?, ?it/s]