In [7]:
import os 
import pandas as pd

# Directory the train/test/val CSVs are read from.
input_path = "./data/ranked/"
# Directory the CSVs with the added summary column are written to.
output_path = "./data/summaries_kmeans/"

# makedirs also creates missing parent directories, and exist_ok=True turns the
# call into a no-op when the directory is already there, so this cell can be
# re-run safely without a separate (race-prone) existence check.
os.makedirs(output_path, exist_ok=True)

def clean_text(row):
    """Break a collection of text chunks into individual sentences.

    Each chunk is stripped and split on the danda character ('।'); every
    resulting piece is stripped again and empty pieces are discarded.

    Args:
        row: iterable of strings.

    Returns:
        list of non-empty, whitespace-trimmed sentence strings, in the order
        they appear in ``row``.
    """
    sentences = []
    for chunk in row:
        for piece in chunk.strip().split('।'):
            trimmed = piece.strip()
            # Splitting leaves empty strings around leading/trailing/double dandas.
            if trimmed:
                sentences.append(trimmed)
    return sentences



def _load_split(name, limit=None):
    """Load one split CSV and derive its per-document sentence lists.

    Args:
        name: file stem of the split ('train', 'test' or 'val'); the file is
            read from ``{input_path}{name}.csv``.
        limit: keep only the first ``limit`` rows; ``None`` keeps every row.

    Returns:
        (frame, sentences): the DataFrame with its 'segments' column parsed,
        and a Series holding ``clean_text`` applied to each row's
        'facts-and-arguments' segment.
    """
    frame = pd.read_csv(f'{input_path}{name}.csv')
    if limit is not None:
        # .copy() detaches the subset from the full frame so that adding or
        # overwriting columns (here and later by callers) does not trigger
        # pandas' SettingWithCopyWarning.
        frame = frame.head(limit).copy()

    # NOTE(review): eval executes whatever Python expression is stored in the
    # CSV cell. That is only acceptable for trusted, locally produced files;
    # consider ast.literal_eval if the cells are plain literals - confirm first.
    frame['segments'] = frame['segments'].apply(eval)
    sentences = frame['segments'].apply(
        lambda segments: clean_text(segments['facts-and-arguments'])
    )
    return frame, sentences


def get_sentences(limit=1):
    """Load the train/test/val splits and their cleaned sentences.

    Args:
        limit: number of leading rows to keep from each split. The default of
            1 reproduces the original hard-coded ``head(1)`` truncation (a
            quick smoke-test setting); pass ``None`` to process every row.

    Returns:
        A 6-tuple ``(train, test, val, train_sentences, val_sentences,
        test_sentences)``. Note the differing order: the frames come back as
        train/test/val, the sentence Series as train/val/test.
    """
    train, train_sentences = _load_split('train', limit)
    test, test_sentences = _load_split('test', limit)
    val, val_sentences = _load_split('val', limit)

    return train, test, val, train_sentences, val_sentences, test_sentences

# Load the three splits together with their cleaned sentence Series.
# Mind the order returned by get_sentences: frames are train/test/val,
# sentence Series are train/val/test.
(
    train,
    test,
    val,
    train_sentences,
    val_sentences,
    test_sentences,
) = get_sentences()

In [8]:
# Imports for the clustering / summarisation cell.
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder shared as a module-level global; summarize() calls
# model.encode() on each document's sentences.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')


def summarize(sentences):
    """Select one representative sentence per K-Means cluster.

    The sentences are embedded with the module-level ``model``, clustered into
    ``len(sentences) // 4`` groups (but at least one), and for each cluster the
    sentence whose embedding has the highest cosine similarity to the cluster
    centroid is picked.

    Args:
        sentences: sequence of sentence strings.

    Returns:
        list of the selected sentences, ordered by cluster index (not by their
        position in ``sentences``). An empty input yields an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to embed or cluster.
        return []

    embeddings = np.asarray(model.encode(sentences))

    # len // 4 is 0 for fewer than four sentences, which KMeans rejects
    # (n_clusters and n_init must both be >= 1), so clamp to one cluster.
    k = max(1, len(sentences) // 4)
    kmeans = KMeans(n_clusters=k, n_init=k, random_state=0).fit(embeddings)
    cluster_centers = kmeans.cluster_centers_
    cluster_indices = kmeans.predict(embeddings)

    summary_sentences = []
    for i in range(k):
        member_positions = np.flatnonzero(cluster_indices == i)
        if member_positions.size == 0:
            # A centroid may end up with no assigned sentence (e.g. duplicate
            # embeddings); there is nothing to pick from such a cluster.
            continue

        # One vectorised call gives the similarity of every member to the centroid.
        similarities = cosine_similarity(
            [cluster_centers[i]], embeddings[member_positions]
        )[0]
        # Highest cosine similarity == closest to the centroid. (Taking the
        # minimum here would return the least representative sentence.)
        closest_sentence_idx = member_positions[int(np.argmax(similarities))]
        summary_sentences.append(sentences[closest_sentence_idx])
    return summary_sentences

# Summarise every document of each split. Iterating the Series directly walks
# its values in positional order; indexing with series[i] and a positional
# counter is a *label* lookup and only works while the index is exactly 0..n-1.
train_summary = [summarize(doc_sentences) for doc_sentences in train_sentences]
val_summary = [summarize(doc_sentences) for doc_sentences in val_sentences]
test_summary = [summarize(doc_sentences) for doc_sentences in test_sentences]

# Attach the summaries (one list of sentences per row) to their frames.
train['summary'] = train_summary
val['summary'] = val_summary
test['summary'] = test_summary

# Persist each split, now including the 'summary' column, under output_path.
train.to_csv(f'{output_path}train.csv', index=False)
val.to_csv(f'{output_path}val.csv', index=False)
test.to_csv(f'{output_path}test.csv', index=False)
