In [1]:
import os 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from sentence_transformers import SentenceTransformer, util
# Sentence-embedding models, instantiated once here and shared by later cells.
# modelHindi is the multilingual paraphrase model; summarize_kmeans selects it
# when called with hindi=True.
modelHindi = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# modelEnglish is the MiniLM paraphrase model; summarize_kmeans selects it when
# called with hindi=False (the default).
modelEnglish = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
from deep_translator import GoogleTranslator

In [2]:
def summarize_kmeans(sentences, hindi=False):
    """Build an extractive summary by clustering sentence embeddings.

    The sentences are embedded, grouped into ``max(1, len(sentences) // 4)``
    K-Means clusters (fixed ``random_state=0``), and for every cluster the
    member sentence whose embedding has the highest cosine similarity to the
    cluster centroid is selected.

    Args:
        sentences: Iterable of sentence strings belonging to one document.
        hindi: When true, embed with ``modelHindi``; otherwise with
            ``modelEnglish``.

    Returns:
        A list of selected sentences, one per non-empty cluster, ordered by
        cluster index (not by position in the document). An empty input
        yields an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        # KMeans cannot be fitted on zero samples; nothing to summarize.
        return []

    model = modelHindi if hindi else modelEnglish
    # Encode the whole document in one call instead of sentence by sentence.
    embeddings = np.asarray(model.encode(sentences))

    k = max(1, len(sentences) // 4)
    # n_init is tied to k exactly as before.
    # NOTE(review): confirm that scaling the number of restarts with k is intended.
    kmeans = KMeans(n_clusters=k, n_init=k, random_state=0).fit(embeddings)
    labels = np.asarray(kmeans.predict(embeddings))

    summary_sentences = []
    for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
        members = np.flatnonzero(labels == cluster_id)
        if members.size == 0:
            # A cluster can end up without members (e.g. duplicated sentences);
            # it has no representative to contribute.
            continue
        similarities = cosine_similarity([centroid], embeddings[members])[0]
        # Highest similarity == closest to the centroid. The previous code used
        # min(), which returned the *least* representative sentence.
        representative = members[int(np.argmax(similarities))]
        summary_sentences.append(sentences[representative])
    return summary_sentences

In [3]:
def split_long_sentence(sentence, max_length=3000):
    """Cut ``sentence`` into consecutive pieces of at most ``max_length`` characters.

    Cut points are purely positional (every ``max_length`` characters), so a
    piece may end in the middle of a word. An empty ``sentence`` produces an
    empty list.
    """
    offsets = range(0, len(sentence), max_length)
    return [sentence[start:start + max_length] for start in offsets]

def convertToHindi(sentences, max_length=2000):
    """Translate every entry of ``sentences`` from English to Hindi.

    Each entry may be a single string or a sequence of string fragments;
    fragments are concatenated without a separator before translation. The
    resulting text is cut into chunks of at most ``max_length`` characters with
    ``split_long_sentence``, each chunk is translated separately, and the
    translated chunks are concatenated again.

    Args:
        sentences: Iterable of strings (or sequences of string fragments).
        max_length: Maximum number of characters sent to the translator in one
            request. Must be positive. Defaults to 2000.

    Returns:
        A list with one translated string per entry, in input order. An empty
        entry yields ``''`` without calling the translator.
    """
    # One translator instance is enough; it was previously rebuilt per chunk.
    translator = GoogleTranslator(source='en', target='hi')

    hindi = []
    for sentence_array in sentences:
        sentence = ''.join(sentence_array)
        # Text no longer than max_length comes back as a single chunk, so the
        # short and long cases share the same path.
        chunks = split_long_sentence(sentence, max_length=max_length)
        hindi.append(''.join(translator.translate(chunk) for chunk in chunks))
    return hindi

In [4]:
# Locations of the input CSV and of the directory that receives the results.
input_path="./data/summary/"
output_path="./data/summary_results/"
# makedirs(exist_ok=True) creates missing parent directories as well and does
# not fail when the directory already exists.
os.makedirs(output_path, exist_ok=True)

data = pd.read_csv(f'{input_path}data.csv')
# The two columns hold Python list literals stored as text.
# NOTE(security): eval executes arbitrary code found in the CSV. Only run this
# on trusted data, or switch to ast.literal_eval if the cells are plain literals.
sentences = data['sentences'].apply(eval)
sentences_english = data['sentences_english'].apply(eval)

summary=[]
summary_english=[]
# Iterate positionally over both columns so the loop does not depend on the
# DataFrame index labels being 0..n-1.
for hindi_sentences, english_sentences in tqdm(zip(sentences, sentences_english), total=len(sentences)):
    # Summarize the Hindi sentences directly with the multilingual model.
    summary.append(summarize_kmeans(hindi_sentences,hindi=True))
    # Summarize the English sentences, then translate that summary to Hindi.
    summary_english.append(convertToHindi(summarize_kmeans(english_sentences,hindi=False)))

data['summary'] = summary
data['summary_english'] = summary_english
data.to_csv(f'{output_path}kmeans.csv', index=False)

  0%|          | 0/1000 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
