In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

from LexRank import degree_centrality_scores

In [2]:
# Directory prefix under which generated artifacts (the embeddings pickle) are
# written. Override it per machine with the LEXRANK_MODEL_PATH environment
# variable (for instance LEXRANK_MODEL_PATH=model/) instead of editing this
# cell; when the variable is unset the original location is used.
# os.path.join(..., '') guarantees a trailing separator, which the later
# f-string that builds the output file name relies on.
model_path = os.path.join(
    os.environ.get('LEXRANK_MODEL_PATH', '/media/marcin/Dane/model/'),
    '',
)

In [4]:
# Fetch the NLTK 'punkt' package (the recorded output shows this is a no-op
# when the package is already up to date in the local nltk_data directory).
nltk.download('punkt')
# Load the Polish Punkt pickle; embed_doc uses this object's tokenize() to
# split each document into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

[nltk_data] Downloading package punkt to /home/marcin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Read the parsed corpus. The first CSV column becomes the index, and
# na_filter=False disables missing-value detection so empty cells stay as
# empty strings rather than NaN.
df = pd.read_csv(
    'parsed/corpus/all.csv',
    index_col=0,
    na_filter=False,
)
# One entry per corpus row, taken from the 'text' column.
docs = df['text'].tolist()

In [6]:
# Sentence-embedding model used by embed_doc to encode sentences (and, in its
# fallback path, whole documents). It is loaded by pretrained-model name, not
# from model_path.
# NOTE(review): presumably downloaded/cached by sentence-transformers on first
# use - confirm for offline runs.
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [9]:
# Documents for which the sentence-centrality embedding could not be computed
# and the whole-document embedding was used instead.
docs_failed = []

def embed_doc(doc, top_k=5):
    """Embed a document as the mean embedding of its most central sentences.

    The document is split into sentences with the module-level ``tokenizer``,
    every sentence is encoded with the module-level ``model``, and the
    sentences are ranked by the scores ``degree_centrality_scores`` returns
    for their pair-wise cosine-similarity matrix.

    Args:
        doc: Text of the document.
        top_k: How many of the highest-scoring sentences to average.
            Defaults to 5, the value that used to be hard-coded. Documents
            with fewer sentences use all of them.

    Returns:
        A torch tensor holding the mean embedding of the selected sentences.
        If the document contains no sentences, or the centrality computation
        raises, ``doc`` is appended to ``docs_failed`` and the embedding of
        the whole ``doc`` string is returned instead.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError('top_k must be a positive integer')

    # Split the document into sentences
    sentences = tokenizer.tokenize(doc)

    # The corpus is read with na_filter=False, so empty documents are
    # possible. With no sentences there is nothing to rank or average, so
    # use the same whole-document fallback as a failed centrality computation.
    if not sentences:
        docs_failed.append(doc)
        return model.encode(doc, convert_to_tensor=True)

    # Compute the sentence embeddings
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Compute the pair-wise cosine similarities. Tensor.numpy() only works on
    # CPU tensors, so move the result off the GPU first; .cpu() is a no-op
    # when the tensor is already on the CPU.
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

    # Compute the centrality for each sentence
    try:
        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
    except Exception:
        # Best effort: keep going with the whole-document embedding and
        # remember the document so it can be inspected afterwards.
        print('Failed to calculate eigenvector - returning whole doc embedding')
        docs_failed.append(doc)
        return model.encode(doc, convert_to_tensor=True)

    # argsort on the negated scores puts the highest-scoring sentence first.
    # Keep the top_k indices and restore document order before indexing.
    top_indices = np.sort(np.argsort(-centrality_scores)[:top_k])

    # Return mean embedding of the selected sentences
    return embeddings[top_indices.tolist()].mean(dim=0)

# Embed every document in corpus order; tqdm wraps the iterable to show progress.
embeddings_all = list(map(embed_doc, tqdm(docs)))

 66%|██████▌   | 27581/41930 [51:00<20:24, 11.72it/s]

In [None]:
# Persist the list of document embeddings next to the model artifacts, using
# the most compact pickle protocol this Python version supports.
with open(f'{model_path}embeddings-lexrank.pkl', mode='wb') as out_file:
    pickle.dump(
        embeddings_all,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )