In [3]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses
import pandas as pd
from sentence_splitter import SentenceSplitter, split_text_into_sentences
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.spatial import distance
from scipy.spatial.distance import cdist
import time
import numpy as np
import pickle
import os
from tqdm.auto import tqdm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
#new_model_name = "./Data/Demagog/Models/d-bert-2021-11-27-12-09-46"
base_model_name = "all-mpnet-base-v2"           #"./Data/Bert/fine-tuned-bert-2022-02-21-16-20-51"   #"all-mpnet-base-v2"          #"paraphrase-multilingual-mpnet-base-v2" all-mpnet-base-v2
data_import_path = "./Data/Preprocessed/data_with_filled_explanations_17.2.2022_no_duplicates_preprocessed.pickle"
source_col_name = "explanation_prep"  #"explanation_prep"  "statement_explanation_prep"                     #"source_text_shorter"  # "source_text_shorter" source_text
targer_col_name = "shortExplanation_prep"                         #"target_text"
neighbours = 4
data_with_similarity_dir = "./Data/Similarity/data_{}_{}.pickle".format(source_col_name,neighbours)
data_embeddings_dir = "./Data/Embeddings/embed_{}_{}.pkl".format(source_col_name, targer_col_name)
min_no_sentence_source_text = 3 

In [6]:
torch.cuda.get_device_name("cuda:0")

'A100-SXM4-40GB'

In [7]:
def add_multi_index(lst):
    return list(range(lst))

def select_longer_claims(df):
    
    #to_del = df[(df.type == "train") & (df.source_text_sentences_len <= min_no_sentence_source_text)].id.to_list()
    #df = df[~ df.id.isin(to_del)]
    df = df.explode(['source_text_sentences',"source_text_sentences_index"])
    
    return df

def split_data(dat, sour_col, targ_col):  
    
    dat["label"] = list(range(len(dat)))
    dat['source_text'] = dat[sour_col]
    dat['target_text'] = dat[targ_col]

    splitter = SentenceSplitter(language='en')
    dat['source_text_sentences'] = dat['source_text'].apply(lambda x : splitter.split(text = x))
    dat['source_text_sentences_len'] = dat['source_text_sentences'].str.len()
    dat['source_text_sentences_index'] = dat['source_text_sentences_len'].apply(lambda x : add_multi_index(x))

    #dat = dat[["id","source_text", "target_text", "source_text_sentences", "source_text_sentences_len","source_text_sentences_index","type"]]
    return dat


def embeddings_sentence_bert(text, IsBase, Bert_name):  
    
        start = time.time()
        if IsBase==True:            
            model = SentenceTransformer(Bert_name, device = 'cuda:0')  # model  bert-base-uncased           
        else:     
                     
            word_embedding_model = models.Transformer(Bert_name)
            # Apply mean pooling to get one fixed sized sentence vector
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                           pooling_mode_mean_tokens=True,
                                           pooling_mode_cls_token=False,
                                           pooling_mode_max_tokens=False)
            model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device = 'cuda:0')

        
        #Sentences are encoded by calling model.encode()
        sentence_vectors = model.encode(text,show_progress_bar=True, batch_size = 500)            

        end = time.time()

        print("Time for creating "+ str(len(sentence_vectors))+" embedding vectors " + str((end - start)/60))
        print('Model used :'+ Bert_name )

        return sentence_vectors
    
def get_similarity_matrix(df,metric = "cosine"):
     
    df = df.to_list()    
    A =  np.array(df,dtype=float)
    A_sparse = sparse.csr_matrix(A)

    if (metric=="cosine"):
        similarities = cosine_similarity(A_sparse)
        similarities_norm = (1/(1+similarities))                #      (1/(1+similarities))
    elif(metric=="euclidean"):
        similarities = euclidean_distances(A_sparse)
        similarities_norm= (1/(1+similarities))
    return np.mean(similarities_norm, axis=0)

def get_lof_score(dat):
    ad = []
    try:
        lof = LocalOutlierFactor(n_neighbors = neighbours,metric='cosine')
        embeds = dat.to_list()
        lof.fit_predict(embeds).tolist()
        return 1 - (1/(1-(lof.negative_outlier_factor_)))
    except:
        return [0.51]*len(dat)

def create_embeddings(df):
    sentences_lst = df["source_text_sentences"].tolist() #answer
    embeddings = embeddings_sentence_bert(sentences_lst, True, base_model_name)
    df["source_text_sentences_embed_base"] =  embeddings.tolist()
    return df

In [8]:
data = pd.read_pickle(data_import_path)
len(data)

12891

In [9]:
data.describe()

Unnamed: 0,id,statementTokensLength,explanationTokensLength,shortExplanationTokensLength,statementexplanationTokensLength
count,12891.0,12891.0,12891.0,12891.0,12891.0
mean,6491.162672,18.412691,775.487937,85.352416,793.333721
std,3752.628466,8.066301,288.562379,42.275294,288.266961
min,1.0,3.0,23.0,2.0,45.0
25%,3241.5,13.0,570.0,58.0,588.0
50%,6481.0,17.0,736.0,80.0,755.0
75%,9744.5,23.0,934.0,106.0,952.0
max,12994.0,73.0,2912.0,1121.0,2899.0


In [10]:
data = split_data(data, source_col_name, targer_col_name)

In [11]:
data_shorter = select_longer_claims(data)
len(data_shorter)

465236

In [12]:
data_shorter.to_pickle("Data/backup/data150.pickle")

In [13]:
data_shorter = pd.read_pickle("Data/backup/data150.pickle")

In [14]:
data_shorter.columns

Index(['id', 'reviewer', 'date', 'statement', 'explanation',
       'shortExplanation', 'truth_o_meter', 'tags', 'url', 'statement_prep',
       'explanation_prep', 'shortExplanation_prep',
       'statement_explanation_prep', 'statementTokensLength',
       'explanationTokensLength', 'shortExplanationTokensLength',
       'statementexplanationTokensLength', 'label', 'source_text',
       'target_text', 'source_text_sentences', 'source_text_sentences_len',
       'source_text_sentences_index'],
      dtype='object')

In [15]:
#model = SentenceTransformer(base_model_name, device = 'cuda:0')
sentences_lst = data_shorter["source_text_sentences"].tolist()

In [16]:
len(data_shorter)

465236

In [17]:
embeddings = embeddings_sentence_bert(sentences_lst, True, base_model_name)               #  model.encode(sentences_lst, show_progress_bar=True, batch_size = 500)  

Batches:   0%|          | 0/931 [00:00<?, ?it/s]

Time for creating 465236 embedding vectors 2.89707262913386
Model used :all-mpnet-base-v2


In [18]:
#with open(data_embeddings_dir, 'wb') as f:
#    pickle.dump(embeddings, f)

# Read data and convert huge numpy array to list of lists

In [19]:
#data_shorter = pd.read_pickle("data77.pickle")
#with open(data_embeddings_dir, 'rb') as f:
#    embeddings = pickle.load(f)

In [20]:
data = pd.DataFrame([[i] for i in tqdm(embeddings)]).rename(columns={0:'source_text_sentences_embed_base'})

  0%|          | 0/465236 [00:00<?, ?it/s]

In [21]:
data_shorter = pd.concat([data_shorter.reset_index(), data.reset_index()], axis=1)

In [22]:
#data_shorter

In [23]:
lof = LocalOutlierFactor(n_neighbors = neighbours, metric = 'cosine')

In [24]:
from tqdm import tqdm
tqdm.pandas()

In [25]:
data_shorter["LOF_base"] = data_shorter.groupby('id',sort = False)['source_text_sentences_embed_base'].progress_apply(get_lof_score).explode().to_list()
data_shorter.to_pickle(data_with_similarity_dir)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12891/12891 [00:13<00:00, 964.26it/s]


In [26]:
data_shorter.columns

Index(['index', 'id', 'reviewer', 'date', 'statement', 'explanation',
       'shortExplanation', 'truth_o_meter', 'tags', 'url', 'statement_prep',
       'explanation_prep', 'shortExplanation_prep',
       'statement_explanation_prep', 'statementTokensLength',
       'explanationTokensLength', 'shortExplanationTokensLength',
       'statementexplanationTokensLength', 'label', 'source_text',
       'target_text', 'source_text_sentences', 'source_text_sentences_len',
       'source_text_sentences_index', 'index',
       'source_text_sentences_embed_base', 'LOF_base'],
      dtype='object')

# Concatenate without embeddings (if too big numpy array)

In [2]:
data  = pd.read_pickle("./Data/Similarity/data_whole_title_embed_text_title_30K.pickle")
data_shorter  = pd.read_pickle("./Data/Similarity/data_whole_title_embed_text_title_150K.pickle")

In [3]:
data = data[data.type != "train"]

In [4]:
data.drop('source_text_sentences_embed_base', inplace=True, axis=1)
data_shorter.drop('source_text_sentences_embed_base', inplace=True, axis=1)

In [6]:
data_res = pd.concat([data, data_shorter], axis=0)

In [10]:
data_res.to_pickle(data_with_similarity_dir)