In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm

import numpy as np
from umap.umap_ import UMAP
import joblib

from scipy.spatial import distance

# Number of texts tokenized and encoded per forward pass; used as the step of
# every batching loop in this notebook.
batch_size = 1000

# `settings` supplies DIR_PATH, the prefix that is string-concatenated in front
# of every pickle file name read or written below (so it must end with a path
# separator).
import settings
dir_path = settings.DIR_PATH

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two 1-D vectors.

    SciPy only exposes the cosine *distance* (``1 - similarity``), so the
    distance is computed first and then converted back into a similarity.

    Args:
        vector1: First vector (any 1-D array-like accepted by SciPy).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        ``1 - scipy.spatial.distance.cosine(vector1, vector2)``.
    """
    cosine_dist = distance.cosine(vector1, vector2)
    return 1 - cosine_dist

def mahalanobis_distance(x, y, inv_cov):
    """Return the Mahalanobis distance between ``x`` and ``y``.

    Thin wrapper around ``scipy.spatial.distance.mahalanobis``.

    Args:
        x: First 1-D vector.
        y: Second 1-D vector, same length as ``x``.
        inv_cov: Inverse of the covariance matrix that defines the metric.

    Returns:
        The distance value computed by SciPy.
    """
    result = distance.mahalanobis(x, y, inv_cov)
    return result

def average_pool(last_hidden_states, attention_mask):
    """Mean-pool token vectors along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the number of non-zero mask entries per row.

    Args:
        last_hidden_states: Tensor of per-token vectors; callers in this
            notebook pass ``outputs.last_hidden_state``
            (assumed ``(batch, seq, hidden)`` - TODO confirm).
        attention_mask: Tensor with one entry per token, non-zero for tokens
            that should contribute to the mean.

    Returns:
        Tensor with dim 1 reduced away: one pooled vector per input row.
    """
    # Add a trailing axis so the mask broadcasts over the hidden dimension.
    is_padding = ~attention_mask[..., None].bool()
    token_sum = last_hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_count = attention_mask.sum(dim=1)[..., None]
    return token_sum / token_count

In [None]:
# Load the source dataset; every embedding cell below selects its columns from
# `main_df`.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
main_df = pd.read_pickle(dir_path + 'dataset_v2.pkl')

## loading model

In [None]:
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and the encoder from the same checkpoint. Only the model
# is moved to `device`; tokenizer outputs are moved batch by batch in the
# embedding cells.
# NOTE(review): the multilingual-e5 model card asks for a "query: " or
# "passage: " prefix on every input text, while the embedding cells pass the
# raw text - confirm whether omitting the prefix is intentional.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)

## abstract embedding

In [None]:
# Keep only rows where the id, the cited title and both abstracts are present,
# then de-duplicate so that each distinct text is embedded once.
df = main_df[['_id','ref_title','abstract','ref_abstract']].dropna().copy()
df_a = df[['_id','abstract']].drop_duplicates()
df_b = df[['_id','ref_title','ref_abstract']].drop_duplicates()

# Embed both text columns with a single batching loop instead of two pasted
# copies. `dropna()` above already removed missing values, so no fill is
# needed before tokenizing.
pooled_by_column = []
for abstracts in (df_a['abstract'], df_b['ref_abstract']):
    embeddings = []
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        # .iloc makes the slice explicitly positional: after dropna /
        # drop_duplicates the integer index is no longer 0..n-1.
        batch_abstracts = abstracts.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_abstracts, max_length=256, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Send input to the model's device

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())  # Move embeddings back to CPU

    pooled_by_column.append(np.vstack(embeddings))

# Row k of each matrix corresponds to row k of df_a / df_b respectively.
all_embeddings, all_ref_embeddings = pooled_by_column

labels = np.array(['original'] * len(all_embeddings) + ['reference'] * len(all_ref_embeddings))

# Stack citing and cited embeddings so one UMAP fit can place both sets in the
# same 2-D space.
combined_embeddings = np.concatenate([all_embeddings, all_ref_embeddings])


In [None]:


# Fit one UMAP projection on citing + cited abstract embeddings together so
# both sets share the same 2-D coordinate system. random_state fixes the layout.
umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_results = umap_reducer.fit_transform(combined_embeddings)

# One list-valued cell per row so the full embedding can ride along in a
# single DataFrame column.
temp1 = pd.DataFrame([[row.tolist()] for row in all_embeddings], columns=['title_all_embeddings'])
temp2 = pd.DataFrame([[row.tolist()] for row in all_ref_embeddings], columns=['title_all_ref_embeddings'])

# The first len_all rows of the projection belong to df_a, the rest to df_b.
len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)
umap_all_embeddings = umap_results[:len_all]
umap_all_ref_embeddings = umap_results[len_all:len_all + len_ref]

temp = pd.DataFrame(umap_all_embeddings, columns=['umap_1','umap_2'])
df_a_umap = pd.concat([df_a.reset_index(drop = True), temp, temp1], axis = 1)

temp = pd.DataFrame(umap_all_ref_embeddings, columns=['umap_ref_1','umap_ref_2'])
df_b_umap = pd.concat([df_b.reset_index(drop = True), temp, temp2], axis = 1)

# Merge on exactly the columns df_a / df_b were de-duplicated on. A partial
# key (e.g. `_id` alone) would silently multiply rows whenever one id maps to
# more than one distinct text; validate= turns any such surprise into an error.
df_new = df.merge(
    df_a_umap[['_id','abstract','umap_1','umap_2','title_all_embeddings']],
    on = ['_id','abstract'], how = 'left', validate = 'many_to_one',
)
df_new = df_new.merge(
    df_b_umap[['_id','ref_title','ref_abstract','umap_ref_1','umap_ref_2','title_all_ref_embeddings']],
    on = ['_id','ref_title','ref_abstract'], how = 'left', validate = 'many_to_one',
)

df_new['cosine_similarity'] = df_new.apply(
    lambda row: cosine_similarity(row['title_all_embeddings'], row['title_all_ref_embeddings']), axis=1
)

# Covariance is estimated over every embedding that appears in df_new.
all_vectors = np.vstack([df_new['title_all_embeddings'].tolist(), df_new['title_all_ref_embeddings'].tolist()])
cov_matrix = np.cov(all_vectors.T)
# Pseudo-inverse instead of inv(): a high-dimensional sample covariance can be
# singular or badly conditioned, in which case inv() raises LinAlgError or
# returns numerically meaningless values.
inv_cov_matrix = np.linalg.pinv(cov_matrix)

df_new['mahalanobis_distance'] = df_new.apply(
    lambda row: mahalanobis_distance(row['title_all_embeddings'], row['title_all_ref_embeddings'], inv_cov_matrix), axis=1
)

# Persist only ids, titles, UMAP coordinates and the two metrics; the raw text
# and the full embeddings are dropped from the saved copy (df_new keeps them).
df_new.drop(columns = ['title_all_embeddings','title_all_ref_embeddings','abstract','ref_abstract']).to_pickle(dir_path + 'abstract_umap_v2.pkl')

df_new.head()



In [None]:
# Display the abstract-level result frame. It still contains the list-valued
# embedding columns, because the preceding cell dropped them only from the
# copy written to disk.
df_new

## title embedding

In [None]:

# Pair each citing title with the titles it references; rows missing any of
# the three columns are discarded, and each distinct text is embedded once.
df = main_df[['_id','title','ref_title']].dropna().copy()
df_a = df[['_id','title']].drop_duplicates()
df_b = df[['_id','ref_title']].drop_duplicates()

# Embed both text columns with a single batching loop instead of two pasted
# copies. `dropna()` above already removed missing values, so no fill is
# needed before tokenizing.
pooled_by_column = []
for abstracts in (df_a['title'], df_b['ref_title']):
    embeddings = []
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        # .iloc makes the slice explicitly positional: after dropna /
        # drop_duplicates the integer index is no longer 0..n-1.
        batch_abstracts = abstracts.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_abstracts, max_length=256, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Send input to the model's device

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())  # Move embeddings back to CPU

    # Concatenate all batch embeddings for this column
    pooled_by_column.append(np.vstack(embeddings))

# Row k of each matrix corresponds to row k of df_a / df_b respectively.
all_embeddings, all_ref_embeddings = pooled_by_column

labels = np.array(['original'] * len(all_embeddings) + ['reference'] * len(all_ref_embeddings))

combined_embeddings = np.concatenate([all_embeddings, all_ref_embeddings])

# Fit one UMAP projection on both sets so they share the same 2-D space.
umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_results = umap_reducer.fit_transform(combined_embeddings)

# One list-valued cell per row so the full embedding fits in a single column.
temp1 = pd.DataFrame([[row.tolist()] for row in all_embeddings], columns=['title_all_embeddings'])
temp2 = pd.DataFrame([[row.tolist()] for row in all_ref_embeddings], columns=['title_all_ref_embeddings'])

# The first len_all rows of the projection belong to df_a, the rest to df_b.
len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)
umap_all_embeddings = umap_results[:len_all]
umap_all_ref_embeddings = umap_results[len_all:len_all + len_ref]

temp = pd.DataFrame(umap_all_embeddings, columns=['umap_1','umap_2'])
df_a_umap = pd.concat([df_a.reset_index(drop = True), temp, temp1], axis = 1)

temp = pd.DataFrame(umap_all_ref_embeddings, columns=['umap_ref_1','umap_ref_2'])
df_b_umap = pd.concat([df_b.reset_index(drop = True), temp, temp2], axis = 1)

# Merge on exactly the columns df_a / df_b were de-duplicated on, so an `_id`
# that carries more than one distinct title cannot multiply rows; validate=
# raises if the right-hand keys are not unique.
df_new = df.merge(
    df_a_umap[['_id','title','umap_1','umap_2','title_all_embeddings']],
    on = ['_id','title'], how = 'left', validate = 'many_to_one',
)
df_new = df_new.merge(
    df_b_umap[['_id','ref_title','umap_ref_1','umap_ref_2','title_all_ref_embeddings']],
    on = ['_id','ref_title'], how = 'left', validate = 'many_to_one',
)

df_new['cosine_similarity'] = df_new.apply(
    lambda row: cosine_similarity(row['title_all_embeddings'], row['title_all_ref_embeddings']), axis=1
)

# Covariance is estimated over every embedding that appears in df_new.
all_vectors = np.vstack([df_new['title_all_embeddings'].tolist(), df_new['title_all_ref_embeddings'].tolist()])
cov_matrix = np.cov(all_vectors.T)
# Pseudo-inverse instead of inv(): a high-dimensional sample covariance can be
# singular or badly conditioned, in which case inv() raises LinAlgError or
# returns numerically meaningless values.
inv_cov_matrix = np.linalg.pinv(cov_matrix)

df_new['mahalanobis_distance'] = df_new.apply(
    lambda row: mahalanobis_distance(row['title_all_embeddings'], row['title_all_ref_embeddings'], inv_cov_matrix), axis=1
)

# The saved copy omits the citing title and the full embeddings.
df_new.drop(columns = ['title','title_all_embeddings','title_all_ref_embeddings']).to_pickle(dir_path + 'title_umap_v2.pkl')


## context embedding

In [None]:
# One row per citation occurrence, keeping only rows where every selected
# column is present. 'Pre Text' / 'Post Text' are presumably the passages
# before and after the citation - TODO confirm against the dataset builder.
df = main_df[['_id','title','ref_title','Pre Text','Post Text']].dropna().copy()
df_a = df[['_id','ref_title','Pre Text']].drop_duplicates()
df_b = df[['_id','ref_title','Post Text']].drop_duplicates()

# Embed both text columns with a single batching loop instead of two pasted
# copies. `dropna()` above already removed missing values, so no fill is
# needed before tokenizing.
pooled_by_column = []
for abstracts in (df_a['Pre Text'], df_b['Post Text']):
    embeddings = []
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        # .iloc makes the slice explicitly positional: after dropna /
        # drop_duplicates the integer index is no longer 0..n-1.
        batch_abstracts = abstracts.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_abstracts, max_length=256, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Send input to the model's device

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())  # Move embeddings back to CPU

    # Concatenate all batch embeddings for this column
    pooled_by_column.append(np.vstack(embeddings))

# Row k of each matrix corresponds to row k of df_a / df_b respectively.
all_embeddings, all_ref_embeddings = pooled_by_column

labels = np.array(['original'] * len(all_embeddings) + ['reference'] * len(all_ref_embeddings))

combined_embeddings = np.concatenate([all_embeddings, all_ref_embeddings])

# Fit one UMAP projection on both sets so they share the same 2-D space.
umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_results = umap_reducer.fit_transform(combined_embeddings)

# One list-valued cell per row so the full embedding fits in a single column.
temp1 = pd.DataFrame([[row.tolist()] for row in all_embeddings], columns=['title_all_embeddings'])
temp2 = pd.DataFrame([[row.tolist()] for row in all_ref_embeddings], columns=['title_all_ref_embeddings'])

# The first len_all rows of the projection belong to df_a, the rest to df_b.
len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)
umap_all_embeddings = umap_results[:len_all]
umap_all_ref_embeddings = umap_results[len_all:len_all + len_ref]

temp = pd.DataFrame(umap_all_embeddings, columns=['umap_1','umap_2'])
df_a_umap = pd.concat([df_a.reset_index(drop = True), temp, temp1], axis = 1)

temp = pd.DataFrame(umap_all_ref_embeddings, columns=['umap_ref_1','umap_ref_2'])
df_b_umap = pd.concat([df_b.reset_index(drop = True), temp, temp2], axis = 1)

# df_a is unique per (_id, ref_title, Pre Text) and df_b per
# (_id, ref_title, Post Text). Joining on `_id` alone (or without the text
# column) would pair each citation row with every context embedding of the
# same paper, inflating the row count and producing similarities between
# unrelated passages. Merge on the full de-duplication key instead; validate=
# raises if the right-hand keys are not unique.
df_new = df.merge(
    df_a_umap[['_id','ref_title','Pre Text','umap_1','umap_2','title_all_embeddings']],
    on = ['_id','ref_title','Pre Text'], how = 'left', validate = 'many_to_one',
)
df_new = df_new.merge(
    df_b_umap[['_id','ref_title','Post Text','umap_ref_1','umap_ref_2','title_all_ref_embeddings']],
    on = ['_id','ref_title','Post Text'], how = 'left', validate = 'many_to_one',
)

# Similarity between the text before and the text after each citation.
df_new['cosine_similarity'] = df_new.apply(
    lambda row: cosine_similarity(row['title_all_embeddings'], row['title_all_ref_embeddings']), axis=1
)

# The saved copy omits the citing title and the full embeddings.
df_new.drop(columns = ['title','title_all_embeddings','title_all_ref_embeddings']).to_pickle(dir_path + 'context_umap.pkl')