In [22]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm

import numpy as np
import umap
import joblib

from scipy.spatial import distance

dir_path = '../data/PST/'

In [23]:
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two 1-D vectors.

    SciPy only exposes the cosine *distance*, so the similarity is obtained
    by subtracting that distance from 1.
    """
    cosine_dist = distance.cosine(vector1, vector2)
    return 1 - cosine_dist

def mahalanobis_distance(x, y, inv_cov):
    """Return the Mahalanobis distance between ``x`` and ``y``.

    ``inv_cov`` is handed to SciPy as the ``VI`` argument, i.e. it must already
    be the *inverse* of the covariance matrix.
    """
    return distance.mahalanobis(u=x, v=y, VI=inv_cov)

# Mask-aware mean pooling over the token axis
def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states of the attended tokens of each sequence.

    Positions whose ``attention_mask`` entry is zero are zeroed out before the
    sum over ``dim=1``; the sum is then divided by the per-row mask total.
    Presumably ``last_hidden_states`` is (batch, seq, hidden) and
    ``attention_mask`` is (batch, seq) — confirm against the caller.
    """
    ignored_positions = ~attention_mask.unsqueeze(-1).bool()
    zeroed_hidden = last_hidden_states.masked_fill(ignored_positions, 0.0)
    attended_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed_hidden.sum(dim=1) / attended_counts

## loading model

In [24]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Load the tokenizer and the encoder from the same checkpoint, then move the
# encoder onto the selected device
checkpoint = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model = model.to(device)

Using device: cuda




## abstract embedding

In [25]:
# Abstract-level pipeline: embed every paper abstract and every reference
# abstract, project both sets to 2-D with UMAP, then score each
# (paper, reference) row with cosine similarity and Mahalanobis distance.
df = pd.read_pickle(dir_path + 'dataset_v2.pkl')

# Keep only rows where every field used below is present
df = df[['_id','ref_title','abstract','ref_abstract']].dropna().copy()
# One row per distinct text so that each text is encoded only once
df_a = df[['_id','abstract']].drop_duplicates()
df_b = df[['_id','ref_title','ref_abstract']].drop_duplicates()

# Set batch size (the later embedding cells read this variable as well)
batch_size = 1000  # Adjust based on your memory capacity

# NOTE(review): the multilingual-e5 model card asks for a "query: " or
# "passage: " prefix on every input text; none is added here — confirm that
# this is intentional before relying on the similarity scores.

# Encode the paper abstracts first, then the reference abstracts.
# dropna() above already removed missing texts, so no fillna is needed.
pooled_sets = []
for abstracts in (df_a['abstract'], df_b['ref_abstract']):
    embeddings = []
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        # .iloc makes the batch slice explicitly positional
        batch_abstracts = abstracts.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_abstracts, max_length=256, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Send input to GPU

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())  # Move embeddings back to CPU

    # Concatenate all batch embeddings of this text set
    pooled_sets.append(np.vstack(embeddings))

all_embeddings, all_ref_embeddings = pooled_sets

# Not used further in this cell; kept because it is a notebook-level variable
labels = np.array(['original'] * len(all_embeddings) + ['reference'] * len(all_ref_embeddings))

combined_embeddings = np.concatenate([all_embeddings, all_ref_embeddings])

# UMAP instance creation with n_neighbors and min_dist as optional parameters
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

# Apply UMAP on the combined embeddings.
# NOTE: the projection is UMAP, not t-SNE; the `tsne_*` names are kept because
# they are the column names written to the output pickle.
tsne_results = umap_reducer.fit_transform(combined_embeddings)

len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)

# Guard the positional concat below: one embedding per de-duplicated row
assert len_all == len(df_a), "paper embeddings are out of sync with df_a"
assert len_ref == len(df_b), "reference embeddings are out of sync with df_b"

tsne_all_embeddings = tsne_results[:len_all]
tsne_all_ref_embeddings = tsne_results[len_all:len_all + len_ref]

# NOTE: the `title_all_*` column names are historical; in this cell they hold
# the abstract embeddings (one Python list per row).
df_a_tsne = pd.concat(
    [
        df_a.reset_index(drop=True),
        pd.DataFrame(tsne_all_embeddings, columns=['tsne_1','tsne_2']),
        pd.DataFrame({'title_all_embeddings': [row.tolist() for row in all_embeddings]}),
    ],
    axis=1,
)

df_b_tsne = pd.concat(
    [
        df_b.reset_index(drop=True),
        pd.DataFrame(tsne_all_ref_embeddings, columns=['tsne_ref_1','tsne_ref_2']),
        pd.DataFrame({'title_all_ref_embeddings': [row.tolist() for row in all_ref_embeddings]}),
    ],
    axis=1,
)

# Join on exactly the columns the lookup frames were de-duplicated on. Joining
# on fewer keys would multiply rows (and pair the wrong embeddings) whenever
# an `_id` / `ref_title` occurs with more than one distinct text.
df_new = df.merge(
    df_a_tsne[['_id','abstract','tsne_1','tsne_2','title_all_embeddings']],
    on=['_id','abstract'], how='left', validate='many_to_one',
)
df_new = df_new.merge(
    df_b_tsne[['_id','ref_title','ref_abstract','tsne_ref_1','tsne_ref_2','title_all_ref_embeddings']],
    on=['_id','ref_title','ref_abstract'], how='left', validate='many_to_one',
)
assert len(df_new) == len(df), "merge changed the number of rows"

df_new['cosine_similarity'] = df_new.apply(
    lambda row: cosine_similarity(row['title_all_embeddings'], row['title_all_ref_embeddings']), axis=1
)

# Covariance is estimated over the embeddings of both sides of every row
all_vectors = np.vstack([df_new['title_all_embeddings'].tolist(), df_new['title_all_ref_embeddings'].tolist()])
cov_matrix = np.cov(all_vectors.T)
# The pseudo-inverse stays well defined when the covariance is singular or
# ill-conditioned, where a plain inverse raises or returns unusable values
inv_cov_matrix = np.linalg.pinv(cov_matrix)

df_new['mahalanobis_distance'] = df_new.apply(
    lambda row: mahalanobis_distance(row['title_all_embeddings'], row['title_all_ref_embeddings'], inv_cov_matrix), axis=1
)

# Persist only the ids, projections and scores; raw texts and vectors are dropped
df_new.drop(columns=['title_all_embeddings','title_all_ref_embeddings','abstract','ref_abstract']).to_pickle(dir_path + 'abstract_tsne_v2.pkl')

df_new.head()



Processing batches: 100%|██████████| 6/6 [00:58<00:00,  9.72s/it]
Processing batches: 100%|██████████| 169/169 [29:14<00:00, 10.38s/it]
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Unnamed: 0,_id,ref_title,abstract,ref_abstract,tsne_1,tsne_2,title_all_embeddings,tsne_ref_1,tsne_ref_2,title_all_ref_embeddings,cosine_similarity,mahalanobis_distance
0,53e99792b7602d9701f57e77,a comparison of the z e 8 and leech lattices ...,the history of the theory and practice of quan...,lattice vector quantization schemes offer high...,2.701674,-1.71012,"[0.26667261123657227, 0.011990290135145187, -0...",2.765877,-1.778382,"[-0.24986909329891205, -0.18758516013622284, -...",0.832934,45.273721
1,53e99792b7602d9701f57e77,a complexity reduction technique for image vec...,the history of the theory and practice of quan...,a technique for reducing the complexity of spa...,2.701674,-1.71012,"[0.26667261123657227, 0.011990290135145187, -0...",2.768089,-1.824385,"[-0.32262706756591797, -0.4273962378501892, -0...",0.822489,42.642719
2,53e99792b7602d9701f57e77,a deterministic annealing approach to clustering,the history of the theory and practice of quan...,,2.701674,-1.71012,"[0.26667261123657227, 0.011990290135145187, -0...",9.171832,15.627504,"[0.2525818645954132, -0.46831443905830383, -0....",0.74207,28.791375
3,53e99792b7602d9701f57e77,a direct proof of the coding theorem for discr...,the history of the theory and practice of quan...,in this paper we provide an alternate method o...,2.701674,-1.71012,"[0.26667261123657227, 0.011990290135145187, -0...",2.60622,-1.652712,"[0.3029211461544037, -0.571161687374115, -0.52...",0.798741,47.187264
4,53e99792b7602d9701f57e77,a fake process approach to data compression,the history of the theory and practice of quan...,the problem of designing a good decoder for a ...,2.701674,-1.71012,"[0.26667261123657227, 0.011990290135145187, -0...",2.676551,-1.768644,"[0.3296864628791809, -0.305050253868103, -0.85...",0.831787,45.805418


## title embedding

In [26]:
# Title-level pipeline: embed every paper title and every reference title,
# project both sets to 2-D with UMAP, then score each (paper, reference) row
# with cosine similarity and Mahalanobis distance.
# NOTE: `batch_size` is not defined in this cell; it comes from the
# abstract-embedding cell above.
df = pd.read_pickle(dir_path + 'dataset_v2.pkl')

# Keep only rows where every field used below is present
df = df[['_id','title','ref_title']].dropna().copy()
# One row per distinct text so that each text is encoded only once
df_a = df[['_id','title']].drop_duplicates()
df_b = df[['_id','ref_title']].drop_duplicates()

# NOTE(review): the multilingual-e5 model card asks for a "query: " or
# "passage: " prefix on every input text; none is added here — confirm that
# this is intentional before relying on the similarity scores.

# Encode the paper titles first, then the reference titles.
# dropna() above already removed missing texts, so no fillna is needed.
pooled_sets = []
for abstracts in (df_a['title'], df_b['ref_title']):
    embeddings = []
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        # .iloc makes the batch slice explicitly positional
        batch_abstracts = abstracts.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_abstracts, max_length=256, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Send input to GPU

        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())  # Move embeddings back to CPU

    # Concatenate all batch embeddings of this text set
    pooled_sets.append(np.vstack(embeddings))

all_embeddings, all_ref_embeddings = pooled_sets

# Not used further in this cell; kept because it is a notebook-level variable
labels = np.array(['original'] * len(all_embeddings) + ['reference'] * len(all_ref_embeddings))

combined_embeddings = np.concatenate([all_embeddings, all_ref_embeddings])

# UMAP instance creation with n_neighbors and min_dist as optional parameters
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

# Apply UMAP on the combined embeddings.
# NOTE: the projection is UMAP, not t-SNE; the `tsne_*` names are kept because
# they are the column names written to the output pickle.
tsne_results = umap_reducer.fit_transform(combined_embeddings)

len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)

# Guard the positional concat below: one embedding per de-duplicated row
assert len_all == len(df_a), "title embeddings are out of sync with df_a"
assert len_ref == len(df_b), "reference-title embeddings are out of sync with df_b"

tsne_all_embeddings = tsne_results[:len_all]
tsne_all_ref_embeddings = tsne_results[len_all:len_all + len_ref]

df_a_tsne = pd.concat(
    [
        df_a.reset_index(drop=True),
        pd.DataFrame(tsne_all_embeddings, columns=['tsne_1','tsne_2']),
        pd.DataFrame({'title_all_embeddings': [row.tolist() for row in all_embeddings]}),
    ],
    axis=1,
)

df_b_tsne = pd.concat(
    [
        df_b.reset_index(drop=True),
        pd.DataFrame(tsne_all_ref_embeddings, columns=['tsne_ref_1','tsne_ref_2']),
        pd.DataFrame({'title_all_ref_embeddings': [row.tolist() for row in all_ref_embeddings]}),
    ],
    axis=1,
)

# Join on exactly the columns the lookup frames were de-duplicated on. df_a is
# unique per (_id, title), so joining on `_id` alone would multiply rows for
# any id that occurs with more than one title.
df_new = df.merge(
    df_a_tsne[['_id','title','tsne_1','tsne_2','title_all_embeddings']],
    on=['_id','title'], how='left', validate='many_to_one',
)
df_new = df_new.merge(
    df_b_tsne[['_id','ref_title','tsne_ref_1','tsne_ref_2','title_all_ref_embeddings']],
    on=['_id','ref_title'], how='left', validate='many_to_one',
)
assert len(df_new) == len(df), "merge changed the number of rows"

df_new['cosine_similarity'] = df_new.apply(
    lambda row: cosine_similarity(row['title_all_embeddings'], row['title_all_ref_embeddings']), axis=1
)

# Covariance is estimated over the embeddings of both sides of every row
all_vectors = np.vstack([df_new['title_all_embeddings'].tolist(), df_new['title_all_ref_embeddings'].tolist()])
cov_matrix = np.cov(all_vectors.T)
# The pseudo-inverse stays well defined when the covariance is singular or
# ill-conditioned, where a plain inverse raises or returns unusable values
inv_cov_matrix = np.linalg.pinv(cov_matrix)

df_new['mahalanobis_distance'] = df_new.apply(
    lambda row: mahalanobis_distance(row['title_all_embeddings'], row['title_all_ref_embeddings'], inv_cov_matrix), axis=1
)

# Persist only the ids, reference titles, projections and scores
df_new.drop(columns=['title','title_all_embeddings','title_all_ref_embeddings']).to_pickle(dir_path + 'title_tsne_v2.pkl')


Processing batches: 100%|██████████| 6/6 [00:08<00:00,  1.42s/it]
Processing batches: 100%|██████████| 230/230 [08:32<00:00,  2.23s/it]
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


## context embedding

In [12]:
# Citation-context pipeline, step 1: embed the text before ("Pre Text") and
# after ("Post Text") each citation.
# `batch_size` is not defined in this cell; it is read from the notebook state.
df = pd.read_pickle(dir_path + 'dataset_v2.pkl')
context_columns = ['_id','title','ref_title','Pre Text','Post Text']
df = df[context_columns].dropna().copy()
df_a = df[['_id','ref_title','Pre Text']].drop_duplicates()
df_b = df[['_id','ref_title','Post Text']].drop_duplicates()

# Encode the Pre Text column first, then the Post Text column. After the outer
# loop `abstracts` and `embeddings` hold the Post Text inputs / batch outputs.
pooled_sets = []
for abstracts in (df_a['Pre Text'].fillna(''), df_b['Post Text'].fillna('')):
    embeddings = []
    for start in tqdm(range(0, len(abstracts), batch_size), desc="Processing batches"):
        text_batch = abstracts[start:start + batch_size].tolist()
        encoded = tokenizer(text_batch, max_length=256, padding=True, truncation=True, return_tensors='pt')
        # Move every tokenizer output onto the model's device
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            pooled = average_pool(hidden_states, encoded['attention_mask'])

        # Bring the pooled vectors back to host memory as a NumPy array
        embeddings.append(pooled.cpu().numpy())

    # Stack the per-batch arrays of this text column into one matrix
    pooled_sets.append(np.vstack(embeddings))

all_embeddings = pooled_sets[0]
all_ref_embeddings = pooled_sets[1]

n_original = len(all_embeddings)
n_reference = len(all_ref_embeddings)
labels = np.array(['original'] * n_original + ['reference'] * n_reference)

combined_embeddings = np.concatenate((all_embeddings, all_ref_embeddings), axis=0)

Processing batches: 100%|██████████| 223/223 [25:50<00:00,  6.95s/it]
Processing batches: 100%|██████████| 223/223 [24:36<00:00,  6.62s/it]


In [13]:


# Citation-context pipeline, step 2: project the context embeddings to 2-D and
# attach projections + raw vectors to the de-duplicated lookup frames.
# `all_embeddings`, `all_ref_embeddings` and `combined_embeddings` are used as
# left by the embedding cell; they are no longer rebuilt here from the leaked
# `embeddings` loop variable, which only holds whatever loop ran last.

# UMAP instance creation with n_neighbors and min_dist as optional parameters
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

# Apply UMAP on the combined embeddings.
# NOTE: the projection is UMAP, not t-SNE; the `tsne_*` names are kept because
# later cells and the saved pickle use them.
tsne_results = umap_reducer.fit_transform(combined_embeddings)

len_all = len(all_embeddings)
len_ref = len(all_ref_embeddings)

# The concat below is purely positional, so a stale embedding matrix would be
# misaligned silently (extra rows padded with NaN) — fail loudly instead.
assert len_all == len(df_a), "Pre Text embeddings are out of sync with df_a"
assert len_ref == len(df_b), "Post Text embeddings are out of sync with df_b"

tsne_all_embeddings = tsne_results[:len_all]
tsne_all_ref_embeddings = tsne_results[len_all:len_all + len_ref]

# NOTE: the `title_all_*` column names are historical; in this cell they hold
# the Pre Text / Post Text embeddings (one Python list per row).
temp1 = pd.DataFrame({'title_all_embeddings': [row.tolist() for row in all_embeddings]})
temp2 = pd.DataFrame({'title_all_ref_embeddings': [row.tolist() for row in all_ref_embeddings]})

temp = pd.DataFrame(tsne_all_embeddings, columns=['tsne_1','tsne_2'])
df_a_tsne = pd.concat([df_a.reset_index(drop=True), temp, temp1], axis=1)

temp = pd.DataFrame(tsne_all_ref_embeddings, columns=['tsne_ref_1','tsne_ref_2'])
df_b_tsne = pd.concat([df_b.reset_index(drop=True), temp, temp2], axis=1)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [16]:
# Citation-context pipeline, step 3: attach the Pre Text / Post Text
# projections and embeddings to every citation row of `df`.
# `df_a_tsne` and `df_b_tsne` are taken as built by the UMAP cell above.
#
# Both lookup frames hold one row per (_id, ref_title, text), so the text
# column has to be part of the join key. Joining df_a_tsne on `_id` alone
# pairs every citation row of a paper with every Pre Text embedding of that
# paper (a per-paper cartesian product), and joining df_b_tsne without
# 'Post Text' does the same for references cited more than once.
df_new = df.merge(
    df_a_tsne[['_id','ref_title','Pre Text','tsne_1','tsne_2','title_all_embeddings']],
    on=['_id','ref_title','Pre Text'], how='left', validate='many_to_one',
)
df_new = df_new.merge(
    df_b_tsne[['_id','ref_title','Post Text','tsne_ref_1','tsne_ref_2','title_all_ref_embeddings']],
    on=['_id','ref_title','Post Text'], how='left', validate='many_to_one',
)

# A many-to-one left join must keep exactly one output row per input row
assert len(df_new) == len(df), "merge changed the number of rows"


In [18]:
# Display the column labels of the merged frame
df_new.columns

Index(['_id', 'title', 'ref_title', 'Pre Text', 'Post Text', 'tsne_1',
       'tsne_2', 'title_all_embeddings', 'tsne_ref_1', 'tsne_ref_2',
       'title_all_ref_embeddings'],
      dtype='object')

In [19]:

# Score every row with the cosine similarity of its two stored embedding lists
row_similarity = df_new.apply(
    lambda row: cosine_similarity(
        row['title_all_embeddings'],
        row['title_all_ref_embeddings'],
    ),
    axis=1,
)
df_new['cosine_similarity'] = row_similarity

# Write the frame to disk without the title and the raw embedding vectors;
# `df_new` itself keeps all of its columns
columns_to_skip = ['title', 'title_all_embeddings', 'title_all_ref_embeddings']
export_frame = df_new.drop(columns=columns_to_skip)
export_frame.to_pickle(dir_path + 'context_tsne.pkl')

In [20]:
# Display the column labels of df_new after the similarity column was added
df_new.columns

Index(['_id', 'title', 'ref_title', 'Pre Text', 'Post Text', 'tsne_1',
       'tsne_2', 'title_all_embeddings', 'tsne_ref_1', 'tsne_ref_2',
       'title_all_ref_embeddings', 'cosine_similarity'],
      dtype='object')