In [1]:
import pickle
import json
from sqlalchemy import create_engine
import pandas as pd
import os
import time
import pickle

# define cache for sentencebert
# The variable is assigned here, ahead of the sentence_transformers import
# that follows, presumably so the library's model cache resolves to this
# directory — confirm which cache variable the installed version honours.
# NOTE(review): the value is a relative path (no leading '/') and contains a
# space before '.cache'; this looks like a typo for an absolute home
# directory path — verify before relying on it.
os.environ['XDG_CACHE_HOME'] = 'home/msds2022/plarosa/ .cache'

from sentence_transformers import SentenceTransformer, util

In [2]:
# Read the cleaned research-profile workbook, keep only the five columns used
# downstream, drop every row that is missing any of them, and renumber the
# index from 0 so row positions line up with the embeddings computed later.
df = (
    pd.read_excel('../data/cleaned/research_profile_updated.xlsx')
    .loc[:, ['Research Title', 'Author', 'Abstract',
             'University (Abbreviation)', 'Campus']]
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Write the cleaned metadata frame to CSV without the index column.
# get_most_similar_research reads this same file back as its lookup table.
df.to_csv('author_sim_db.csv', index=False)

In [4]:
# load sentence transformer model
# Alternative checkpoint, currently disabled:
# model = SentenceTransformer('allenai/scibert_scivocab_uncased')
model = SentenceTransformer('allenai/specter')
# Presumably caps the number of tokens the model reads per input text —
# confirm against the sentence-transformers documentation.
model.max_seq_length = 512



In [74]:
# Embed every abstract with the pretrained model and report the wall-clock
# time the encoding pass took.
start_time = time.time()
embeddings = model.encode(df['Abstract'].tolist(), convert_to_tensor=True)
end_time = time.time()
print("Time for computing embeddings:", end_time - start_time)

# Move the tensor to host memory and convert it to a NumPy array; row i
# corresponds to row i of df as it was when encoded.
abstract_database = embeddings.cpu().numpy()

Time for computing embeddings: 6.022053241729736


In [78]:
# Persist the abstract embedding array with pickle.
# get_most_similar_research loads this file as the database it compares
# query embeddings against.
with open('abs_embed_db.pickle', 'wb') as f:
    pickle.dump(abstract_database, f)


## Evaluation

In [10]:
# Number of nearest matches to request in the evaluation call below.
n_sim = 5
# Query abstract used for the evaluation (a paper on congestion in the
# Singapore rapid transit system).
abs_text = "Investigating congestion in train rapid transit systems (RTS) in today's urban cities is a challenge compounded by limited data availability and difficulties in model validation. Here, we integrate information from travel smart card data, a mathematical model of route choice, and a full-scale agent-based model of the Singapore RTS to provide a more comprehensive understanding of the congestion dynamics than can be obtained through analytical modelling alone. Our model is empirically validated, and allows for close inspection of congestion and scaling dynamics. By adjusting our model, we can estimate the effective capacity of the RTS trains as well as replicate the penultimate station effect, where commuters travel backwards to the preceding station to catch a seat, sacrificing time for comfort. Using current data, the crowdedness in all 121 stations appears to be distributed log-normally. We find that increasing the current population (2 million) beyond a factor of approximately 10% leads to an exponential deterioration in service quality. We also show that incentivizing commuters to avoid the most congested hours can bring modest improvements to the service quality. Finally, our model can be used to generate simulated data for statistical analysis when such data are not empirically available, as is often the case."
def get_most_similar_research(abs_text, n_sim=1, *,
                              encoder_path='models/specter.pickle',
                              author_db_path='author_sim_db.csv',
                              embed_db_path='abs_embed_db.pickle'):
    """Find the stored research entries most similar to a query abstract.

    The query is encoded with the pickled encoder, compared against every
    stored abstract embedding using cosine similarity, and the best-scoring
    rows of the metadata table are returned.

    Parameters
    ----------
    abs_text : str
        Abstract text to search with. Only the first row of the similarity
        matrix is used, so a single query is expected.
    n_sim : int or None, default 1
        Number of matches to return, ordered from most to least similar.
        ``0`` yields an empty frame and ``None`` yields every row ranked.
    encoder_path : str, keyword-only
        Pickle file holding an object with an ``encode`` method
        (presumably a SentenceTransformer; this notebook does not show the
        file being written — confirm how it is produced).
    author_db_path : str, keyword-only
        CSV file with the columns 'Research Title', 'Author',
        'University (Abbreviation)' and 'Campus'.
    embed_db_path : str, keyword-only
        Pickle file holding the abstract embeddings, one row per CSV row,
        in the same order.

    Returns
    -------
    dict or pandas.DataFrame
        When ``n_sim == 1``, a dict with keys 'most_sim_auth',
        'most_sim_res' and 'most_sim_school' (formatted as
        ``"<university> (<campus>)"``). Otherwise the top ``n_sim`` rows of
        the metadata table.

    Raises
    ------
    ValueError
        If ``n_sim`` is negative, or if the metadata table and the
        embedding database do not have the same number of rows.
    """
    # A negative count would be interpreted as a "drop the last k" slice and
    # silently return almost the whole table, so reject it explicitly.
    if n_sim is not None and n_sim < 0:
        raise ValueError(f"n_sim must be non-negative, got {n_sim}")

    # NOTE(security): pickle.load can execute arbitrary code; only point the
    # two pickle paths at files produced by a trusted run of this notebook.
    with open(encoder_path, 'rb') as f:
        encoder = pickle.load(f)

    author_sim_db = pd.read_csv(author_db_path)

    with open(embed_db_path, 'rb') as f:
        abs_embed_db = pickle.load(f)

    # The CSV and the embeddings are written by separate cells; if they drift
    # apart, positional lookup would return the wrong rows or fail obscurely.
    if len(author_sim_db) != len(abs_embed_db):
        raise ValueError(
            f"{author_db_path} has {len(author_sim_db)} rows but "
            f"{embed_db_path} has {len(abs_embed_db)} embeddings; "
            "regenerate both from the same data"
        )

    abs_emb = encoder.encode(abs_text)
    scores = util.cos_sim(abs_emb, abs_embed_db).numpy()
    # Stable ascending sort, reversed, so the highest similarity comes first.
    arg_scores = scores[0].argsort(kind='mergesort')[::-1]
    top_n_sim = author_sim_db.iloc[arg_scores[:n_sim], :]

    if n_sim == 1:
        best = top_n_sim.iloc[0]
        school = f"{best['University (Abbreviation)']} ({best['Campus']})"
        return {'most_sim_auth': best['Author'],
                'most_sim_res': best['Research Title'],
                'most_sim_school': school}
    return top_n_sim
    
# Run the search for the query abstract; with n_sim set to 5 above, the
# function returns a DataFrame of the top matches rather than a dict.
out = get_most_similar_research(abs_text, n_sim=n_sim)

In [9]:
# Display the first (highest-ranked) row of the result frame.
# NOTE(review): this cell's execution count is lower than that of the cell
# which defines `out`, so the saved output may come from an earlier run.
out.iloc[0]

Research Title               Lumáwig: An Efficient Algorithm for Dimension...
Author                       Ignacio, Paul Samuel;Bulauan, Jay-Anne;Uminsky...
Abstract                     Stability of persistence diagrams under slight...
University (Abbreviation)                                            UP-Baguio
Campus                                                                  Baguio
Name: 631, dtype: object

## Metrics 

In [90]:
# Use the abstract at index label 0 of the dataset as the query and encode it
# to a NumPy vector.
sample = df['Abstract'][0]
sample_emb = model.encode(sample, convert_to_tensor=True).cpu().numpy()
# Score the query against every stored abstract embedding with three
# sentence-transformers helpers. The sample was drawn from the same column
# that abstract_database was built from, so its own row is among the
# candidates (assuming df was not changed between cells).
cos_score = util.cos_sim(sample_emb, abstract_database)
dot_score = util.dot_score(sample_emb, abstract_database)
semantic_search = util.semantic_search(sample_emb, abstract_database)

In [98]:
# Attach the sample query's scores to df as new columns (this mutates df in
# place). [0] selects the first row of each score matrix — presumably the
# only row, since a single query was encoded; confirm.
df['score'] = cos_score[0].cpu().numpy()
df['dot_score'] = dot_score[0].cpu().numpy()