In [None]:
# ! gcloud init

In [33]:
! gsutil -m cp papers_set_with_restored_abstract.csv gs://noobs-ml/sem-search/

Copying file://papers_set_with_restored_abstract.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

|
Operation completed over 1 objects/1.4 GiB.                                      


In [None]:
# ! pip install sentence-transformers

In [None]:
import pandas as pd
import os
import gc
import plotly.express as px
import ast
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.spatial
import json
import re
import numpy as np

In [11]:
full_df = pd.read_csv("/content/all_relevant_papers.csv")


Columns (1,6) have mixed types.Specify dtype option on import or set low_memory=False.



In [None]:
all_paper_ids = set(full_df.id)

In [None]:
len_full = len(full_df)
len_with_abstract = len(full_df.dropna(subset=["indexed_abstract"]))
len_with_refs = len(full_df.dropna(subset=["references"]))
len_with_refs_and_abstract = len(full_df.dropna(subset=["references", "indexed_abstract"]))

In [None]:
stats = pd.DataFrame([{"type": "full subset", "len": len_full}, 
                      {"type": "with abstract", "len": len_with_abstract},
                      {"type": "with references", "len": len_with_refs},
                      {"type": "with refs and abstract", "len": len_with_refs_and_abstract}])

In [15]:
stats

Unnamed: 0,type,len
0,full subset,300505
1,with abstract,251140
2,with references,221967
3,with refs and abstract,205340


In [16]:
fig = px.bar(stats, x="len", y="type", color="type", orientation="h", 
             text=["300505 papers  ", "251140 papers  ", "221967 papers  ", "205340 papers  "],
             title="Number of papers in the used data subset that do not miss an abstract or references.")
fig.layout.update(showlegend=False)
fig.show()

In [None]:
authors = pd.read_csv("top_100_authors.csv")

In [None]:
authors.drop(["Unnamed: 0", "Unnamed: 0.1"], inplace=True, axis=1)

In [None]:
one_author = authors.sample()

In [None]:
def get_pub_ids_of_author_by_row(author_row, only_first_author=False):
    one_pubs = ast.literal_eval(one_author.pubs.iloc[0])
    if only_first_author:
        return [v["i"] for v in one_pubs if v['r'] == 0]
    else:
        return [v["i"] for v in one_pubs]


def get_pub_ids_of_author_by_id(author_id, authors_df=full_df, only_first_author=False):
    one_pubs = ast.literal_eval(authors_df[authors_df.id == int(author_id)].pubs.iloc[0])
    if only_first_author:
        return [v["i"] for v in one_pubs if v['r'] == 0]
    else:
        return [v["i"] for v in one_pubs]

In [None]:
ids = get_pub_ids_of_author_by_row(one_author)

In [26]:
sum([not full_df[full_df.id == int(id_)].title.empty for id_ in ids])

372

In [27]:
len(ids)

418

In [None]:
def reconstruct_abstract(row):
    try:
        ia = ast.literal_eval(row.indexed_abstract)["InvertedIndex"]
        greatest_index = max([v for sublist in ia.values() for v in sublist])
        original = [0]*(greatest_index+1)
        for key, value in ia.items():
            for vi in value:
                original[vi] = key
        original = [o for o in original if type(o) == str]
        return ' '.join(original)
    except:
        return ''

In [None]:
full_df["abstract"] = full_df.apply(lambda x: reconstruct_abstract(x), axis=1)

In [None]:
# full_df.to_csv("papers_set_with_restored_abstract.csv")

In [35]:
embedder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')

100%|██████████| 459M/459M [00:48<00:00, 9.37MB/s]


In [None]:
def get_average_sentences_embedding(sentences, model=embedder):
    embeddings = model.encode(sentences)
    return np.mean(embeddings, axis=0)

def calculate_average_embedding_abstract_and_title(abstract, title, strategy="merge", model=embedder):

    title_embedding = model.encode(title)
    
    if strategy == "merge":
        abstract_embeddings = model.encode(abstract)
        merged = np.concatenate((title_embedding, abstract_embeddings), axis=0)
        average = np.mean(merged, axis=0)
    elif strategy == "separate":
        average_abstract = get_average_sentences_embedding(abstract)
        average = np.mean([average_abstract, title_embedding[0]], axis=0)
    else:
        print("Warning: wrong strategy is used. Use either 'merge' or 'separate'. Proceeding using the 'merge' strategy.")
        abstract_embeddings = model.encode(abstract)
        merged = np.concatenate((title_embedding, abstract_embeddings), axis=0)
        average = np.mean(merged, axis=0)
        
    return average

In [None]:
class Paper:
    def __init__(self, paper_id, papers_df=full_df, all_ids=all_paper_ids):
        self.paper_id = int(paper_id)
        try:
            self.row = papers_df[papers_df.id == self.paper_id]
            self.abstract = self.row.abstract.iloc[0]
            self.title = self.row.title.iloc[0]
            self.all_ids = all_ids
        except:
            return None
        self.data = papers_df
        self.merged_bert_embedding = None
        self.separate_bert_embedding = None
        self.tfidf_embedding = None

    def get_neighbours(self):
        # Return only neighbours that are actually in our dataset.

        refs = self.row.references.iloc[0]
        refs_present_in_data = [r for r in ast.literal_eval(refs) if int(r) in self.all_ids]
        return refs_present_in_data

    def get_sbert_embedding(self, sbert_strategy="merge"):
        if self.abstract:
            if self.title:
                # ... get embedding based on abstract and title
                emb = calculate_average_embedding_abstract_and_title(self.abstract,
                                                                      self.title,
                                                                      sbert_strategy)
                if sbert_strategy == "merge":
                    self.merged_bert_embedding = emb
                else:
                    self.separate_bert_embedding = emb
                return emb
            else:
                # ... get embedding just based on abstract
                # Feed preprocessed clean sentences into this. For now, just use split() on text. 

                emb = get_average_sentences_embedding(self.abstract.split())
                if sbert_strategy == "merge":
                    self.merged_bert_embedding = emb
                else:
                    self.separate_bert_embedding = emb
                return emb
        else:
            if self.title:
                # ... get embedding based on just title
                emb = get_average_sentences_embedding([self.title])
                if sbert_strategy == "merge":
                    self.merged_bert_embedding = emb
                else:
                    self.separate_bert_embedding = emb
                return emb
            else:
                # ... no embedding possible, ignore the paper
                return None

    def get_tfidf_embedding(self, tfidf_strategy="merge"):
        pass

    def get_retrofitted_sbert_embedding(self, sbert_strategy="merge"):
        pass



In [None]:
one_paper = Paper('1885181558')

In [None]:
one_paper.get_neighbours();

In [None]:
one_paper.get_sbert_embedding("separate");