In [11]:
import numpy as np
import os
import glob
import json
import pandas as pd
from pprint import pprint

In [2]:
# Root directory that holds the paper chunk folders and the embedding folders;
# used as the default `main_path` argument of load_all_paper_data().
main_path = "../zetaobjects_v1/"

In [3]:
def load_paper(paper_id, papers_df):
    """Select the row(s) of ``papers_df`` whose ``za_id`` equals ``paper_id``.

    The result is a DataFrame, not a single record: it is empty when no paper
    matches and has several rows when the id occurs more than once.
    """
    id_matches = papers_df["za_id"] == paper_id
    return papers_df.loc[id_matches]

In [4]:
def load_chunk_papers(chunk_path):
    """List every entry of ``chunk_path`` as a ``"<chunk_path>/<entry>"`` string.

    Entries come back in whatever order ``os.listdir`` yields them; no
    filtering is applied.
    """
    sub_chunks = []
    for entry_name in os.listdir(chunk_path):
        sub_chunks.append("{}/{}".format(chunk_path, entry_name))
    return sub_chunks

In [5]:
def load_subchunk_info(subchunk_path):
    """Read the JSON document stored in ``<subchunk_path>/main``.

    Returns the parsed JSON content, or an empty dict when the ``main`` file
    does not exist. Invalid JSON is not handled here and propagates.
    """
    info_path = "{}/main".format(subchunk_path)
    try:
        handle = open(info_path, 'r')
    except FileNotFoundError:
        # A paper folder without a "main" file simply carries no info.
        return {}
    with handle:
        return json.load(handle)

In [13]:
def load_subchunk_representations(subchunk_path):
    """Collect the locations of all representations stored for one paper.

    Looks inside ``<subchunk_path>/representations`` for the sentence file
    (``text_sentences*``) and for five embedding descriptor files
    (``document_embedding*``, ``sbert_embedding_merged``,
    ``sbert_embedding_separate``, ``tfidf_vector_merged``,
    ``tfidf_vector_separate``). Each descriptor is a JSON document; its
    ``["value"]["pointer"]`` string is turned into a relative path by dropping
    the first six characters and prefixing ``"../"``.

    Parameters
    ----------
    subchunk_path : str
        Directory of a single paper.

    Returns
    -------
    dict
        Six entries (``fulltext_sentences_dir``, ``za_scibert_embedding_dir``,
        ``mb_sbert_embedding_merged_dir``, ``mb_sbert_embedding_separate_dir``,
        ``mb_tfidf_embedding_merged_dir``, ``mb_tfidf_embedding_separate_dir``).
        A representation that is not present on disk is reported as the
        string ``"None"``.

    Raises
    ------
    json.JSONDecodeError, KeyError
        If a descriptor file exists but is malformed; such files are not
        silently skipped.
    """
    representations_dir = f"{subchunk_path}/representations"

    def _first_match(prefix):
        # First file whose name starts with `prefix`, or None when glob finds nothing.
        matches = glob.glob(f"{representations_dir}/{prefix}*")
        return matches[0] if matches else None

    def _pointer_target(descriptor_path):
        # Resolve a descriptor file to the relative path it points at.
        if descriptor_path is None:
            return "None"
        try:
            with open(descriptor_path, 'r') as handle:
                pointer = json.load(handle)["value"]["pointer"]
        except FileNotFoundError:
            # Fixed-name descriptors are opened directly, so a missing file
            # surfaces here (open() never raises IndexError).
            return "None"
        return "../" + pointer[6:]

    sentences_path = _first_match("text_sentences")

    return {
        "fulltext_sentences_dir": sentences_path if sentences_path is not None else "None",
        "za_scibert_embedding_dir": _pointer_target(_first_match("document_embedding")),
        "mb_sbert_embedding_merged_dir": _pointer_target(f"{representations_dir}/sbert_embedding_merged"),
        "mb_sbert_embedding_separate_dir": _pointer_target(f"{representations_dir}/sbert_embedding_separate"),
        "mb_tfidf_embedding_merged_dir": _pointer_target(f"{representations_dir}/tfidf_vector_merged"),
        "mb_tfidf_embedding_separate_dir": _pointer_target(f"{representations_dir}/tfidf_vector_separate"),
    }

In [14]:
def load_sentences(sentences_path):
    """Return the ``"value"`` entry of the JSON file at ``sentences_path``.

    No further cleaning is applied to the loaded value.
    """
    with open(sentences_path, 'r') as handle:
        payload = json.load(handle)
    # TODO: more preprocessing of the sentences is still to be added here.
    return payload["value"]

In [17]:
def load_all_paper_data(main_path=main_path):
    """Walk the dataset root and assemble one DataFrame row per paper.

    Every directory directly under ``main_path`` is treated as a chunk, except
    the folders that hold embedding payloads. Each entry of a chunk is a paper
    ("subchunk") whose ``main`` info file and representation descriptors are
    read through ``load_subchunk_info`` and ``load_subchunk_representations``.

    Parameters
    ----------
    main_path : str
        Dataset root. Defaults to the module-level ``main_path``. A trailing
        slash is optional.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the metadata columns ``title``, ``abstract``,
        ``date``, ``authors``, ``format``, ``subjects``, ``za_id``, ``uri``
        and six representation path columns. A field that cannot be found
        holds the string ``"None"``.
    """
    # Top-level folders that store embedding payloads rather than paper chunks.
    non_chunk_dirs = {
        "embeddings",
        "sbert_embedings_abstract_title_merged",
        "sbert_embedings_abstract_title_separate",
        "tfidf_vector_abstract_title_merged",
        "tfidf_vector_abstract_title_separate",
    }

    def try_retrieve(source, value1, value2=None):
        """Return ``source[value1]`` (or ``source[value1][value2]``), "None" if absent."""
        try:
            # Read from the `source` argument itself, not from an enclosing variable.
            found = source[value1]
            if value2 is not None:
                found = found[value2]
        except (KeyError, TypeError):
            # TypeError covers an intermediate value that is not a mapping
            # (e.g. a null "metadata" entry).
            return "None"
        return found

    all_subchunks = []
    for chunk in os.listdir(main_path):
        if chunk in non_chunk_dirs:
            continue
        # os.path.join makes the trailing slash on main_path optional.
        chunk_path = os.path.join(main_path, chunk)
        if not os.path.isdir(chunk_path):
            # Stray files in the root are not chunks and cannot be listed.
            continue
        all_subchunks.extend(load_chunk_papers(chunk_path))

    all_papers_info = []
    for subchunk in all_subchunks:
        info = load_subchunk_info(subchunk)
        representations = load_subchunk_representations(subchunk)
        all_papers_info.append({
            "title": try_retrieve(info, "metadata", "DCMI.title"),
            "abstract": try_retrieve(info, "metadata", "DCMI.abstract"),
            "date": try_retrieve(info, "metadata", "DCMI.created"),
            "authors": try_retrieve(info, "metadata", "DCMI.creator"),
            "format": try_retrieve(info, "metadata", "DCMI.format"),
            "subjects": try_retrieve(info, "metadata", "DCMI.subject"),
            "za_id": try_retrieve(info, "guid"),
            "uri": try_retrieve(info, "uri"),
            "full_sentences_path": representations["fulltext_sentences_dir"],
            "full_scibert_embedding_path": representations["za_scibert_embedding_dir"],
            "full_sbert_embedings_abstract_title_merged_path": representations["mb_sbert_embedding_merged_dir"],
            "full_sbert_embedings_abstract_title_separate_path": representations["mb_sbert_embedding_separate_dir"],
            "tfidf_embeddings_abstract_title_merged_path": representations["mb_tfidf_embedding_merged_dir"],
            "tfidf_embeddings_abstract_title_separate_path": representations["mb_tfidf_embedding_separate_dir"],
        })

    return pd.DataFrame(all_papers_info)

In [26]:
# Build the paper table from the default dataset root (reads every paper folder on disk).
all_papers = load_all_paper_data()

In [28]:
# Number of rows in the paper table (one row per paper folder).
len(all_papers)

468

In [48]:
# Spot-check a single, arbitrarily chosen row (position 328): show its merged
# title+abstract SBERT embedding path.
all_papers.iloc[328].full_sbert_embedings_abstract_title_merged_path

'../zetaobjects_v1/sbert_embedings_abstract_title_merged/2484/2484d98c96ce21595ff0e664e0ac5e993b17ac71/embedding.npy'

In [50]:
# all_papers.to_pickle("../Pickles/all_papers_with_added_sbert_embeddings.pkl")

In [16]:
# NOTE(review): `vec1` is only assigned in a later cell (execution count 73), so
# this cell fails on a fresh kernel run top to bottom. The draw is unseeded, and
# `a` is not used by any other visible cell.
a = np.random.lognormal(sigma=1.05, size=vec1.shape)

In [17]:
import scipy.spatial

In [18]:
# This is a handy function! It also accepts lists of vectors.
# cdist(..., "cosine") returns cosine *distances*, so `1 - ...` is the cosine
# *similarity* of vec1 and vec2 (despite the variable name). `[0]` takes the
# first row of the 1x1 result matrix.
# NOTE(review): `vec1` / `vec2` are assigned in a later cell (execution count 73).
distances = 1 - scipy.spatial.distance.cdist([vec1], [vec2], "cosine")[0]

In [19]:
# Display the value computed in the previous cell.
distances

array([0.9564087])

# Functions for loading an embedding and its corresponding abstract / title for visualisation purposes.

### We want functions that load a single paper given its path, and also a way to load all papers of a given first author, so that papers can be visualised within an author.

In [30]:
# Reload the paper table from a cached pickle. This rebinds `all_papers`, so the
# frame built by load_all_paper_data() is no longer reachable under this name.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
all_papers = pd.read_pickle("../Pickles/all_papers_with_tfidf.pkl")

In [37]:
# List the columns of the reloaded table.
all_papers.columns

Index(['title', 'abstract', 'date', 'authors', 'format', 'subjects', 'za_id',
       'uri', 'full_sentences_path', 'full_scibert_embedding_path',
       'full_sbert_embedings_abstract_title_merged_path',
       'full_sbert_embedings_abstract_title_separate_path', 'abstract_sents',
       'abstract_and_title_sents', 'first_author',
       'tfidf_embeddings_abstract_title_merged_path',
       'tfidf_embeddings_abstract_title_separate_path'],
      dtype='object')

In [31]:
def load_za_paper_embedding(embedding_path):
    """Load one stored embedding with ``np.load``; return ``None`` if unavailable.

    Parameters
    ----------
    embedding_path : str or None
        Location of the ``.npy`` file. Missing-value markers (``None``, ``NaN``
        or the string ``"None"``) are accepted and yield ``None``.

    Returns
    -------
    numpy.ndarray or None
        The loaded array, or ``None`` when there is no path or no file at it.

    Notes
    -----
    Some stored paths look like
    ``'../zetaobjects_v1/sbert_embedings_abstract_title_merged/None/None/embedding.npy'``.
    They point at no file and therefore also yield ``None``; ideally they are
    fixed upstream.
    """
    # An empty table cell is not a path: np.load would raise TypeError on
    # None/NaN instead of FileNotFoundError, so handle those markers first.
    if embedding_path is None:
        return None
    if isinstance(embedding_path, float) and np.isnan(embedding_path):
        return None
    if isinstance(embedding_path, str) and embedding_path == "None":
        return None

    try:
        return np.load(embedding_path)
    except FileNotFoundError:
        return None

In [73]:
# Load the merged title+abstract TF-IDF vectors of two arbitrarily chosen papers
# (row positions 350 and 54). Either value is None when its file is missing.
vec1 = load_za_paper_embedding(all_papers.iloc[350].tfidf_embeddings_abstract_title_merged_path)
vec2 = load_za_paper_embedding(all_papers.iloc[54].tfidf_embeddings_abstract_title_merged_path)

In [75]:
# vecs = np.matrix([vec1, vec2])
# corr_matrix = ((vecs * vecs.T).A)
# corr_matrix

In [39]:
def load_for_main_author(author, data):
    """Placeholder for loading all papers of one first author.

    Not implemented yet: both arguments are ignored and the call always
    returns ``None``.
    """
    return None

In [76]:
# One (subjects, title, abstract, embedding) tuple per paper; the embedding is
# None when its file could not be loaded.
all_merged_paper_vectors_for_tsne = [
    (subjects, title, abstract, load_za_paper_embedding(embedding_path))
    for subjects, title, abstract, embedding_path in zip(
        all_papers["subjects"],
        all_papers["title"],
        all_papers["abstract"],
        all_papers["tfidf_embeddings_abstract_title_merged_path"],
    )
]

In [77]:
# Turn the list of tuples into a table; "features" holds the loaded embedding (or None).
all_merged_paper_vectors_for_tsne_df = pd.DataFrame(all_merged_paper_vectors_for_tsne, columns=["subjects", "title", "abstract", "features"])

In [78]:
# Drop every row that has a missing value in any column (e.g. an embedding that failed to load).
all_merged_paper_vectors_for_tsne_df = all_merged_paper_vectors_for_tsne_df.dropna()

In [79]:
# Expand the "features" column into its own table: one row per paper, one
# column per vector component, with a fresh 0..n-1 row index.
feature_rows = all_merged_paper_vectors_for_tsne_df["features"].tolist()
all_merged_paper_vectors_for_tsne_df_features = pd.DataFrame(feature_rows)

In [90]:
# Export the feature matrix as a tab-separated file without the row index.
all_merged_paper_vectors_for_tsne_df_features.to_csv("../Viz/Data/subset_features_tfidf_merged.csv", sep="\t", index=False)

In [17]:
# pd.DataFrame([" | ".join(value) for value in all_merged_paper_vectors_for_tsne_df[0].values.tolist()] , columns=["title", "abstract"]).to_csv("subset_subject_title_abstract_sbert_merged.csv", sep="\t", index=False)

In [20]:
# all_merged_paper_vectors_for_tsne_df["subjects"] = [" | ".join(value) for value in all_merged_paper_vectors_for_tsne_df["subjects"].values.tolist()]

In [91]:
# Export the matching titles (same row order as the feature table) as a
# tab-separated file without the row index.
all_merged_paper_vectors_for_tsne_df[["title"]].to_csv("../Viz/Data/subset_title_tfidf_merged.csv", sep="\t", index=False)

In [86]:
# all_merged_paper_vectors_for_tsne_df[["features"]].to_csv("../Viz/Data/subset_features_tfidf_merged.csv", sep="\t", index=False)

In [88]:
# Sanity check: number of rows left in the filtered table.
len(all_merged_paper_vectors_for_tsne_df[["abstract"]])

454

In [102]:
# Display the paper table.
# NOTE(review): this renders the whole frame; `all_papers.head()` would keep the output short.
all_papers

Unnamed: 0,title,abstract,date,authors,format,subjects,za_id,uri,full_sentences_path,full_scibert_embedding_path,full_sbert_embedings_abstract_title_merged_path,full_sbert_embedings_abstract_title_separate_path,abstract_sents,abstract_and_title_sents,first_author,tfidf_embeddings_abstract_title_merged_path,tfidf_embeddings_abstract_title_separate_path
0,Exploiting Uncertainty of Loss Landscape for S...,We introduce novel variants of momentum by i...,2019-05-30T00:00:00+00:00,"[{'first_name': 'Vineeth S.', 'last_name': 'Bh...",scientific paper,"[cs.LG, math.OC, stat.ML]",24cf70e19fb2a3aebf0ee008ec2f47a86ca5a219,https://arxiv.org/abs/1905.13200,../zetaobjects_v1/24cf/24cf70e19fb2a3aebf0ee00...,../zetaobjects_v1/embeddings/24cf/24cf70e19fb2...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[We introduce novel variants of momentum by in...,[Exploiting Uncertainty of Loss Landscape for ...,"{'first_name': 'Vineeth S.', 'last_name': 'Bha...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
1,Understanding Semantics from Speech Through Pr...,End-to-end Spoken Language Understanding (SL...,2019-09-24T00:00:00+00:00,"[{'first_name': 'Pengwei', 'last_name': 'Wang'...",scientific paper,"[eess.AS, cs.CL, cs.LG]",24c61c200a3b0438770dc7209b7b4ed25c5636d2,https://arxiv.org/abs/1909.10924,../zetaobjects_v1/24c6/24c61c200a3b0438770dc72...,../zetaobjects_v1/embeddings/24c6/24c61c200a3b...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[End-to-end Spoken Language Understanding (SLU...,[Understanding Semantics from Speech Through P...,"{'first_name': 'Pengwei', 'last_name': 'Wang',...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
2,Baselines and a datasheet for the Cerema AWP d...,This paper presents the recently published C...,2018-06-11T00:00:00+00:00,"[{'first_name': 'Ismaïla', 'last_name': 'Seck'...",scientific paper,"[cs.LG, stat.ML]",2492af87a95d96e1d6a7080765438e63c27bc7a3,https://arxiv.org/abs/1806.04016,../zetaobjects_v1/2492/2492af87a95d96e1d6a7080...,../zetaobjects_v1/embeddings/2492/2492af87a95d...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[This paper presents the recently published Ce...,[Baselines and a datasheet for the Cerema AWP ...,"{'first_name': 'Ismaïla', 'last_name': 'Seck',...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
3,On the Use of Sparse Filtering for Covariate S...,In this paper we formally analyse the use of...,2016-07-22T00:00:00+00:00,"[{'first_name': 'Fabio Massimo', 'last_name': ...",scientific paper,"[cs.LG, stat.ML]",24b9fdcca34f3737aca7e5c37e9afab0570b577f,https://arxiv.org/abs/1607.06781,../zetaobjects_v1/24b9/24b9fdcca34f3737aca7e5c...,../zetaobjects_v1/embeddings/24b9/24b9fdcca34f...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[In this paper we formally analyse the use of ...,[On the Use of Sparse Filtering for Covariate ...,"{'first_name': 'Fabio Massimo', 'last_name': '...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
4,Finding Bottlenecks: Predicting Student Attrit...,With pressure to increase graduation rates a...,2017-05-07T00:00:00+00:00,"[{'first_name': 'Seyed', 'last_name': 'Sajjadi...",scientific paper,"[stat.ML, cs.AI, cs.CY, cs.LG, stat.AP]",24b9dcb7d725224375e61dcb10f19032b59e4dfd,https://arxiv.org/abs/1705.02687,../zetaobjects_v1/24b9/24b9dcb7d725224375e61dc...,../zetaobjects_v1/embeddings/24b9/24b9dcb7d725...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[With pressure to increase graduation rates an...,[Finding Bottlenecks: Predicting Student Attri...,"{'first_name': 'Seyed', 'last_name': 'Sajjadi'...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,Systematic and multifactor risk models revisited,Systematic and multifactor risk models are r...,2013-12-18T00:00:00+00:00,"[{'first_name': 'Michel', 'last_name': 'Fliess...",scientific paper,"[q-fin.RM, cs.CE, math.LO, q-fin.CP, stat.ML]",240f9c095d834a05dcdba5b47e134f078110cf54,https://arxiv.org/abs/1312.5271,../zetaobjects_v1/240f/240f9c095d834a05dcdba5b...,../zetaobjects_v1/embeddings/240f/240f9c095d83...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[Systematic and multifactor risk models are re...,[Systematic and multifactor risk models revisi...,"{'first_name': 'Michel', 'last_name': 'Fliess'...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
451,When are Overcomplete Topic Models Identifiabl...,Overcomplete latent representations have bee...,2013-08-13T00:00:00+00:00,"[{'first_name': 'Animashree', 'last_name': 'An...",scientific paper,"[cs.LG, cs.IR, math.NA, math.ST, stat.ML, stat...",240fcff8d68ddcafef9430ec470e9ed77b83a029,https://arxiv.org/abs/1308.2853,../zetaobjects_v1/240f/240fcff8d68ddcafef9430e...,../zetaobjects_v1/embeddings/240f/240fcff8d68d...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[Overcomplete latent representations have been...,[When are Overcomplete Topic Models Identifiab...,"{'first_name': 'Animashree', 'last_name': 'Ana...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
452,Memorizing All for Implicit Discourse Relation...,Implicit discourse relation recognition is a...,2019-08-29T00:00:00+00:00,"[{'first_name': 'Hongxiao', 'last_name': 'Bai'...",scientific paper,"[cs.CL, cs.AI, cs.LG]",249472a243eaab19789f2c63c5485a7c36632b1a,https://arxiv.org/abs/1908.11317,../zetaobjects_v1/2494/249472a243eaab19789f2c6...,../zetaobjects_v1/embeddings/2494/249472a243ea...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[Implicit discourse relation recognition is a ...,[Memorizing All for Implicit Discourse Relatio...,"{'first_name': 'Hongxiao', 'last_name': 'Bai',...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
453,Learning Robust Subspace Clustering,We propose a low-rank transformation-learnin...,2013-08-01T00:00:00+00:00,"[{'first_name': 'Qiang', 'last_name': 'Qiu', '...",scientific paper,[cs.CV],2431065b6b52d8b9b9887448515c4b23ccd7d799,https://arxiv.org/abs/1308.0273,../zetaobjects_v1/2431/2431065b6b52d8b9b988744...,../zetaobjects_v1/embeddings/2431/2431065b6b52...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[We propose a low-rank transformation-learning...,"[Learning Robust Subspace Clustering, We propo...","{'first_name': 'Qiang', 'last_name': 'Qiu', 'f...",../zetaobjects_v1/tfidf_vector_abstract_title_...,../zetaobjects_v1/tfidf_vector_abstract_title_...
