## load datasets

The first step is to load our three datasets (train, dev, test) from the data folder.

In [1]:
import pandas as pd
import json
import os

In [2]:
# Dataset splits to process; each one is read from '<split>.csv' inside data_dir.
datasets = ['train', 'dev', 'test']

# Absolute path of the current working directory, where the CSV files are read
# from and the generated files are written to.
data_dir = os.path.abspath(os.curdir)

We need to convert the CSV files to JSON files, for convenience in the embedding step.

In [3]:
# For every split: read '<split>.csv', then write
#   * 'specter_ids_<split>.json'  - the PaperId values, one per line ('\r\n' separated)
#   * 'specter_data_<split>.json' - a JSON object mapping PaperId -> {title, abstract, paper_id}
for dataset in datasets:
    # dtype=str keeps every column (including PaperId) as text
    data = pd.read_csv(os.path.join(data_dir,'{}.csv'.format(dataset)), dtype=str)
    print(data.tcm.value_counts())
    print(data.columns)

    dataset_pids = list(data.PaperId.values)

    # Keyed by PaperId: if a PaperId occurs more than once, the later row
    # overwrites the earlier one, so len(dataset_json) can be smaller than
    # len(dataset_pids).
    dataset_json = {}
    for title, abstract, PaperId in zip(data['Title'], data['Abstract'], data['PaperId']):
        dataset_json[PaperId] = {'title': title,
                                 'abstract': abstract,
                                 'paper_id': PaperId}

    # Build the ID listing once per dataset. Doing this inside the row loop
    # repeated the full join for every row and, for an empty CSV, left the
    # previous dataset's string in place.
    doistr = '\r\n'.join(dataset_pids)

    output_id_path = os.path.join(data_dir,'specter_ids_{}.json'.format(dataset))
    with open(output_id_path,'w') as f:
        f.write(doistr)
    output_data_path = os.path.join(data_dir,'specter_data_{}.json'.format(dataset))
    with open(output_data_path,'w') as f:
        json.dump(dataset_json,f)
    print(f'Written {dataset}:', len(dataset_pids), len(dataset_json))

1    25230
0    24857
Name: tcm, dtype: int64
Index(['Unnamed: 0', 'Topic', 'TopicScore', 'PaperId', 'Title', 'Abstract',
       'DOI', 'Year', 'OnlineDate', 'Publisher', 'Journal', 'tcm',
       'title_lang', 'abs_lang'],
      dtype='object')
Written train: 50087 50086
0    5417
1    5316
Name: tcm, dtype: int64
Index(['Unnamed: 0', 'Topic', 'TopicScore', 'PaperId', 'Title', 'Abstract',
       'DOI', 'Year', 'OnlineDate', 'Publisher', 'Journal', 'tcm',
       'title_lang', 'abs_lang'],
      dtype='object')
Written dev: 10733 10733
1    5444
0    5289
Name: tcm, dtype: int64
Index(['Unnamed: 0', 'Topic', 'TopicScore', 'PaperId', 'Title', 'Abstract',
       'DOI', 'Year', 'OnlineDate', 'Publisher', 'Journal', 'tcm',
       'title_lang', 'abs_lang'],
      dtype='object')
Written test: 10733 10733


## embed with API
Embed the data, converting the text to vectors, using the API provided by our clients.

In [4]:
# write out
# code adapted from: https://github.com/allenai/paper-embedding-public-apis
from typing import Dict, List
import json
import requests

# Endpoint that embed() posts each batch of papers to.
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
# Default number of papers sent per request (default chunk size for chunks()).
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Yield consecutive slices of ``lst`` containing at most ``chunk_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start : start + chunk_size] for start in starts)

def embed(papers, timeout=300):
    """Embed papers through the remote API, one batch at a time.

    Args:
        papers: sequence of JSON-serialisable paper records; it is split into
            batches with ``chunks`` and each batch is POSTed to ``URL``.
        timeout: seconds to wait for each request before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block the whole run indefinitely.

    Returns:
        Dict mapping each returned ``paper_id`` to its ``embedding``, taken
        from the ``"preds"`` list of every response.

    Raises:
        RuntimeError: if any batch gets a non-200 response. Embeddings
            collected from earlier batches are discarded.
    """
    embeddings_by_paper_id: Dict[str, List[float]] = {}
    # A single Session reuses the underlying connection across all batches
    # instead of opening a new one per request.
    with requests.Session() as session:
        for batch_index, chunk in enumerate(chunks(papers)):
            # Allow Python requests to convert the data above to JSON
            response = session.post(URL, json=chunk, timeout=timeout)
            if response.status_code != 200:
                # Include the status and the start of the body so the failure
                # can be diagnosed.
                raise RuntimeError(
                    "Sorry, something went wrong, please try later! "
                    f"(batch {batch_index}, HTTP {response.status_code}: "
                    f"{response.text[:200]})"
                )
            for paper in response.json()["preds"]:
                embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]
    return embeddings_by_paper_id

In [5]:
%%time
# For every split: read '<split>.csv', embed each complete row through embed(),
# and write the resulting {paper_id: embedding} mapping to
# '<split> embeddings.json' in data_dir.
datasets = ['train','dev','test']

for dataset in datasets:
    data = pd.read_csv(os.path.join(data_dir,'{}.csv'.format(dataset)), dtype=str)
    dataset_json = []
    for title, abstract, paperId in zip(data['Title'], data['Abstract'], data['PaperId']):
        # Only send rows where all three fields are real strings; with
        # dtype=str a missing CSV field is read back as NaN (a float).
        if isinstance(title, str) and isinstance(abstract, str) and isinstance(paperId, str):
            dataset_json.append({'title': title,
                                 'abstract': abstract,
                                 'paper_id': paperId})
    # Make dropped rows visible instead of discarding them silently.
    skipped = len(data) - len(dataset_json)
    if skipped:
        print('SKIPPED (non-string title/abstract/PaperId):', skipped)
    print('EMBEDDING:', len(dataset_json))
    all_embeddings = embed(dataset_json)
    dataset_path = os.path.join(data_dir,f'{dataset} embeddings.json')
    with open(dataset_path, 'w') as f:
        json.dump(all_embeddings,f)

EMBEDDING: 50087
EMBEDDING: 10733
EMBEDDING: 10733
CPU times: user 4min 28s, sys: 23.9 s, total: 4min 52s
Wall time: 3h 3min 24s


## check embeddings

In [28]:
# Read the embeddings stored in 'test embeddings.json' back into memory and
# display how many entries the file contains.
test_data_path = os.path.join(data_dir, 'test embeddings.json')
with open(test_data_path) as embeddings_file:
    test_data = json.loads(embeddings_file.read())
len(test_data)

10733

Verify that the embedding vectors have the expected length (768).

In [10]:
# Check that every test-set paper that has an embedding got a 768-dimensional
# vector, then print one example.
# dtype=str keeps PaperId as text so it matches the string keys of the JSON
# object loaded into test_data.
data = pd.read_csv(os.path.join(data_dir,'test.csv'), dtype=str)

# The CSV column is 'PaperId' (capital P). Rows without an entry in test_data
# are ignored here.
paper_ids = [paper_id for paper_id in data['PaperId'] if paper_id in test_data]
assert paper_ids, 'no PaperId from test.csv has an embedding in test_data'

for paper_id in paper_ids:
    embedding = test_data[paper_id]
    assert len(embedding)==768, f'{paper_id}: embedding has length {len(embedding)}'

# Show a single example vector rather than printing all of them.
embedding = test_data[paper_ids[0]]
print(len(embedding),embedding)

768 [-0.2704066336154938, -6.741484642028809, 1.127624273300171, 3.361131429672241, 2.02712082862854, -0.6410074234008789, -2.2783796787261963, 3.0808703899383545, -0.41077226400375366, -4.879283905029297, 4.505235195159912, -1.6533331871032715, 1.558760166168213, -0.8224465250968933, -1.3239092826843262, -1.8863755464553833, -1.8514988422393799, 3.831915855407715, -0.21888302266597748, 3.633793354034424, 0.834517240524292, 2.54437518119812, -1.23715341091156, -0.47197672724723816, -0.34476637840270996, -1.7406948804855347, -2.8470489978790283, -0.29483267664909363, -0.8454023003578186, 3.951598644256592, -7.136223793029785, -3.592519760131836, 2.6059088706970215, 0.43178361654281616, 3.368518590927124, -4.457472801208496, -2.4938812255859375, 5.782375335693359, -2.2960832118988037, 2.511171340942383, 4.560859203338623, -4.615362167358398, 1.379830002784729, -3.1232292652130127, 3.9620652198791504, 1.3823405504226685, -1.5499026775360107, 3.6301379203796387, 1.7579987049102783, 3.25719