In [6]:
# https://github.com/openai/openai-python/blob/main/examples/embeddings/Get_embeddings.ipynb
# %pip install openai

#Note
Output embedding model contains only one row per  document. All documents with missing PMID or abstracts or very long abstracts (>2000 tokens) are discarded.

# Input
- `data/pubmed/abstracts.csv.gz` contains raw un-preprocessed texts. Duplicate articles will be discarded to speed up model fitting.

# Outputs

- `models/abstracts_gpt3curie.npz` for the embedding weights one row per document.
- `models/abstracts_pmids_gpt3curie.csv` includes PMIDs for the rows of the above matrix. THis can be use to connect weights to the actual PubMed datasets.

In [47]:
import numpy as np
from tqdm import tqdm
import openai
from python.cogtext.datasets.pubmed import PubMedDataLoader
from python.cogtext.similarity_matrix import get_similarity_matrix

pubmed = PubMedDataLoader(preprocessed=False, drop_low_occurred_labels=True).load()
pubmed = pubmed.query('pmid.notna() and abstract.notna()')
pubmed['abstract'] = pubmed['abstract'].apply(lambda x: x.replace('\n', ' '))
pubmed = pubmed.drop_duplicates(subset=['pmid'])
len(pubmed)

pubmed[['pmid']].to_csv('models/abstracts_pmids_gpt3curie.npz')

385705

In [80]:
import re
# # pubmed.loc[10:12,'abstract'].tolist()
# long_abstracts  = pubmed['abstract'].apply(lambda x: len(x.split('')) < 1000)
# # pubmed = pubmed[~long_abstracts]

abstract_tokens = pubmed['abstract'].apply(lambda x: len(re.split('\W+|\s+', x)))
pubmed = pubmed[abstract_tokens < 2000]

In [None]:
OUTPUT_PATH = Path('models/abstracts_gpt3curie.npz')

n_fitted_records = 0

if OUTPUT_PATH.exist():
  pre_cached = np.load(OUTPUT_PATH)['arr_0']
  n_fitted_records = pre_cached.shape[0]


In [88]:
chunk_size = 5

model = openai.Engine(id='curie-similarity')

cached = []

for i in tqdm(1,10): #range(n_fitted_records, len(pubmed), chunk_size), unit='doc'):
  chunk = pubmed[i:i+chunk_size]['abstract'].to_list()
  try:
    chunk_embeddings = model.embeddings(input=chunk)['data']
    cached.extend(chunk_embeddings)
  except Exception as e:
    cached.extend([None] * len(chunk))
    print('error, skipping chunk...')


print(len(cached))

cached_arr = np.array(cached)

if n_fitted_records > 0:
  cached_arr = np.vstack(pre_cached)

np.save('models/abstracts_gpt3curie.npz', cached_arr)

  7%|▋         | 275/3858 [17:12<2:20:10,  2.35s/doc]

error, skipping chunk...


  7%|▋         | 277/3858 [17:16<1:54:06,  1.91s/doc]

error, skipping chunk...


 34%|███▍      | 1308/3858 [1:24:36<2:04:59,  2.94s/doc]

error, skipping chunk...


 41%|████      | 1568/3858 [1:44:05<2:32:01,  3.98s/doc]


KeyboardInterrupt: 

In [92]:
cached_arr = np.array(cached)
cached_arr

array([<OpenAIObject embedding at 0x15ea6f270> JSON: {
         "embedding": [
           0.006812068168073893,
           0.012163734063506126,
           0.007386806886643171,
           0.022160420194268227,
           -0.016092685982584953,
           0.01921135187149048,
           -0.0029090263415127993,
           -0.02099210023880005,
           0.004013749770820141,
           0.0018007697071880102,
           -0.031205490231513977,
           -0.020294874906539917,
           0.0052857124246656895,
           -0.005959381815046072,
           0.013567605055868626,
           -0.014462689869105816,
           -0.0027724080719053745,
           -0.013407431542873383,
           0.012163734063506126,
           0.016676846891641617,
           0.006298571825027466,
           0.006962819490581751,
           -0.006741403602063656,
           0.0025203709956258535,
           0.0099966861307621,
           0.030376359820365906,
           -0.015150492079555988,
           -0.0053