In [None]:
# https://github.com/openai/openai-python/blob/main/examples/embeddings/Get_embeddings.ipynb
%pip install openai -U

# Note
The output embedding matrix contains exactly one row per document. Documents with a missing PMID, title, or abstract, as well as documents with very long abstracts, are discarded because GPT-3 requires each document to be 2047 tokens or fewer.

# Input
- `data/pubmed/abstracts.csv.gz` contains the raw, unpreprocessed texts. Duplicate articles (rows sharing the same PMID) are discarded to speed up model fitting.

# Outputs

- `models/abstracts_gpt3ada.npz` for the embedding weights; one row per document.
- `models/gpt3/abstracts_pmids_gpt3ada.csv` includes the PMIDs for the rows of the above matrix. This can be used to connect the weights to the actual PubMed datasets.

In [1]:
import numpy as np
from tqdm import tqdm
from pathlib import Path
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from python.cogtext.datasets.pubmed import PubMedDataLoader
from python.cogtext.similarity_matrix import get_similarity_matrix
import re

In [65]:
# GPT-3 model family to embed with. This id is interpolated into the engine name
# (`{gpt3_model_id}-similarity`), used as the key into `gpt3_embeddings_dims`, and
# embedded in the names of the files written under `models/gpt3/`.
gpt3_model_id = 'ada'

In [67]:
# Load the raw (un-preprocessed) PubMed documents and keep only the rows that can
# be embedded and traced back to a PMID: non-missing pmid/abstract/title, newlines
# in the abstract flattened to spaces, and one row per PMID.
pubmed = PubMedDataLoader(preprocessed=False,
                          drop_low_occurred_labels=True).load()
pubmed = (
  pubmed
  .query('pmid.notna() and abstract.notna() and title.notna()')
  # `assign` builds a new frame instead of writing into the result of `query`,
  # which avoids pandas' SettingWithCopyWarning.
  .assign(abstract=lambda df: df['abstract'].str.replace('\n', ' ', regex=False))
  .drop_duplicates(subset=['pmid'])
)


#### REMOVE VERY LONG ABSTRACTS; GPT-3 is limited to 2047 tokens per document

# Tunable limits for the length filters below.
N_LONGEST_DROPPED = 10      # how many of the longest abstracts to drop outright
MAX_ABSTRACT_CHARS = 3000   # character-length heuristic standing in for the token limit

# remove the very longest documents; one of them prevented GPT-3 from encoding
# all the other documents
very_long_docs = pubmed['abstract'].str.len().nlargest(N_LONGEST_DROPPED)
pubmed = pubmed.drop(index=very_long_docs.index)

# and just a heuristic to avoid GPT-3 errors when encoding documents
pubmed = pubmed[pubmed['abstract'].str.len() < MAX_ABSTRACT_CHARS]

# or a slower RegEx approach
# abstract_tokens = pubmed['abstract'].apply(lambda x: len(re.split('\W+|\s+', x)))
# pubmed = pubmed[abstract_tokens < 2000]

# Store the PMIDs (the frame's index is written as the first CSV column) so that
# embedding rows can be mapped back to documents. The file is CSV, so it gets a
# `.csv` extension, and the output folder is created if it does not exist yet.
pmids_path = Path(f'models/gpt3/abstracts_{gpt3_model_id}_pmids.csv')
pmids_path.parent.mkdir(parents=True, exist_ok=True)
pubmed[['pmid']].to_csv(pmids_path)

print(f'* {pubmed.shape[0]} abstracts (pmids in {pmids_path.as_posix()})')

* 382855 abstracts (pmids in models/gpt3/abstracts_ada_pmids.npz)


In [68]:
# Location of the embedding matrix for this model. If the file already exists,
# its row count is taken as the number of documents that are already embedded.
OUTPUT_PATH = Path(f'models/gpt3/abstracts_{gpt3_model_id}.npz')

N_PREFITTED_DOCUMENTS = 0

if OUTPUT_PATH.exists():
  # `np.load` on an .npz archive returns an NpzFile that keeps the file open;
  # the context manager closes it as soon as the array has been read.
  with np.load(OUTPUT_PATH) as cached_archive:
    available_embeddings = cached_archive['arr_0']
  N_PREFITTED_DOCUMENTS = available_embeddings.shape[0]

print(f'* {N_PREFITTED_DOCUMENTS} documents already embedded.')

* 0 documents already embedded.


In [71]:
# number of abstracts sent to the engine per request
batch_size = 100

# engine handle used for every embedding request
model = openai.Engine(id=f'{gpt3_model_id}-similarity')

# Vector width per model id; used to size the zero-filled placeholder when a
# request fails (presumably the width each `*-similarity` engine returns -- verify).
gpt3_embeddings_dims = dict([
  ('ada', 1024),
  ('babbage', 2048),
  ('curie', 4096),
  ('davinci', 12288),
])

# NOTE: the retry decorator is left disabled. As the function is written it would
# never fire anyway, because every exception is handled inside the function body.
# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def gpt3_embed(texts: list[str]):
  """Embed one batch of texts with the module-level GPT-3 `model`.

  Args:
    texts: documents to embed in a single request.

  Returns:
    A 2-D numpy array with one row per input text, in input order. If the
    request fails for any reason, the error is printed and an all-zero array of
    shape `(len(texts), gpt3_embeddings_dims[gpt3_model_id])` is returned
    instead, so all-zero rows mean "not embedded". An empty `texts` returns an
    empty `(0, dim)` array without issuing a request.
  """
  if not texts:
    return np.zeros((0, gpt3_embeddings_dims[gpt3_model_id]))

  try:
    response = model.embeddings(input=texts)
    # Each item may carry the position of its input in `index`; sort on it so the
    # rows stay aligned with `texts`. The sort is stable, so items without an
    # `index` keep the order in which they were returned.
    items = sorted(response['data'], key=lambda item: item.get('index', 0))
    Z = np.array([item['embedding'] for item in items])
  except Exception as e:
    # Deliberate best effort: keep the run going and keep row alignment intact.
    print('GPT-3 failed! Filling the batch with zeros.', e)
    Z_dim = gpt3_embeddings_dims[gpt3_model_id]
    Z = np.zeros((len(texts), Z_dim))
  return Z

# Walk over the abstracts in steps of `batch_size`, starting after the documents
# that are already embedded, and write each batch to its own numbered .npz file.
batch_starts = range(N_PREFITTED_DOCUMENTS, len(pubmed), batch_size)

for i in tqdm(batch_starts, unit='batch'):
  texts = pubmed['abstract'].iloc[i:i + batch_size].tolist()
  vectors = gpt3_embed(texts)

  # cache (file names are numbered from 1)
  cache_file = f'models/gpt3/abstracts_{gpt3_model_id}_b{(int(i/batch_size)+1):05d}.npz'
  np.savez(cache_file, vectors)


100%|██████████| 3829/3829 [2:35:23<00:00,  2.43s/batch]  


The following code converts the GPT-3 embeddings to NetCDF and stores them in the `models/abstracts_gpt3ada.nc` file. Use xarray to load the dataset.

In [9]:
import numpy as np
import pandas as pd
import xarray as xr

# Input locations (PMID table, embedding matrix) and the NetCDF output location.
PMIDS_CSV = 'models/gpt3/abstracts_gpt3ada_pmids.csv'
EMBEDDINGS_NPZ = 'models/abstracts_gpt3ada.npz'
NETCDF_OUT = 'models/abstracts_gpt3ada.nc'

# PMID table; the first CSV column becomes the index and is named `original_index`
pmids = pd.read_csv(PMIDS_CSV, index_col=0).rename_axis('original_index')

# embedding matrix, stored under the `arr_0` key of the .npz archive
embeddings = np.load(EMBEDDINGS_NPZ)['arr_0']

# DEBUG embeddings.shape, pmids.shape

# assemble the dataset: a (pmid x embedding) matrix plus `original_index` and
# `pmid` coordinates taken from the PMID table
data_vars = {'embeddings': (['pmid', 'embedding'], embeddings)}
coords = {
  'original_index': pmids.index.values,
  'pmid': pmids['pmid'].values,
}
dataset = xr.Dataset(data_vars, coords=coords)

# write to disk; compression is off, to enable it pass
# encoding={'embeddings':{'zlib':True, 'complevel':9}}
dataset.to_netcdf(NETCDF_OUT)

# DEBUG report dimensionalities
# dataset['pmid'].shape, dataset['original_index'].shape, dataset['embeddings'].shape

# reopen the stored file and display it for validation
xr.open_dataset(NETCDF_OUT)