In [1]:
# https://github.com/openai/openai-python/blob/main/examples/embeddings/Get_embeddings.ipynb
# %pip install openai

# Note
The output embedding matrix contains exactly one row per document. Documents with a missing PMID, a missing abstract, or a very long abstract (>2000 tokens) are discarded.

# Input
- `data/pubmed/abstracts.csv.gz` contains the raw, un-preprocessed texts. Duplicate articles are discarded to speed up model fitting.

# Outputs

- `models/abstracts_gpt3curie.npz` contains the embedding weights, one row per document.
- `models/abstracts_pmids_gpt3curie.csv` contains the PMIDs for the rows of the above matrix. This can be used to connect the weights to the actual PubMed datasets.

In [2]:
import numpy as np
from tqdm import tqdm
from pathlib import Path
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from python.cogtext.datasets.pubmed import PubMedDataLoader
from python.cogtext.similarity_matrix import get_similarity_matrix
import re

In [6]:
# Load the raw (un-preprocessed) PubMed corpus and prepare it for embedding.
# The row order of `pubmed` after this cell defines the row order of the
# embedding matrix, so the PMIDs are written out right after filtering.
PMIDS_PATH = Path('models/abstracts_pmids_gpt3curie.csv')

pubmed = PubMedDataLoader(preprocessed=False,
                          drop_low_occurred_labels=True).load()
pubmed = (
  pubmed
  # keep only rows that have both a PMID and an abstract
  .query('pmid.notna() and abstract.notna()')
  # assign() builds a new frame instead of writing into a filtered view,
  # which avoids pandas' chained-assignment warning
  .assign(abstract=lambda df: df['abstract'].str.replace('\n', ' ', regex=False))
  # one row per article
  .drop_duplicates(subset=['pmid'])
)

# remove the longest abstract(s): a very long document prevented GPT-3 from
# encoding all the other documents. Filtering with a boolean mask (rather
# than dropping by index label) removes exactly the offending rows even if
# index labels are not unique.
abstract_lengths = pubmed['abstract'].str.len()
pubmed = pubmed[abstract_lengths < abstract_lengths.max()]

# or a slower RegEx approach
# abstract_tokens = pubmed['abstract'].apply(lambda x: len(re.split('\W+|\s+', x)))
# pubmed = pubmed[abstract_tokens < 2000]

# The PMID table is plain CSV, so it gets a .csv extension; the .npz name is
# used for the embedding matrix itself.
PMIDS_PATH.parent.mkdir(parents=True, exist_ok=True)
pubmed[['pmid']].to_csv(PMIDS_PATH)

print(f'* Embedding {pubmed.shape[0]} abstracts (pmids in {PMIDS_PATH.as_posix()})')

* Embedding 385704 abstracts (pmids in models/abstracts_pmids_gpt3curie.npz)


In [7]:
# Location of the embedding matrix. If it already exists, its rows are treated
# as embeddings of the first N documents so a later cell can resume from there.
OUTPUT_PATH = Path('models/abstracts_gpt3curie.npz')

# Defaults for a fresh run: nothing embedded yet, no cached matrix.
N_PREFITTED_DOCUMENTS = 0
available_embeddings = None

if OUTPUT_PATH.exists():
  # np.load returns an NpzFile that keeps the archive open; the context
  # manager closes it as soon as the array has been read into memory.
  with np.load(OUTPUT_PATH) as archive:
    available_embeddings = archive['arr_0']  # default key used by np.savez
  N_PREFITTED_DOCUMENTS = available_embeddings.shape[0]

print(f'* {N_PREFITTED_DOCUMENTS} documents already embedded.')

* 0 documents already embedded.


In [10]:
# Embedding run configuration.
batch_size = 100  # number of abstracts sent to the API per request
gpt3_model_id = 'ada'

# Output width of the first-generation GPT-3 "*-similarity" embedding engines.
# The width is looked up from the model id so that the zero placeholders used
# for failed batches always have the same number of columns as real
# embeddings; otherwise the final np.vstack fails on a shape mismatch.
GPT3_EMBEDDING_DIMS = {'ada': 1024, 'babbage': 2048, 'curie': 4096, 'davinci': 12288}
embeddings_dim = GPT3_EMBEDDING_DIMS.get(gpt3_model_id, 2048)  # 2048 kept as legacy fallback

model = openai.Engine(id=f'{gpt3_model_id}-similarity')
embeddings = []  # per-batch embedding arrays, stacked after the loop

# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def gpt3_embed(texts: list[str]):
  """Embed a batch of texts with the module-level GPT-3 `model`.

  Args:
    texts: the documents to embed, one string per document.

  Returns:
    A numpy array with one row per input text. If anything goes wrong while
    requesting or unpacking the embeddings, the error is printed and an
    all-zero array of shape (len(texts), embeddings_dim) is returned instead,
    so a single failed batch does not abort the whole run.
  """
  try:
    response = model.embeddings(input=texts)
    # each entry of response['data'] carries one vector under 'embedding'
    vectors = np.array([item['embedding'] for item in response['data']])
  except Exception as err:
    print('GPT-3 failed! Filling the batch with zeros.', err)
    return np.zeros((len(texts), embeddings_dim))
  return vectors

# Embed every document that is not covered by the cached matrix, batch by
# batch, in the row order of `pubmed`.
new_batches = []
for start in tqdm(range(N_PREFITTED_DOCUMENTS, len(pubmed), batch_size), unit='batch'):
  batch = pubmed['abstract'].iloc[start:start + batch_size].tolist()
  new_batches.append(gpt3_embed(batch))

# Previously cached rows come first, followed by the freshly embedded ones.
parts = ([available_embeddings] if N_PREFITTED_DOCUMENTS > 0 else []) + new_batches

# Failed batches are zero placeholders whose width comes from `embeddings_dim`.
# If that differs from the width the API actually returned, np.vstack would
# raise only after the entire (hours-long) run. When all non-zero parts agree
# on one width, rebuild any wrong-width part (necessarily all zeros) to match.
real_widths = {part.shape[1] for part in parts if part.any()}
if len(real_widths) == 1:
  width = real_widths.pop()
  parts = [part if part.shape[1] == width else np.zeros((part.shape[0], width))
           for part in parts]

# np.vstack rejects an empty list, which happens when there was nothing left
# to embed and no cached matrix either.
embeddings = np.vstack(parts) if parts else np.empty((0, embeddings_dim))

# Save to the same file the resume check reads (OUTPUT_PATH), so that a
# re-run actually picks up these rows.
# NOTE(review): OUTPUT_PATH is named "...gpt3curie.npz" while gpt3_model_id is
# 'ada' — confirm which model this notebook is meant to use.
np.savez(OUTPUT_PATH, embeddings)

  6%|▌         | 220/3858 [07:08<4:44:10,  4.69s/batch]