In [2]:
import os
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
import openai
# Take the API key from the environment instead of hardcoding it in the notebook.
# os.environ.get returns None (rather than raising) when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Embedding model parameters; these names are read by the cells below.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191; this cutoff sits slightly below it

In [4]:
# Read the 1948 journal entries and prepare the text that will be embedded.
input_datapath = '../json/1948-entries.json'

# Single pipeline: load the JSON, keep only the date and text columns,
# drop rows missing either value, then add a "combined" column of the form
# "Date: <date>; Text: <text>" with surrounding whitespace stripped from both parts.
df = (
    pd.read_json(input_datapath)[['date', 'text']]
    .dropna()
    .assign(
        combined=lambda entries: (
            "Date: "
            + entries['date'].astype(str).str.strip()
            + "; Text: "
            + entries['text'].str.strip()
        )
    )
)

# Preview the first two rows.
df.head(2)


Unnamed: 0,date,text,combined
0,1948-01-01,"As a new year dawns, bringing to a close a mos...","Date: 1948-01-01; Text: As a new year dawns, b..."
1,1948-01-02,There will very probably be many changes at th...,Date: 1948-01-02; Text: There will very probab...


In [5]:
# Tokenizer for the configured encoding; used to measure how long each entry is.
encoding = tiktoken.get_encoding(embedding_encoding)

# Omit entries that are too long to embed.
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
# Take an explicit copy of the filtered rows: a later cell adds an "embedding"
# column to this frame, and assigning into a boolean-mask slice is what triggers
# pandas' SettingWithCopyWarning.
df = df[df.n_tokens <= max_tokens].copy()
# Number of entries that remain after filtering.
len(df)

169

In [7]:
# Request an embedding for every combined entry string (one get_embedding call
# per row) and store the result alongside the entry.
df["embedding"] = df["combined"].map(
    lambda entry_text: get_embedding(entry_text, engine=embedding_model)
)

# Write the entries together with their embeddings to a CSV in the working directory.
df.to_csv("./1948-journal-entries-with-embeddings.csv")