In [1]:
import os

# Year of journal entries processed by this run.
current_year = '1946'

# os.path.join / os.path.exists do not expand '~' themselves, so expand it
# up front; otherwise the existence check below looks for a directory that is
# literally named '~' and always reports False.
journals_path = os.path.expanduser('~/Documents/Harry_Howard_Journal')

# Per-year JSON input and the shared CSV that collects entries with embeddings.
input_file = os.path.join(journals_path, f'{current_year}/{current_year}-entries.json')
embeddings_file = os.path.join(journals_path, 'journal-entries-with-embeddings.csv')

# Report whether the embeddings CSV already exists.
print(os.path.exists(embeddings_file))

False


In [2]:
import os
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
import openai
# Take the API key from the environment rather than hard-coding it;
# the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Embedding model parameters.

# Token budget per text; kept below the 8191-token maximum of text-embedding-ada-002.
max_tokens = 8000

# Model used for the embeddings, and the tokenizer encoding that goes with it.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"

In [4]:
# Load the year's entries, keep only the date and content columns, drop rows
# with missing values, and add a 'combined' column holding the text to embed.
df = (
    pd.read_json(input_file)
    .loc[:, ['date', 'content']]
    .dropna()
    .assign(
        combined=lambda frame: (
            "Date: "
            + frame['date'].astype(str).str.strip()
            + "; Text: "
            + frame['content'].str.strip()
        )
    )
)

# Show the first three rows as the cell output.
df.head(3)


Unnamed: 0,date,content,combined
0,1946-01-01,A new year dawns upon a torn and bleeding worl...,Date: 1946-01-01; Text: A new year dawns upon ...
1,1946-01-02,The biggest check I have ever drawn for 15 day...,Date: 1946-01-02; Text: The biggest check I ha...
2,1946-01-03,"Our state, as a state, marks its 50th birthday...","Date: 1946-01-03; Text: Our state, as a state,..."


In [5]:
# Tokenizer used to count how many tokens each combined text contains.
encoding = tiktoken.get_encoding(embedding_encoding)

# omit journal entries that are too long to embed
start_count = len(df)

df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
# Take an explicit copy of the filtered rows: adding columns to the result
# later would otherwise raise SettingWithCopyWarning whenever rows were dropped.
df = df[df.n_tokens <= max_tokens].copy()

# Report how many entries were dropped by the length filter.
removed = start_count - len(df)
print(f'removed {removed}')

removed 0


In [6]:
# Compute an embedding for each entry; get_embedding is called once per row.
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))

# Append this run's rows to the shared CSV. The header row is written only when
# the file is new (or empty), so a freshly created file keeps its column names
# while later appends do not repeat them. '~' is expanded here because the
# os.path checks do not expand it themselves.
# NOTE(review): re-running this cell appends the same rows again.
embeddings_path = os.path.expanduser(embeddings_file)
write_header = not os.path.exists(embeddings_path) or os.path.getsize(embeddings_path) == 0
df.to_csv(embeddings_path, mode='a', header=write_header, index=False)    # append mode
