In [1]:
import os
import numpy as np
import pandas as pd

from openai import OpenAI
# Shared OpenAI client used by the embedding helpers below. The key is read
# from the OPENAI_API_KEY environment variable; the placeholder string is only
# used when that variable is not set.
client = OpenAI(
    api_key=os.environ.get(
        "OPENAI_API_KEY",
        "<your OpenAI API key if not set as env var>",
    )
)

In [2]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced by spaces before the request is made.
    The request goes through the module-level ``client``; the embedding of the
    first item in the response is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first response item.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product
 
def get_embeddings(
    list_of_text: list[str], model="text-embedding-3-small", **kwargs) -> list[list[float]]:
    """Return one embedding per input string using a single batched request.

    Args:
        list_of_text: The strings to embed; at most 2048 per call.
        model: Name of the embedding model passed to the API.
        **kwargs: Forwarded unchanged to ``client.embeddings.create``.

    Returns:
        A list with the ``embedding`` of each item in the response, or an
        empty list when ``list_of_text`` is empty.

    Raises:
        AssertionError: If more than 2048 strings are passed.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # An empty batch has nothing to embed, so skip the network round trip.
    if len(list_of_text) == 0:
        return []

    # replace newlines, which can negatively affect performance.
    cleaned = [text.replace("\n", " ") for text in list_of_text]

    data = client.embeddings.create(input=cleaned, model=model, **kwargs).data
    return [d.embedding for d in data]

In [3]:
#df = pd.read_csv("words.csv")
#df

In [4]:
# Compute an embedding for each word in 'words.csv', store it in the 'embedding' column, and save the result as a new CSV file
#df['embedding'] = df['text'].apply(get_embedding)
#df.to_csv('word_embeddings.csv')

In [5]:
#df = pd.read_csv('word_embeddings.csv')
#df['embedding'] = df['embedding'].apply(eval).apply(np.array)
#df

In [19]:
# Load the source data: the speech text stored in 'fed-speech.csv'.
df = pd.read_csv(filepath_or_buffer="fed-speech.csv")
#df.head()

In [20]:
# Get the embeddings of the speech. Save to csv to reduce embedding function calls
#df['embeddings'] = df['text'].apply(get_embedding)
#df.to_csv('fed-speech-embeddings.csv')

In [18]:
# Load the csv file
#df = pd.read_csv('fed-speech-embeddings.csv')
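#df['embeddings'] = df['embeddings'].apply(eval).apply(np.array) # The CSV stores each embedding as a string; eval parses it back into a list and np.array turns that list into a NumPy vector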
#df.head()

In [9]:
# Search for something in the speech and create an embedding of the search sentence
#search_sentence = "august"
#search_embedded = get_embedding(search_sentence)

In [10]:
# Compute similarities between the speech and the search
#df['similarities'] = df['embeddings'].apply(lambda x : cosine_similarity(x,search_embedded))
#df.head()

In [11]:
# Sort by most similar
#df = df.sort_values("similarities", ascending=False)
#df = df.reset_index(drop=True)
#df.head()

In [12]:
# Print top three results
#for i in [0,1,2]:
#    text = df['text'][i]
#    print(f'Suggestion {i}: {text}')
#    print('\n')

In [22]:
def get_most_similar(text, df, n=3):
    """Print the rows of ``df`` whose embeddings are most similar to ``text``.

    ``text`` is embedded with ``get_embedding`` and compared against every
    entry of ``df['embeddings']`` with ``cosine_similarity``. Rows are ranked
    from most to least similar and the ``text`` column of the top rows is
    printed as ``Suggestion <rank>: <text>``.

    The ranking is done on a copy, so the caller's ``df`` is not modified.
    If ``n`` exceeds the number of rows, every row is printed instead of
    raising an error.

    Args:
        text: The search string.
        df: DataFrame with ``text`` and ``embeddings`` columns.
        n: Maximum number of suggestions to print.

    Returns:
        None. Results are printed.
    """
    search_embedded = get_embedding(text)
    scores = df['embeddings'].apply(lambda x: cosine_similarity(x, search_embedded))

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    ranked = (
        df.assign(similarities=scores)
        .sort_values("similarities", ascending=False)
        .reset_index(drop=True)
    )

    # head() caps the count at the number of rows; max() keeps a negative n
    # printing nothing, as range(n) did.
    for i, suggestion in enumerate(ranked["text"].head(max(n, 0))):
        print(f'Suggestion {i}: {suggestion}')


def get_most_similar_unique(text, df, n=1):
    """Run ``get_most_similar`` once for each distinct word in ``text``.

    ``text`` is split on whitespace and repeated words are searched only once.
    Words are processed in order of first appearance, so the printed output is
    the same on every run.

    Args:
        text: Whitespace-separated search words.
        df: DataFrame passed through to ``get_most_similar``.
        n: Number of suggestions to print per word.

    Returns:
        None. Results are printed by ``get_most_similar``.
    """
    # dict.fromkeys removes duplicates like set() but keeps insertion order;
    # set iteration order over strings can change between interpreter runs.
    words = dict.fromkeys(text.split())
    for word in words:
        get_most_similar(word, df, n)
        
def load_embeddings_data(file):
    """Load a CSV of cached embeddings and restore the vectors as NumPy arrays.

    The ``embeddings`` column is read as text (e.g. ``"[0.1, 0.2]"``). Each
    cell is parsed back into a Python literal and converted with ``np.array``.

    Args:
        file: Path (or anything ``pd.read_csv`` accepts) of the CSV file; it
            must contain an ``embeddings`` column.

    Returns:
        The loaded DataFrame with ``embeddings`` holding NumPy arrays.

    Raises:
        ValueError, SyntaxError: If a cell is not a plain Python literal.
    """
    # Local import so this function does not depend on the notebook's import cell.
    import ast

    df = pd.read_csv(file)
    # ast.literal_eval only accepts literals (lists, numbers, ...); unlike
    # eval it cannot execute code embedded in the CSV file.
    df['embeddings'] = df['embeddings'].apply(
        lambda cell: np.array(ast.literal_eval(cell))
    )
    return df

In [29]:
# Load the cached speech embeddings, then define the multi-keyword query
# used by the search cells below.
df = load_embeddings_data("fed-speech-embeddings.csv")
search = "PCE fomc august prices"

In [24]:
# Rank the speech against the whole query string (default: top 3 suggestions).
get_most_similar(text=search, df=df)

Suggestion 0: Over the 12 months ending in September, total PCE prices rose at 6.2%, excluding the volatile food and energy categories, core PCE prices rose at 5.1%
Suggestion 1: Today, the FOMC raised our policy interest rate by 75 basis points and we continue to anticipate that ongoing increases will be appropriate
Suggestion 2: We at the Fed will do everything we can to achieve our maximum employment and price stability goals


In [30]:
# Search once per distinct word of the query (default: best match per word).
get_most_similar_unique(text=search, df=df)

Suggestion 0: Job gains have been robust with employment rising by an average of 289,000 jobs per month over August and September
Suggestion 0: Today, the FOMC raised our policy interest rate by 75 basis points and we continue to anticipate that ongoing increases will be appropriate
Suggestion 0: Over the 12 months ending in September, total PCE prices rose at 6.2%, excluding the volatile food and energy categories, core PCE prices rose at 5.1%
Suggestion 0: Price pressures remained evident across a broad range of goods and services


In [28]:
# Free-text phrase search, top 3 suggestions.
# NOTE(review): "comittee" is misspelled in the query; it is left as-is because
# changing it would change the embedding and the recorded output below.
get_most_similar(text="the federal open market comittee", df=df, n=3)

Suggestion 0: At today's meeting, the committee raised the target range for the federal funds rate by 75 basis points, and we are continuing the process of significantly reducing the size of our balance sheet, which plays an important role in firming the stance of monetary policy
Suggestion 1: We at the Fed will do everything we can to achieve our maximum employment and price stability goals
Suggestion 2: Today, the FOMC raised our policy interest rate by 75 basis points and we continue to anticipate that ongoing increases will be appropriate
