In [35]:
import ast
import os

import numpy as np
import pandas as pd
from openai import OpenAI
# Take the API key from the OPENAI_API_KEY environment variable only. If it is
# not set, this cell raises KeyError immediately instead of building a client
# around a placeholder string that is not a real key. Keys should never be
# pasted into the notebook itself.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [36]:
def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> list[float]:
    """Return the embedding vector for a single piece of text.

    Args:
        text: The string to embed. Newlines are replaced with spaces before
            the request is sent.
        model: Name of the embedding model to request.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.embeddings.create`` (for example ``dimensions``), the
            same way ``get_embeddings`` forwards them.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    text = text.replace("\n", " ")
    # The API takes a list of inputs; send a one-element batch and unwrap it.
    response = client.embeddings.create(input=[text], model=model, **kwargs)
    return response.data[0].embedding

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    This is the dot product of the two vectors divided by the product of
    their norms as computed by ``np.linalg.norm``.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product
 
def get_embeddings(
    list_of_text: list[str], model="text-embedding-3-small", **kwargs) -> list[list[float]]:
    """Embed a batch of texts with a single request.

    Args:
        list_of_text: Strings to embed; at most 2048 per call.
        model: Name of the embedding model to request.
        **kwargs: Forwarded unchanged to ``client.embeddings.create``.

    Returns:
        One embedding per item of the response data, in response order.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # Newlines can negatively affect performance, so flatten them to spaces.
    cleaned = []
    for item in list_of_text:
        cleaned.append(item.replace("\n", " "))

    response = client.embeddings.create(input=cleaned, model=model, **kwargs)
    return [entry.embedding for entry in response.data]

In [37]:
#df = pd.read_csv("words.csv")
#df

In [38]:
# Computes an embedding for each entry in the 'text' column of 'words.csv', stores it in an 'embedding' column, and saves the result as a new csv file
#df['embedding'] = df['text'].apply(get_embedding)
#df.to_csv('word_embeddings.csv')

In [39]:
#df = pd.read_csv('word_embeddings.csv')
#df['embedding'] = df['embedding'].apply(eval).apply(np.array)
#df

In [40]:
# Read the speech from "fed-speech.csv" into a DataFrame and preview the first five rows
df = pd.read_csv("fed-speech.csv")
df.head(5)

Unnamed: 0,text
0,Good afternoon
1,My colleagues and I are strongly committed to ...
2,We have both the tools that we need and the re...
3,Price stability is the responsibility of the F...
4,"Without price stability, the economy does not ..."


In [41]:
# Get the embeddings of the speech. Save to csv to reduce embedding function calls
#df['embeddings'] = df['text'].apply(get_embedding)
#df.to_csv('fed-speech-embeddings')

In [84]:
# Load the cached embeddings. A CSV file holds each embedding as text (the
# printed form of a Python list, e.g. "[0.034, -0.031, ...]"), so every cell is
# first parsed back into a list and then converted to a NumPy array for numeric use.
# ast.literal_eval is used for the parsing because it only accepts Python
# literals, whereas eval would execute any code found in the file.
df = pd.read_csv('fed-speech-embeddings')
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)
df.head()

Unnamed: 0.1,Unnamed: 0,text,embeddings
0,0,Good afternoon,"[0.03404167294502258, -0.030998127534985542, -..."
1,1,My colleagues and I are strongly committed to ...,"[0.011237340047955513, 0.014179651625454426, 0..."
2,2,We have both the tools that we need and the re...,"[0.010302674025297165, 0.01715744100511074, 0...."
3,3,Price stability is the responsibility of the F...,"[-0.013447479344904423, -0.006005288101732731,..."
4,4,"Without price stability, the economy does not ...","[-0.01848752796649933, 0.005304287187755108, 0..."


In [85]:
# Choose the search query and embed it so it can be compared with the speech embeddings
search_sentence = "fomc pce august"
search_embedded = get_embedding(text=search_sentence)

In [86]:
# Score every row: cosine similarity between its embedding and the embedded search query
df['similarities'] = df['embeddings'].apply(
    lambda sentence_embedding: cosine_similarity(sentence_embedding, search_embedded)
)
df.head()

Unnamed: 0.1,Unnamed: 0,text,embeddings,similarities
0,0,Good afternoon,"[0.03404167294502258, -0.030998127534985542, -...",0.211628
1,1,My colleagues and I are strongly committed to ...,"[0.011237340047955513, 0.014179651625454426, 0...",0.392007
2,2,We have both the tools that we need and the re...,"[0.010302674025297165, 0.01715744100511074, 0....",0.298283
3,3,Price stability is the responsibility of the F...,"[-0.013447479344904423, -0.006005288101732731,...",0.347545
4,4,"Without price stability, the economy does not ...","[-0.01848752796649933, 0.005304287187755108, 0...",0.255388


In [87]:
# Rank rows from most to least similar, then renumber them from 0 (old index is dropped)
df = df.sort_values("similarities", ascending=False).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,text,embeddings,similarities
0,23,"Over the 12 months ending in September, total ...","[-0.028180072084069252, 0.04317083954811096, 0...",0.572014
1,6,"Today, the FOMC raised our policy interest rat...","[-0.005314371082931757, -0.00754205510020256, ...",0.491557
2,24,The recent inflation data again have come in h...,"[0.016138914972543716, -0.0019731884822249413,...",0.463067
3,52,We at the Fed will do everything we can to ach...,"[-0.0073754191398620605, -0.006112131755799055...",0.457727
4,13,Recent indicators point to modest growth of sp...,"[0.04072889685630798, 0.04065309837460518, 0.0...",0.454602


In [88]:
# Print the top three results. head(3) selects rows by position, so this prints
# fewer suggestions (instead of raising KeyError) when the frame has fewer than
# three rows, and it does not depend on the index labels being 0, 1, 2.
for i, text in enumerate(df['text'].head(3)):
    print(f'Suggestion {i}: {text}')
    print('\n')

Suggestion 0: Over the 12 months ending in September, total PCE prices rose at 6.2%, excluding the volatile food and energy categories, core PCE prices rose at 5.1%


Suggestion 1: Today, the FOMC raised our policy interest rate by 75 basis points and we continue to anticipate that ongoing increases will be appropriate


Suggestion 2: The recent inflation data again have come in higher than expected


