In [19]:
import os
import pandas as pd
from sqlalchemy import create_engine
import openai
import numpy

%load_ext dotenv
%dotenv
%reload_ext dotenv

openai.api_key = os.getenv('OPENAI_API_KEY')

print(os.getenv("POSTGRES_USER"))
print(os.getenv("POSTGRES_DB"))
print(os.getenv("POSTGRES_HOST"))
print(os.getenv("POSTGRES_PORT"))


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
awesome
qgenforthelazies-dev
localhost
5432


In [2]:
# connect to db
def get_db():
  """Create a SQLAlchemy engine for the Postgres database named in the environment.

  Reads POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST, POSTGRES_PORT and
  POSTGRES_DB from the process environment and assembles a
  ``postgresql://user:password@host:port/db`` URL from them.

  Returns:
    The engine object returned by ``create_engine``.

  Raises:
    RuntimeError: if one or more of the required variables is unset.
  """
  from urllib.parse import quote_plus

  required = ("POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_HOST", "POSTGRES_PORT", "POSTGRES_DB")
  settings = {name: os.getenv(name) for name in required}

  # Fail with the variable names instead of a "can only concatenate str"
  # TypeError when something is missing from the environment.
  missing = [name for name, value in settings.items() if value is None]
  if missing:
    raise RuntimeError("Missing database environment variables: " + ", ".join(missing))

  # Percent-encode the credentials: characters such as "@", ":" or "/" in a
  # password would otherwise be parsed as part of the URL structure.
  url = (
    "postgresql://"
    + quote_plus(settings["POSTGRES_USER"]) + ":" + quote_plus(settings["POSTGRES_PASSWORD"])
    + "@" + settings["POSTGRES_HOST"] + ":" + settings["POSTGRES_PORT"]
    + "/" + settings["POSTGRES_DB"]
  )
  return create_engine(url)


In [3]:
# read raws
def load_raws(path="./raws/sample01.txt", encoding="utf-8"):
  """Read a text file into a one-column DataFrame, one row per non-blank line.

  Args:
    path: File to read. Defaults to the sample file this notebook uses.
    encoding: Text encoding used to decode the file.

  Returns:
    A DataFrame whose single column is labelled ``0`` and holds the lines
    exactly as read (line endings included). Blank and whitespace-only lines
    are dropped; the remaining rows keep their original line positions as
    index labels, so the index can have gaps.
  """
  with open(path, "r", encoding=encoding) as f:
    lines = f.readlines()

  # Positions of lines that carry no text at all ("\n", "   \n", ...).
  blank_rows = [pos for pos, line in enumerate(lines) if not line.strip()]

  # Naming the column explicitly keeps column 0 present even for an empty
  # file, so callers can still do load_raws()[0].
  df = pd.DataFrame(lines, columns=[0])
  return df.drop(index=blank_rows)

In [14]:
# save to db
def save_to_db (df, table_name='chunk-o-texts'):
    """Write ``df`` to a database table, replacing the table if it already exists.

    Args:
        df: DataFrame to persist. Its index is not written.
        table_name: Target table name. Defaults to the table this notebook uses.

    Returns:
        The same ``df`` object that was passed in, so calls can be chained.
    """
    engine = get_db()
    try:
        df.to_sql(table_name, engine, if_exists='replace', index=False)
    finally:
        # get_db() builds a fresh engine on every call; dispose of it so its
        # pooled connections are closed instead of lingering per save.
        engine.dispose()
    return df

def save_to_file (df, path='./staging/chunk-o-texts.csv'):
    """Write ``df`` to a CSV file without its index.

    Args:
        df: DataFrame to persist.
        path: Destination file. Defaults to the staging file this notebook uses.

    Returns:
        The same ``df`` object that was passed in, so calls can be chained.
    """
    # to_csv does not create missing directories; make sure the target folder
    # exists so a fresh checkout does not fail on the first save.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)
    return df


In [15]:
# save
# Relabel the single column of raw lines as "text", then persist the frame to
# the database and to the staging CSV (each saver returns the frame it got).
df = load_raws().rename(columns={0: "text"})
save_to_file(save_to_db(df))


Unnamed: 0,text
0,"In a land of dragons and magic, the kingdom of..."
2,A group of brave warriors set out on a quest t...
4,As they drew closer to the source of the darkn...
6,"With peace restored, the kingdom of Lorem Ipsu..."


In [16]:
def embeddingFromOpenAi(word, model="text-embedding-ada-002"):
    """Request an embedding for ``word`` from the OpenAI embeddings API.

    Args:
        word: Value sent as ``input`` to ``openai.Embedding.create``.
        model: Embedding model name forwarded to the API.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data`` list.
    """
    response = openai.Embedding.create(model=model, input=word)
    first_entry = response['data'][0]
    return first_entry['embedding']

In [17]:
# read from file
# Reload the staged chunks from disk; this rebinds the notebook-level df.
df = pd.read_csv(filepath_or_buffer="./staging/chunk-o-texts.csv")

In [None]:
# add embeddings from openai
# One API request per row: embed every chunk of text and keep the vector
# alongside it in a new column.
df["openaiembeddings"] = df["text"].apply(embeddingFromOpenAi)

In [18]:
# Persist the frame to the database and then to the staging CSV; both savers
# return the frame they were given, so the cell still displays df.
save_to_file(save_to_db(df))

Unnamed: 0,text,openaiembeddings
0,"In a land of dragons and magic, the kingdom of...","[0.007870780304074287, -0.013575596734881401, ..."
2,A group of brave warriors set out on a quest t...,"[-0.0079183429479599, -0.03138423711061478, -0..."
4,As they drew closer to the source of the darkn...,"[0.011024783365428448, -0.02303827553987503, -..."
6,"With peace restored, the kingdom of Lorem Ipsu...","[0.0025072002317756414, -0.021801741793751717,..."


In [24]:
# add similarities
def add_similarities(df, compareToWordEmbeddings, sim_colname: str = "similarities", embedding_colname: str= "openaiembeddings"):
    """Score each row's embedding against a reference embedding and rank the rows.

    The score is the plain dot product of the two vectors (no normalisation is
    applied here).

    Args:
        df: DataFrame holding one embedding per row in ``embedding_colname``.
        compareToWordEmbeddings: Reference embedding to compare every row with.
        sim_colname: Name of the column that receives the scores.
        embedding_colname: Name of the column that holds the row embeddings.

    Returns:
        A copy of ``df`` sorted by ``sim_colname`` from highest to lowest.

    Side effects:
        ``df`` itself gains (or has overwritten) the ``sim_colname`` column.
    """
    import json

    def _as_vector(raw):
        # Embeddings that went through to_csv/read_csv come back as their text
        # form "[0.1, -0.2, ...]"; numpy.dot cannot work on that, so decode it.
        return json.loads(raw) if isinstance(raw, str) else raw

    reference = _as_vector(compareToWordEmbeddings)
    df[sim_colname] = df[embedding_colname].apply(lambda raw: numpy.dot(_as_vector(raw), reference))
    return df.sort_values(sim_colname, ascending=False)

In [25]:
# Rank the stored chunks against the single-word query "kingdom".
base = embeddingFromOpenAi(word="kingdom")
add_similarities(df=df, compareToWordEmbeddings=base)

Unnamed: 0,text,openaiembeddings,similarities
0,"In a land of dragons and magic, the kingdom of...","[0.007870780304074287, -0.013575596734881401, ...",0.812756
6,"With peace restored, the kingdom of Lorem Ipsu...","[0.0025072002317756414, -0.021801741793751717,...",0.805126
2,A group of brave warriors set out on a quest t...,"[-0.0079183429479599, -0.03138423711061478, -0...",0.804388
4,As they drew closer to the source of the darkn...,"[0.011024783365428448, -0.02303827553987503, -...",0.777184


In [26]:
# Rank the stored chunks against the single-word query "peace".
base = embeddingFromOpenAi(word="peace")
add_similarities(df=df, compareToWordEmbeddings=base)

Unnamed: 0,text,openaiembeddings,similarities
6,"With peace restored, the kingdom of Lorem Ipsu...","[0.0025072002317756414, -0.021801741793751717,...",0.821941
4,As they drew closer to the source of the darkn...,"[0.011024783365428448, -0.02303827553987503, -...",0.783918
2,A group of brave warriors set out on a quest t...,"[-0.0079183429479599, -0.03138423711061478, -0...",0.782836
0,"In a land of dragons and magic, the kingdom of...","[0.007870780304074287, -0.013575596734881401, ...",0.773935


In [27]:
# Rank the stored chunks against the two-word query "peaceful kingdom".
base = embeddingFromOpenAi(word="peaceful kingdom")
add_similarities(df=df, compareToWordEmbeddings=base)

Unnamed: 0,text,openaiembeddings,similarities
6,"With peace restored, the kingdom of Lorem Ipsu...","[0.0025072002317756414, -0.021801741793751717,...",0.841804
0,"In a land of dragons and magic, the kingdom of...","[0.007870780304074287, -0.013575596734881401, ...",0.813046
2,A group of brave warriors set out on a quest t...,"[-0.0079183429479599, -0.03138423711061478, -0...",0.79703
4,As they drew closer to the source of the darkn...,"[0.011024783365428448, -0.02303827553987503, -...",0.772863


In [28]:
# Rank the stored chunks against a full-phrase query.
base = embeddingFromOpenAi(word="lost the battle, win the war")
add_similarities(df=df, compareToWordEmbeddings=base)

Unnamed: 0,text,openaiembeddings,similarities
4,As they drew closer to the source of the darkn...,"[0.011024783365428448, -0.02303827553987503, -...",0.831405
2,A group of brave warriors set out on a quest t...,"[-0.0079183429479599, -0.03138423711061478, -0...",0.804595
6,"With peace restored, the kingdom of Lorem Ipsu...","[0.0025072002317756414, -0.021801741793751717,...",0.79929
0,"In a land of dragons and magic, the kingdom of...","[0.007870780304074287, -0.013575596734881401, ...",0.759536
