In [161]:
import os
import pandas as pd
from sqlalchemy import create_engine
import openai
import numpy

import hashlib

%load_ext dotenv
%dotenv
%reload_ext dotenv

# API key for the OpenAI client; stays None when the variable is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Non-secret connection settings. Each entry falls back to "" when the
# corresponding environment variable is missing.
env_config = {
    "db": {
        setting: os.getenv(variable, "")
        for setting, variable in (
            ("user", "POSTGRES_USER"),
            ("dbname", "POSTGRES_DB"),
            ("host", "POSTGRES_HOST"),
            ("port", "POSTGRES_PORT"),
        )
    }
}

# The password lives in its own dict so that displaying env_config below
# does not print it.
secret = {"db": {"password": os.getenv("POSTGRES_PASSWORD", "")}}

env_config


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


{'db': {'user': 'awesome',
  'dbname': 'qgenforthelazies-dev',
  'host': 'localhost',
  'port': '5432'}}

In [162]:
# connect to db
def get_db():
  """Create a SQLAlchemy engine for the configured PostgreSQL database.

  User, host, port and database name are read from the module-level
  ``env_config["db"]``; the password is read from ``secret["db"]["password"]``.

  Returns:
    The object returned by ``create_engine`` for the assembled
    ``postgresql://user:password@host:port/dbname`` URL.
  """
  # Function-scope import so the cell does not depend on a new top-level import.
  from urllib.parse import quote

  conf = env_config["db"]
  # Percent-encode the credentials: characters such as "@", ":", "/" or "#"
  # would otherwise be read as URL delimiters and corrupt the connection URL.
  user = quote(conf["user"], safe="")
  password = quote(secret["db"]["password"], safe="")
  url = "postgresql://{}:{}@{}:{}/{}".format(
    user, password, conf["host"], conf["port"], conf["dbname"]
  )
  return create_engine(url)


In [163]:
# Raws, materials
# One row per raw text file: (group, version, file).
raws = pd.DataFrame(
  [
      ("sample-fantasy01", "01", "./raws/sample01.txt"),
  ],
  columns=["group", "version", "file"],
)

raws

Unnamed: 0,group,version,file
0,sample-fantasy01,1,./raws/sample01.txt


In [164]:
# read raws
def load_raws (filename, encoding="utf-8"):
  """Read a text file into a one-column DataFrame, one row per non-blank line.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's default encoding.

  Returns:
    A DataFrame whose single column ``0`` holds the lines (line terminators
    kept). Rows equal to ``"\\n"`` are removed; the remaining rows keep their
    original line positions as index labels.
  """
  with open(filename, "r", encoding=encoding) as f:
      lines = f.readlines()
  # Naming the column explicitly keeps column 0 present for an empty file,
  # where a bare pd.DataFrame([]) would have no columns at all.
  df = pd.DataFrame(lines, columns=[0])
  return df[df[0] != "\n"]

def raws_to_df (filename, group, version):
  """Build the chunk table for one raw file.

  Args:
    filename: Path handed to ``load_raws``.
    group: Value repeated in the ``group`` column of every row.
    version: Value repeated in the ``version`` column of every row.

  Returns:
    A DataFrame with columns ``text``, ``group``, ``version`` and ``hash``,
    where ``hash`` is the hex MD5 digest of the UTF-8 encoded text.
  """
  def _digest(line):
    return hashlib.md5(line.encode('utf-8')).hexdigest()

  lines = load_raws(filename=filename)[0]
  chunks = pd.DataFrame({"text": lines, "group": group, "version": version})
  return chunks.assign(hash=chunks["text"].apply(_digest))

In [165]:
# save to db
def save_to_db (df):
    """Write ``df`` to the ``chunk-o-texts`` table, replacing any existing table.

    Args:
        df: Frame to persist; its index is not written.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    engine = get_db()
    try:
        df.to_sql('chunk-o-texts', engine, if_exists='replace', index=False)
    finally:
        # get_db() builds a fresh engine on every call; dispose it so its
        # connection pool does not stay open after the write.
        engine.dispose()
    return df

def save_to_file (df, path='./staging/chunk-o-texts.csv'):
    """Write ``df`` to a CSV file without its index.

    Args:
        df: Frame to write.
        path: Destination file. Defaults to the staging file used by the rest
            of this notebook.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    folder = os.path.dirname(path)
    if folder:
        # to_csv does not create missing directories; make sure the target
        # folder exists so a fresh checkout can run this cell.
        os.makedirs(folder, exist_ok=True)
    df.to_csv(path, index=False)
    return df


In [166]:
# save
# Take the first registered raw file, turn it into the chunk table, then
# persist it to the database and to the staging CSV.
group = raws.loc[0]
filename = group["file"]
df = raws_to_df(filename, group["group"], group["version"])

# Both savers return the frame they were given, so they can be chained; the
# result of the last one is what the cell displays.
df.pipe(save_to_db).pipe(save_to_file)


Unnamed: 0,text,group,version,hash
0,"In a land of dragons and magic, the kingdom of...",sample-fantasy01,1,0621c17894597a66153ded7a5665aa89
2,A group of brave warriors set out on a quest t...,sample-fantasy01,1,41e3da99a58286a2be5605b30bebc665
4,As they drew closer to the source of the darkn...,sample-fantasy01,1,a8bc979797a4488b6a851dee6830b6e7
6,"With peace restored, the kingdom of Lorem Ipsu...",sample-fantasy01,1,5f8d9aeab7822156c69811430520616f


In [167]:
def embeddingFromOpenAi(word, model="text-embedding-ada-002"):
    """Request an embedding for ``word`` from the OpenAI embeddings endpoint.

    Args:
        word: Text sent as the ``input`` of the request.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``.
    """
    request = {"input": word, "model": model}
    first_item = openai.Embedding.create(**request)['data'][0]
    return first_item['embedding']

In [168]:
# read from file
df = pd.read_csv("./staging/chunk-o-texts.csv")
# add embeddings from openai
df["openaiembeddings"] = df.text.apply(lambda x: embeddingFromOpenAi(x))

In [169]:
# Persist the frame, now carrying embeddings, to the database and then to the
# staging CSV; save_to_db returns the frame it was given.
save_to_file(save_to_db(df))

Unnamed: 0,text,group,version,hash,openaiembeddings
0,"In a land of dragons and magic, the kingdom of...",sample-fantasy01,1,0621c17894597a66153ded7a5665aa89,"[0.007870780304074287, -0.013575596734881401, ..."
1,A group of brave warriors set out on a quest t...,sample-fantasy01,1,41e3da99a58286a2be5605b30bebc665,"[-0.0079183429479599, -0.03138423711061478, -0..."
2,As they drew closer to the source of the darkn...,sample-fantasy01,1,a8bc979797a4488b6a851dee6830b6e7,"[0.011024783365428448, -0.02303827553987503, -..."
3,"With peace restored, the kingdom of Lorem Ipsu...",sample-fantasy01,1,5f8d9aeab7822156c69811430520616f,"[0.0024935307446867228, -0.021801359951496124,..."


In [180]:
# add similarities
def add_similarities(df, compareToWordEmbeddings, sim_colname: str = "similarities", embedding_colname: str= "openaiembeddings"):
    """Score every row against a reference embedding and rank the rows.

    Args:
        df: Frame holding one embedding vector per row in ``embedding_colname``.
        compareToWordEmbeddings: Reference vector each row is compared with.
        sim_colname: Name of the column that receives the scores.
        embedding_colname: Name of the column holding the row embeddings.

    Returns:
        A new DataFrame with the score column added, sorted by score in
        descending order. The input ``df`` is left unmodified.
    """
    # The score is a plain dot product. It equals cosine similarity only if
    # both vectors are unit length - presumably true for these embeddings,
    # TODO confirm against the embedding provider's documentation.
    scores = df[embedding_colname].apply(lambda x: numpy.dot(x, compareToWordEmbeddings))
    # assign() builds a copy, so repeated queries do not keep adding or
    # overwriting columns on the caller's frame.
    return df.assign(**{sim_colname: scores}).sort_values(sim_colname, ascending=False)

In [181]:

def compare(word, isPrintSimSum=False):
  """Print the stored chunk that scores highest against ``word``.

  Embeds ``word``, ranks the rows of the module-level ``df`` with
  ``add_similarities`` and prints the query, the top-ranked row and that
  row's text.

  Args:
    word: Query text to embed and compare against the stored chunks.
    isPrintSimSum: When true, also print the whole ranked frame.
  """
  query_vector = embeddingFromOpenAi(word)
  ranked = add_similarities(df, query_vector)
  print("base: " + word)
  best = ranked.iloc[0]
  print(best)
  print(best["text"])
  if not isPrintSimSum:
    return
  print(ranked)




In [182]:
# Single-word query; in the recorded run the opening paragraph (row 0) ranks first.
compare("kingdom")

base: kingdom
text                In a land of dragons and magic, the kingdom of...
group                                                sample-fantasy01
version                                                             1
hash                                 0621c17894597a66153ded7a5665aa89
openaiembeddings    [0.007870780304074287, -0.013575596734881401, ...
similarities                                                 0.812759
Name: 0, dtype: object
In a land of dragons and magic, the kingdom of Lorem Ipsum stood tall. The ruler, King Ipsum, was a just and fair leader, beloved by his people. But an ancient evil had awakened, and dark forces threatened to destroy the kingdom.



In [183]:
# Adjective query; in the recorded run the "peace restored" paragraph (row 3) ranks first.
compare("peaceful")

base: peaceful
text                With peace restored, the kingdom of Lorem Ipsu...
group                                                sample-fantasy01
version                                                             1
hash                                 5f8d9aeab7822156c69811430520616f
openaiembeddings    [0.0024935307446867228, -0.021801359951496124,...
similarities                                                 0.811483
Name: 3, dtype: object
With peace restored, the kingdom of Lorem Ipsum flourished once more. And the warriors were hailed as heroes, their names forever etched in the annals of history.


In [184]:
# Idiom-style query; in the recorded run the final-showdown paragraph (row 2) ranks first.
compare("lost the battle, win the war")

base: lost the battle, win the war
text                As they drew closer to the source of the darkn...
group                                                sample-fantasy01
version                                                             1
hash                                 a8bc979797a4488b6a851dee6830b6e7
openaiembeddings    [0.011024783365428448, -0.02303827553987503, -...
similarities                                                 0.831405
Name: 2, dtype: object
As they drew closer to the source of the darkness, they realized that they were not just fighting for Lorem Ipsum, but for the fate of the entire world. The final showdown was epic and legendary, with the warriors emerging victorious against overwhelming odds.



In [185]:
# Question-form query; in the recorded run the paragraph naming the ruler (row 0) ranks first.
# NOTE(review): "th" looks like a typo for "the" - confirm whether it is intentional
# before changing it, since the recorded output echoes the query verbatim.
compare("Who is th ruler?")

base: Who is th ruler?
text                In a land of dragons and magic, the kingdom of...
group                                                sample-fantasy01
version                                                             1
hash                                 0621c17894597a66153ded7a5665aa89
openaiembeddings    [0.007870780304074287, -0.013575596734881401, ...
similarities                                                 0.803877
Name: 0, dtype: object
In a land of dragons and magic, the kingdom of Lorem Ipsum stood tall. The ruler, King Ipsum, was a just and fair leader, beloved by his people. But an ancient evil had awakened, and dark forces threatened to destroy the kingdom.



In [186]:
# isPrintSimSum=True additionally prints the whole ranked frame. In the recorded run
# the ranking starts with rows 2, 1, 3, so the closing paragraph (row 3) is not on top.
compare("What is the conclusion of the story?", isPrintSimSum=True)

base: What is the conclusion of the story?
text                As they drew closer to the source of the darkn...
group                                                sample-fantasy01
version                                                             1
hash                                 a8bc979797a4488b6a851dee6830b6e7
openaiembeddings    [0.011024783365428448, -0.02303827553987503, -...
similarities                                                 0.811108
Name: 2, dtype: object
As they drew closer to the source of the darkness, they realized that they were not just fighting for Lorem Ipsum, but for the fate of the entire world. The final showdown was epic and legendary, with the warriors emerging victorious against overwhelming odds.

                                                text             group  \
2  As they drew closer to the source of the darkn...  sample-fantasy01   
1  A group of brave warriors set out on a quest t...  sample-fantasy01   
3  With peace restored, the ki

In [187]:
# Question about content stated in one paragraph; in the recorded run that paragraph (row 2) ranks first.
compare("Who are they fighting for?")

base: Who are they fighting for?
text                As they drew closer to the source of the darkn...
group                                                sample-fantasy01
version                                                             1
hash                                 a8bc979797a4488b6a851dee6830b6e7
openaiembeddings    [0.011024783365428448, -0.02303827553987503, -...
similarities                                                 0.803295
Name: 2, dtype: object
As they drew closer to the source of the darkness, they realized that they were not just fighting for Lorem Ipsum, but for the fate of the entire world. The final showdown was epic and legendary, with the warriors emerging victorious against overwhelming odds.



In [188]:
# Query with no literal match in the text; the recorded top score (0.756501) is the
# lowest of all recorded queries, and row 0 ranks first.
compare("The leader of the party")

base: The leader of the party
text                In a land of dragons and magic, the kingdom of...
group                                                sample-fantasy01
version                                                             1
hash                                 0621c17894597a66153ded7a5665aa89
openaiembeddings    [0.007870780304074287, -0.013575596734881401, ...
similarities                                                 0.756501
Name: 0, dtype: object
In a land of dragons and magic, the kingdom of Lorem Ipsum stood tall. The ruler, King Ipsum, was a just and fair leader, beloved by his people. But an ancient evil had awakened, and dark forces threatened to destroy the kingdom.

