## Alan kay q&a data: CSV -> TXT

In [72]:
import pandas as pd

df = pd.read_csv('./data/quora_q&a_alan_kay.csv',sep=',')
df

Unnamed: 0,questions,answers
0,Computer scientist Edsger W. Dijkstra said tha...,I’ll start by saying that I don’t clearly unde...
1,Was one byte ever less than eight bits in the ...,When I started “programming as a job” in the U...
2,Is programming learned inductively or deductiv...,"""I don’t feel I completely understand this qu..."
3,What are your thoughts on a future where code ...,"First, it’s worth noting that the “syntax reco..."
4,Why did Japan’s Fifth Generation Computer Syst...,The answer by Marcus Triska sums up much of th...
...,...,...
686,What were Sophocles' contributions to theatric...,I thought the wikipedia article on Sophocles w...
687,Is a microservices architecture with RESTful A...,I was asked to write a history of Smalltalk fo...
688,Will the code on p.13 of the LISP 1.5 Programm...,How about Biology in a Computer Science class?...
689,Who was specifically responsible for the inven...,"Mr Rao has a good slant on this, in particular..."


In [73]:
with open("./data/alan_kay_knowledge/quora_q&a_alan_kay.txt", "w") as f:
    for index,row in df.iterrows():
        f.write(f"Interview's Question: { row['questions']}\n")
        f.write(f"Alan's kay Answer: {row['answers']}\n\n\n")

## Create Embeddings

In [None]:
import os
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions

load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-3-small",
)
chroma_client = chromadb.PersistentClient(path="./data/chroma_persistent_storage")
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name,embedding_function=openai_ef
)
client = OpenAI(api_key=openai_key)

# load documents from directory
def load_documents_from_directory(directory_path):
    # print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(os.path.join(directory_path,filename), "r", encoding="utf-8") as file:
                documents.append({"id":filename,"text":file.read()})
    return documents

def split_text(text,chunk_size=1000,chunk_overlap=20):
    chunks = []
    start = 0
    while start <len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

def get_openai_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-small")
    embedding = response.data[0].embedding
    print("==== Generating embeddings... ====")
    return embedding

def split_text_and_generate_embeddings():
    directory_path = "./data/alan_kay_knowledge/"
    documents = load_documents_from_directory(directory_path)
    print(f"loaded {len(documents)} files")

    chunked_documents = []
    for doc in documents:
        chunks = split_text(doc['text'])
        # print(f"== Splitting docs into chunks ==")
        for i, chunk in enumerate(chunks):
            chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})
            
    print(len(chunked_documents))

    for doc in chunked_documents:
        print("==== Generating embeddings... ====")
        doc["embedding"] = get_openai_embedding(doc["text"])
    

    for doc in chunked_documents:
        print("==== inserting chunks into db;; ====")
        collection.upsert(ids=[doc["id"]], documents=[doc["text"]],embeddings=[doc['embedding']])


In [75]:
# split_text_and_generate_embeddings()

loaded 1 files
276
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
=

## Try the API

In [None]:
import requests

question = input("Ask Alan Kay a question: ")

API_URL = "http://127.0.0.1:8000/ask/alan_kay"
def API_response(API_URL,question):
    try:
        response = requests.post(API_URL, json={"question": question})
        if response.status_code == 200:
            response = response.json()["answer"]
        else:
            response = "I'm sorry, but I couldn't process your request."
    except requests.exceptions.RequestException:
        response = "Error: Unable to reach the backend API."
    return response

response = API_response(API_URL,question)
print(response)

## Characters table 

In [None]:
import sqlite3

conn = sqlite3.connect('characters.db')
c = conn.cursor()

# Drop the existing characters table
c.execute('DROP TABLE IF EXISTS characters')

# Create a new characters table
c.execute('''
CREATE TABLE characters (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    prompt TEXT,
    text_data TEXT,
    image BLOB,
    description TEXT
)
''')

conn.commit()
conn.close()