In [None]:
!pip install -qU openai pinecone-client datasets
!pip install --upgrade tiktoken
!pip install wikipedia-api

In [2]:
import sqlite3
import os
import json
import os

class SQLiteHandler:
    """Thin wrapper around a SQLite database that stores text chunks in a ``brain`` table.

    Constructing the handler opens the database and creates the table when it is
    missing. Every row has an auto-incremented ``id``, a ``message`` text column
    and an ``entities`` text column.
    """

    def __init__(self, db_name):
        """Open the database at ``db_name`` and make sure the table exists.

        Args:
            db_name: Database path handed to ``sqlite3.connect``.
        """
        self.db_name = db_name
        self.db_table_brain = 'brain'
        self.conn = None
        self.cursor = None
        self.create_connection_and_table()

    def create_connection_and_table(self):
        """Connect to the database and create the ``brain`` table if it is absent."""
        # sqlite3.connect() creates the file when it does not exist yet, so a new
        # and an existing database can share one code path.
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        if not self.check_table_exists(self.db_table_brain):
            self.create_table()
        #self.conn.set_trace_callback(print)

    def check_table_exists(self, table_name):
        """Return True when a table named ``table_name`` exists in the database."""
        self.cursor = self.conn.cursor()
        # The name is bound as a parameter instead of being formatted into the SQL text.
        self.cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name,),
        )
        return self.cursor.fetchone() is not None

    def create_table(self):
        """Create the ``brain`` table and commit; raises if the table already exists."""
        query = f"""CREATE TABLE {self.db_table_brain} (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        message TEXT,
                        entities TEXT
                    );"""
        self.cursor = self.conn.cursor()
        self.cursor.execute(query)
        self.conn.commit()

    def insert_data(self, table_name, data):
        """Insert one row into ``table_name`` and commit.

        Args:
            table_name: Target table. It is interpolated into the SQL text
                (identifiers cannot be bound as parameters), so it must come
                from trusted code.
            data: Mapping of column name to value. ``list``, ``tuple`` and
                ``dict`` values are stored as JSON text; every other value is
                bound as given. Column names are interpolated like the table name.
        """
        row = {
            column: json.dumps(value) if isinstance(value, (list, tuple, dict)) else value
            for column, value in data.items()
        }
        columns = ', '.join(row)
        placeholders = ', '.join('?' for _ in row)
        query = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
        self.cursor.execute(query, list(row.values()))
        self.conn.commit()

    def close_connection(self):
        """Close the underlying database connection."""
        self.conn.close()

    def _print_rows(self, query, params=()):
        """Execute ``query`` and print every result row (a tuple) on its own line."""
        self.cursor.execute(query, params)
        for row in self.cursor.fetchall():
            print(f"{row}")

    def print_match(self, search_for):
        """Print the ``message`` of every row whose message contains ``search_for``.

        The match uses SQL ``LIKE``, so ``%`` and ``_`` inside ``search_for``
        act as wildcards.
        """
        self._print_rows(
            f"SELECT message FROM {self.db_table_brain} "
            "WHERE message LIKE '%' || ? || '%' ORDER BY id",
            (search_for,),
        )

    def get_id(self):
        """Return the value of ``last_insert_rowid()`` for this connection."""
        self.cursor.execute("SELECT last_insert_rowid()")
        return self.cursor.fetchone()[0]

    def print_select(self, select_txt):
        """Execute the SQL statement ``select_txt`` verbatim and print its rows.

        The statement is run as given, so it must come from trusted code.
        """
        self._print_rows(select_txt)

    def print_data(self):
        """Print the ``message`` of every stored row, ordered by ``id``."""
        # The table has no `body` column; the chunk text lives in `message`.
        self._print_rows(f"SELECT message FROM {self.db_table_brain} ORDER BY id")

In [None]:
# Initialize the local SQLite database (LTSM) used to store notes.
# The file lives in the current working directory; constructing SQLiteHandler
# opens it and creates the `brain` table when it is missing.
DATABASE = os.getcwd() + "/brain02.db"
print(DATABASE)
handler = SQLiteHandler(DATABASE)
# Set to True to fetch the pages and index them; the loading cell is guarded by this flag.
LOAD = True

In [None]:
import os, openai, tiktoken
import pinecone

# Credentials come from the environment; os.environ.get yields None when a variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
picone_api_key = os.environ.get("PINECONE_API_KEY")
# Environment name passed to pinecone.init together with the API key.
picone_env = "us-west1-gcp-free"

# Chat model identifiers; MODEL3 is the default model of num_tokens_from_messages.
MODEL3 = "gpt-3.5-turbo"
MODEL4 = "gpt-4"

In [5]:
# scrape some text data (Wikipedia scraper), part 1 of 2
import wikipediaapi

def get_page_text(page_title, language):
    """Fetch the text of a Wikipedia page.

    Args:
        page_title: Title of the page to look up.
        language: Value passed to ``wikipediaapi.Wikipedia`` to select the wiki.

    Returns:
        The page's ``text`` when the page exists, otherwise ``None``.
    """
    page = wikipediaapi.Wikipedia(language).page(page_title)
    return page.text if page.exists() else None
    


In [7]:
# Prepare some OpenAI helper functions for later use

# chat completion
def chat_completion(model, system_text, user_text):
    """Send a system prompt plus one user message and return the reply text.

    Args:
        model: Chat model name forwarded to ``openai.ChatCompletion.create``.
        system_text: Content of the ``system`` message.
        user_text: Content of the ``user`` message.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    conversation = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    reply = openai.ChatCompletion.create(model=model, messages=conversation)
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"]
# count the tokens in a plain string
def num_tokens_from_string(string: str, model: str = "gpt-3.5-turbo-0301") -> int:
    """Return how many tokens ``string`` encodes to under ``model``'s tiktoken encoding."""
    tokenizer = tiktoken.encoding_for_model(model)
    return len(tokenizer.encode(string))

# count the tokens of a chat conversation given in OpenAI message format
def num_tokens_from_messages(messages, model=MODEL3):
    """Return the number of tokens used by a list of chat messages.

    Args:
        messages: Iterable of message dicts whose values are strings.
        model: Model name. ``gpt-3.5-turbo`` and ``gpt-4`` are counted as their
            ``-0301`` / ``-0314`` snapshots after printing a warning.

    Raises:
        NotImplementedError: For any model other than the two names above and
            their two snapshots.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Floating model names are redirected to a fixed snapshot: (snapshot, warning).
    snapshot_redirects = {
        "gpt-3.5-turbo": (
            "gpt-3.5-turbo-0301",
            "Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.",
        ),
        "gpt-4": (
            "gpt-4-0314",
            "Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.",
        ),
    }
    if model in snapshot_redirects:
        snapshot, warning = snapshot_redirects[model]
        print(warning)
        return num_tokens_from_messages(messages, model=snapshot)

    # Per-snapshot overheads: (tokens_per_message, tokens_per_name).
    overheads = {
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }
    if model not in overheads:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    tokens_per_message, tokens_per_name = overheads[model]

    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [None]:
# let's check how it works for a simple string and wiki page for Warszawa
#print(num_tokens_from_string("testing openai API", MODEL3))

In [18]:
# create an embedding
def get_embeddings(input):
    """Request ``text-embedding-ada-002`` embeddings for ``input``.

    Args:
        input: Value forwarded unchanged as the ``input`` argument of
            ``openai.Embedding.create``.

    Returns:
        The response of ``openai.Embedding.create`` as is; callers in this
        notebook read the vectors from ``response['data'][i]['embedding']``.
    """
    return openai.Embedding.create(input=input, engine="text-embedding-ada-002")

In [None]:
# Prepare the Pinecone connection using the API key and environment configured above.
pinecone.init(api_key=picone_api_key, environment=picone_env)
index_name="qa"
# The describe_index result is only printed for inspection.
# NOTE(review): presumably this fails when the "qa" index does not exist yet — confirm.
index_lst = pinecone.describe_index(index_name) #GRPCIndex(index_name)
print(index_lst)
# Index handle used by the later cells for upsert and query calls.
index = pinecone.Index(index_name)

In [10]:
# try to split the text into smaller chunks
# https://python.langchain.com/en/latest/modules/indexes/text_splitters/getting_started.html
from langchain.text_splitter import RecursiveCharacterTextSplitter
# length_function is the token counter defined above (with its default model), so
# chunk_size and chunk_overlap are presumably measured in tokens rather than
# characters — confirm against the langchain text splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap  = 100,
    length_function = num_tokens_from_string,
)


In [None]:
if LOAD:
    # Scrape the text data (part 2 of 2), split it into chunks and index every chunk.
    language = "pl"
    for page_titel in ["Mikołaj_Kopernik"]:  # Kraków Warszawa Mikołaj_Kopernik
        page_txt = get_page_text(page_titel, language)
        if not page_txt:
            print(f"Page {page_titel} does not exist.")
            continue

        print(f"Processing {page_titel} page...")
        texts = text_splitter.create_documents([page_txt])
        for chunk in texts:
            # Store the chunk in the local brain first: the id of the row inserted
            # here is read back below and becomes the Pinecone vector id.
            handler.insert_data(handler.db_table_brain, {"message": chunk.page_content})
            handler.conn.commit()
            # Extract the embedding vector of the single chunk that was sent.
            embeds = [record['embedding'] for record in (get_embeddings([chunk.page_content]))['data']][0]
            chunk_id = str(handler.get_id())
            to_upsert = (chunk_id, embeds, {})
            upsert_response = index.upsert(vectors=[to_upsert])
# The handler is closed whether or not anything was loaded.
handler.close_connection()

In [45]:
# Test queries: embed each text, fetch the 3 nearest vectors and print their ids and scores.
for query_txt in (
    "Pierwszymi osadami powstałymi w obecnych granicach administracyjnych Warszawy były",
    "Jakie osady powstały w granicach Warszawy?",
    "XXXX",
):
    embeds = [record['embedding'] for record in (get_embeddings([query_txt]))['data']][0]
    query_response = index.query(
        vector=embeds,
        top_k=3,
        include_values=True,
        include_metadata=False,
    )["matches"]
    print(f"text: {query_txt}")
    for item in query_response:
        print(f"id: {item.id}, score: {item.score}")


text: Pierwszymi osadami powstałymi w obecnych granicach administracyjnych Warszawy były
id: 14, score: 0.869876146
id: 16, score: 0.86975044
id: 23, score: 0.864600122
text: Jakie osady powstały w granicach Warszawy?
id: 16, score: 0.878372133
id: 14, score: 0.875454962
id: 23, score: 0.870916069
text: XXXX
id: 96, score: 0.746123075
id: 19, score: 0.745900571
id: 116, score: 0.745182812
