In [8]:
# extract text from a pdf file
import PyPDF2

def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages in page order,
        with no separator inserted between pages.
    """
    # The context manager closes the file handle even if parsing raises.
    with open(pdf_file, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        # `or ''` keeps the join safe should a page yield a falsy result
        # (e.g. None) instead of a string.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

# Source PDF, given as a relative path.
pdf_file = '../data/Arsenal_FC.pdf'

# Full text of the document; the chunking cell slices this string.
text = extract_text_from_pdf(pdf_file)

In [9]:
# Split the text into fixed-size character windows: each chunk holds up to
# CHUNK_SIZE characters and shares CHUNK_OVERLAP characters with the next one.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Consecutive chunk starts are CHUNK_SIZE - CHUNK_OVERLAP (800) characters apart.
chunks = [
    text[start:start + CHUNK_SIZE]
    for start in range(0, len(text), CHUNK_SIZE - CHUNK_OVERLAP)
]
# TODO: output the chunks to a txt file


In [10]:
# import openai
from openai import OpenAI

# Module-level API client; make_embedding reads this global.
client = OpenAI()

def make_embedding(text, model="text-embedding-3-small"):
    """Embed a single piece of text through the module-level ``client``.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model)
    return result.data[0].embedding

# Embed every chunk (one make_embedding call per chunk). The name is bound
# only after all calls have succeeded, so an interrupted or failed run cannot
# leave a half-filled list that no longer lines up with `chunks`.
embeddings = [make_embedding(chunk) for chunk in chunks]

KeyboardInterrupt: 

In [None]:
# create a vector database for the embeddings
# create a function to search for similar text
import sqlite3
# import tensorflow as tf
import numpy as np
import pickle

def create_database(database_name):
    """Ensure the ``embeddings`` table exists in a SQLite database.

    The table has an integer primary key ``id``, a ``text`` column and an
    ``embedding`` BLOB column. Calling this on a database that already has
    the table leaves it unchanged.

    Args:
        database_name: Path of the SQLite database file to open.
    """
    conn = sqlite3.connect(database_name)
    try:
        # IF NOT EXISTS makes the call idempotent without a separate lookup
        # in sqlite_master.
        conn.execute('''CREATE TABLE IF NOT EXISTS embeddings
                        (id INTEGER PRIMARY KEY,
                        text TEXT,
                        embedding BLOB)''')
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()

def insert_embedding(database_name, text, embedding):
    """Append one (text, embedding) row to the ``embeddings`` table.

    Args:
        database_name: Path of the SQLite database file to open.
        text: The chunk text to store.
        embedding: The embedding object; it is serialized with ``pickle``
            and stored in the ``embedding`` column.
    """
    conn = sqlite3.connect(database_name)
    try:
        conn.execute(
            "INSERT INTO embeddings (text, embedding) VALUES (?, ?)",
            (text, pickle.dumps(embedding)),
        )
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()
    


database_name = 'arsenal_embedding_database.db'
create_database(database_name)

# zip() stops at the shorter input, so a length mismatch would silently drop
# chunks; fail loudly instead.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"chunks/embeddings length mismatch: {len(chunks)} != {len(embeddings)}"
    )

# NOTE: every run of this cell appends new rows; running it again against the
# same database file stores each chunk a second time.
for chunk, embedding in zip(chunks, embeddings):
    insert_embedding(database_name, chunk, embedding)
    


In [None]:
def get_embedding(database_name, text):
    """Look up the stored embedding blob for an exact chunk text.

    Args:
        database_name: Path of the SQLite database file to open.
        text: Chunk text to match exactly against the ``text`` column.

    Returns:
        The raw value of the ``embedding`` column for the first matching
        row (still serialized), or ``None`` when no row matches.
    """
    conn = sqlite3.connect(database_name)
    try:
        row = conn.execute(
            "SELECT embedding FROM embeddings WHERE text=?", (text,)
        ).fetchone()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return row[0] if row else None
    
# Round-trip check on the first chunk only: fetch its stored blob,
# deserialize it, and print the recovered embedding.
for chunk in chunks[:1]:
    stored_blob = get_embedding(database_name, chunk)
    embedding = pickle.loads(stored_blob)
    print(embedding)

[-0.02189594693481922, 0.0025608628056943417, 0.03237761929631233, -0.01289428025484085, 0.02072005346417427, 0.008297145366668701, -0.07331087440252304, -0.019017035141587257, -0.023984171450138092, 0.05356397479772568, -0.031080082058906555, -0.02226088009774685, -0.011809620074927807, -0.03787187859416008, 0.018733199685811996, 0.011485235765576363, 0.0023137731477618217, 0.046995192766189575, -0.056077953428030014, -0.07870376855134964, 0.05465877428650856, -0.01064386311918497, -0.00744056748226285, -0.04468395188450813, 0.045008335262537, 0.025950752198696136, -0.012701677158474922, -0.03756776824593544, -0.05437493696808815, -0.0022415469866245985, 0.011860305443406105, -0.03302638605237007, -0.004820149391889572, 0.030188024044036865, -0.016209082677960396, -0.03284392133355141, 0.0445217601954937, 0.004191654734313488, -0.008058925159275532, -0.02222033217549324, 0.011089892126619816, 0.015215655788779259, -0.055429186671972275, -0.03335077315568924, -0.05551028251647949, -0.0

In [None]:
# Sample question used to exercise retrieval and the chat call.
test_query = "tell me about arsenal football club."
# Embedding of the question, produced by make_embedding like the stored chunks.
test_embedding = make_embedding(test_query)

def search_similar_text(database_name, query_embedding, num_results=5):
    """Rank every stored chunk by cosine similarity to a query embedding.

    All rows of the ``embeddings`` table are loaded and scored in Python.

    Args:
        database_name: Path of the SQLite database file to open.
        query_embedding: Vector to compare against the stored embeddings.
        num_results: Maximum number of matches to return.

    Returns:
        A list of ``(text, embedding, similarity)`` tuples sorted by
        descending similarity, at most ``num_results`` long. A pair in which
        either vector has zero norm is scored 0.0.
    """
    conn = sqlite3.connect(database_name)
    try:
        rows = conn.execute("SELECT text, embedding FROM embeddings").fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # The query norm does not change between rows, so compute it once.
    query_norm = np.linalg.norm(query_embedding)

    similarities = []
    for text, blob in rows:
        # NOTE(security): pickle.loads can execute arbitrary code; only point
        # this at a database file you created yourself.
        embedding = pickle.loads(blob)
        denom = query_norm * np.linalg.norm(embedding)
        # A zero denominator would produce NaN, which breaks the ordering of
        # the sort below; score such pairs as 0.0 instead.
        similarity = np.dot(query_embedding, embedding) / denom if denom else 0.0
        similarities.append((text, embedding, similarity))

    similarities.sort(key=lambda item: item[2], reverse=True)
    return similarities[:num_results]
    

# Keep only the chunk text of each of the closest matches (the function's
# default result count); the embedding and the score are dropped.
context = [match[0] for match in search_similar_text(database_name, test_embedding)]

In [None]:
# call openai's llm api to get the answer
# support the model using the context just generated

def get_response(client, system_content="", assistant_content="", user_content="", model="gpt-3.5-turbo"):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        system_content: Text of the system message; omitted when empty.
        assistant_content: Text of the assistant message; omitted when empty.
        user_content: Text of the user message; always sent.
        model: Name of the chat model to request.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # Only include optional messages that carry content, so the defaults do
    # not add empty system/assistant turns to the conversation.
    messages = []
    if system_content:
        messages.append({"role": "system", "content": system_content})
    if assistant_content:
        messages.append({"role": "assistant", "content": assistant_content})
    messages.append({"role": "user", "content": user_content})

    return client.chat.completions.create(model=model, messages=messages)

client = OpenAI()
response = get_response(
    client,
    system_content="Answer the query using the context provided. Be succinct.",
    # Pass the retrieved chunks along with the question: the system prompt
    # tells the model to answer from the provided context, so it has to be
    # part of the request. Chunks are separated by a blank line.
    assistant_content="\n\n".join(context),
    user_content=test_query
)

In [None]:
# Pull the answer text out of the chat completion. The isinstance guard keeps
# the cell safe to re-run: after the first run `response` is already a string
# and no longer has a `.choices` attribute.
if not isinstance(response, str):
    response = response.choices[0].message.content
response

'Arsenal Football Club is a professional football club based in London, England. They compete in the Premier League, one of the top football leagues in the world. Arsenal is known for their successful history, iconic red and white kits, and their home stadium, the Emirates Stadium.'