In [22]:
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity

import sqlite3
import pickle
from openai import OpenAI
import numpy as np

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [9]:
def extract_text(pdf_files):
    """
    Function to extract the text from one or more PDF files

    Args:
        pdf_files (iterable): The PDF files to extract the text from. Each
            item is handed to ``PdfReader`` unchanged.

    Returns:
        text (str): The text of every page of every file, concatenated in
            iteration order with no separator. An empty iterable yields "".
    """

    # Collect the per-page text and join once at the end instead of growing a
    # string with repeated "+=".
    page_texts = []

    # Iterate over the documents
    for pdf_file in pdf_files:

        # Read the PDF file
        pdf_reader = PdfReader(pdf_file)

        for page in pdf_reader.pages:
            # Defensive: treat a falsy result (None or "") as "no text" so a
            # page without extractable text cannot raise TypeError during
            # concatenation.
            page_texts.append(page.extract_text() or "")

    return "".join(page_texts)

In [10]:
def get_chunks(text):
    """
    Split the raw text into overlapping chunks

    Args:
        text (str): The raw text to split

    Returns:
        chunks (list): Whatever ``CharacterTextSplitter.split_text`` returns
            for ``text`` with the settings below
    """

    # Splitter settings: break on new lines, target 1000 characters per chunk,
    # let consecutive chunks overlap by 200 characters, and measure length
    # with the built-in len.
    splitter_settings = dict(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )

    return CharacterTextSplitter(**splitter_settings).split_text(text)

In [11]:
def make_embeddings(client, chunks):
    '''
    Build one embedding per chunk of text using the client's embeddings API.

    Each chunk has its newlines replaced by spaces and is sent in its own
    request with the "text-embedding-3-small" model. The result is a list with
    one embedding per chunk, in the same order as ``chunks``.
    '''
    model_name = "text-embedding-3-small"

    def _embed(piece):
        # Newlines are flattened to spaces before the text is sent.
        flattened = piece.replace("\n", " ")
        response = client.embeddings.create(input=[flattened], model=model_name)
        return response.data[0].embedding

    return [_embed(piece) for piece in chunks]

In [12]:
def create_database(database_name):
    '''
    This function creates the database table used to store the embeddings.
    Columns: id, text, embedding

    The call is idempotent: if the "embeddings" table already exists it is
    left untouched.
    '''
    conn = sqlite3.connect(database_name)
    try:
        # IF NOT EXISTS replaces the separate sqlite_master lookup, so the
        # existence check and the creation happen in a single statement.
        conn.execute('''CREATE TABLE IF NOT EXISTS embeddings
                     (id INTEGER PRIMARY KEY,
                     text TEXT,
                     embedding BLOB)''')
        conn.commit()
    finally:
        # Always release the connection, even if the statement above raised.
        conn.close()

In [None]:
def insert_embedding(database_name, text, embedding):
    '''
    This function inserts the text and its embedding into the database.

    The embedding is serialized with pickle and stored in the "embedding"
    column of the "embeddings" table, which must already exist.
    '''
    # Serialize before opening the connection so a pickling failure cannot
    # leave a connection open.
    embedding_serialized = pickle.dumps(embedding)

    conn = sqlite3.connect(database_name)
    try:
        conn.execute(
            "INSERT INTO embeddings (text, embedding) VALUES (?, ?)",
            (text, embedding_serialized),
        )
        conn.commit()
    finally:
        # Always release the connection, even if the insert or commit raised.
        conn.close()

In [None]:
def search_similar_text(database_name, query_embedding, num_results=5):
    '''
    This function performs a brute-force cosine-similarity search of the query
    embedding against every embedding stored in the database.

    Args:
        database_name: path of the SQLite database holding the "embeddings" table.
        query_embedding: the query vector (anything np.array can turn into a
            flat numeric vector).
        num_results: maximum number of matches to return.

    Returns:
        A list of at most num_results tuples (text, embedding, similarity),
        sorted by descending similarity. Each embedding is a NumPy array
        reshaped to (1, -1).
    '''
    conn = sqlite3.connect(database_name)
    try:
        rows = conn.execute("SELECT text, embedding FROM embeddings").fetchall()
    finally:
        # Release the connection even if the query raised.
        conn.close()

    if not rows:
        return []

    # The query vector does not change between rows, so reshape it once.
    query_vector = np.array(query_embedding).reshape(1, -1)

    similarities = []
    for text, blob in rows:
        # NOTE(security): pickle.loads can execute arbitrary code; only use
        # this with database files written by insert_embedding / trusted code.
        embedding = np.array(pickle.loads(blob)).reshape(1, -1)

        # calculate the cosine similarity
        similarity = cosine_similarity(query_vector, embedding)[0][0]
        similarities.append((text, embedding, similarity))

    # Most similar first; keep only the requested number of matches.
    similarities.sort(key=lambda item: item[2], reverse=True)
    return similarities[:num_results]

In [None]:
def get_response(client, system_content="", assistant_content="", user_content="", model="gpt-3.5-turbo"):
    '''
    Request a streamed chat completion from the client.

    Three messages are sent, in this order: system, assistant, user. Each one
    carries the corresponding *_content argument. The request is made with
    stream=True and the client's return value is handed back unchanged.
    '''
    roles_and_contents = (
        ("system", system_content),
        ("assistant", assistant_content),
        ("user", user_content),
    )
    conversation = [
        {"role": role, "content": content}
        for role, content in roles_and_contents
    ]
    return client.chat.completions.create(
        model=model,
        messages=conversation,
        stream=True,
    )

In [None]:
def check_db_exists(database_name):
    '''
    This function reports whether the "embeddings" table exists in the database.

    Returns True when the table is present and False otherwise. Note that
    sqlite3.connect creates an empty database file when none exists at
    database_name, so calling this on a missing path returns False and leaves
    a file behind.
    '''
    conn = sqlite3.connect(database_name)
    try:
        row = conn.execute(
            "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='embeddings'"
        ).fetchone()
        return row[0] != 0
    finally:
        # The connection was previously never closed on any path.
        conn.close()

In [17]:
# Driver cell: runs the pipeline up to the point where chunks are available
# and creates the API client. Nothing in this cell embeds the chunks or writes
# to the database yet.

# Presumably loads a .env file so OpenAI() below can find its API key in the
# environment — TODO confirm.
load_dotenv()

# NOTE(review): in the visible cells `fitz` is referenced only by the
# commented-out line below. If it is still needed, move this import to the
# imports cell at the top of the notebook.
import fitz

# Earlier experiment that opened the PDF with fitz, kept for reference:
# pdf_files = fitz.open("../data/data.pdf") 

# Extract the text of the source PDF (path is relative to the notebook's
# working directory).
raw_text = extract_text(pdf_files=["../data/data.pdf"])

# Split the raw text into chunks.
chunks = get_chunks(text=raw_text)

# Client object passed as the `client` argument of make_embeddings / get_response.
client = OpenAI()

# SQLite file name passed as `database_name` to the database helper functions.
database_name = "smartRecipe.db"

# Leftover from a previous approach; get_conversation_chain and vector_store
# are not defined in the visible cells:
# conversation = get_conversation_chain(vector_store=vector_store)

KeyboardInterrupt: 