## Install Langchain

In [4]:
!pip install langchain --upgrade --quiet
!pip install unstructured pdf2image --quiet
!pip install openai --quiet

## Import PDF Loaders

In [31]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

## Load the pdf from an online location

In [120]:
# When running inside a restricted workspace, first add a firewall exception for https://wolfpaulus.com
FIELD_GUIDE_PDF_URL = "https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf"
loader = OnlinePDFLoader(FIELD_GUIDE_PDF_URL)

### Load the data into a variable

In [34]:
# Run the loader; the cells below treat `data` as a list of documents that each expose .page_content
data = loader.load()

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Check to see how many documents and chars are in the book

In [35]:
# Summarise what the loader produced: number of documents and size of the first one
document_count = len(data)
character_count = len(data[0].page_content)
print(f'You have {document_count} document(s) in your data')
print(f'There are {character_count} characters in your document')

You have 1 document(s) in your data
There are 72807 characters in your document


### Now let's split the data into chunks before we turn them into embeddings

In [37]:
# Break the loaded document(s) into chunks of at most 2000 characters, with no overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [39]:
# Report how many chunks the splitter produced
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 40 documents


## Let's convert the chunks into embeddings and then store them in a SingleStore table

### Let's create the table first

In [47]:
%%sql
-- A cell magic has to be the very first line of the cell, so this note lives below it as a SQL comment.
-- Replace winter_wikipedia with your database name.
USE winter_wikipedia;
CREATE TABLE IF NOT EXISTS my_book (
    id INT PRIMARY KEY,
    text TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
    embedding BLOB
);

[]

In [41]:
# Prefer the OPENAI_API_KEY environment variable; otherwise fall back to the placeholder below (replace it with your key)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'YOU_KEY')

In [None]:
#embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [42]:
# Peek at the attributes of the first chunk to see what a split document looks like
print(vars(texts[0]))

{'page_content': 'Everyone you will ever meet knows something you don’t.\n\n[1]\n\n››\n\nT H E S TO RY\n\nof T H E F I E L D\n\nG U I D E\n\nSeveral years ago we created The Field Guide to Data Science because we wanted to help organizations of all types and sizes. There were countless industry and academic publications describing what Data Science is and why we should care, but very little information was available to explain how to make use of data as a resource. We find that situation to be just as true today as we did two years ago, when we created the first edition of the field guide.\n\nAt Booz Allen Hamilton, we built an industry-leading team of Data Scientists. Over the course of hundreds of analytic challenges for countless clients, we’ve unraveled the DNA of Data Science. Many people have put forth their thoughts on single aspects of Data Science. We believe we can offer a broad perspective on the conceptual models, tradecraft, processes and culture of Data Science – the what

In [43]:
# Import only the names this notebook needs; a wildcard import hides where names
# come from and can silently shadow other variables.
from sqlalchemy import create_engine, text

# NOTE(review): connection_url is not defined anywhere in this notebook — presumably
# it is injected by the hosting notebook environment; confirm before running elsewhere.
db_connection = create_engine(connection_url)

In [44]:
!pip install tiktoken --quiet

### Now let's add the embeddings to the my_book table. We truncate the table first so that re-running the cell doesn't collide with ids that are already there

In [46]:
import json

from langchain.embeddings import OpenAIEmbeddings
import pymysql

# Initialize the OpenAIEmbeddings
embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Embed every chunk with a single embed_documents call instead of one call per chunk;
# it returns one vector per input text, in the same order.
chunk_texts = [document.page_content for document in texts]
chunk_embeddings = embedder.embed_documents(chunk_texts)

# The vector is sent as a JSON array and packed by the database into binary float64
# values. Storing it as comma-separated text would leave bytes in the BLOB that
# DOT_PRODUCT_F64 cannot interpret as a vector.
insert_sql = """
INSERT INTO my_book (id, text, embedding)
VALUES (%s, %s, JSON_ARRAY_PACK_F64(%s))
"""

# Connect to your SingleStore database (replace the placeholder values with your own)
db = pymysql.connect(
    host="your_host_info_from_singlestore",
    port=3306,
    user="admin",
    password="your_password",
    database="your_database_name"
)

try:
    with db.cursor() as cursor:
        # Clear the my_book table so re-running this cell does not collide with
        # the ids (primary key) inserted by a previous run
        cursor.execute("TRUNCATE TABLE my_book")

        # Insert each chunk together with its embedding; the row id is the chunk index
        for i, (text_content, embedding) in enumerate(zip(chunk_texts, chunk_embeddings)):
            cursor.execute(insert_sql, (i, text_content, json.dumps(embedding)))

    # Commit the transaction
    db.commit()
finally:
    # Close the database connection even if embedding or inserting failed part-way
    db.close()


In [52]:
!pip install matplotlib --quiet
!pip install scipy --quiet
!pip install scikit-learn --quiet

## Do the DOT_PRODUCT in SingleStore instead and JOIN with other data if needed

In [111]:
from openai.embeddings_utils import get_embedding
import json

import openai
from sqlalchemy import text

# The chat call below uses the openai module directly, so it must be imported
# and given the same API key as the embedder.
# NOTE: openai.ChatCompletion is the pre-1.0 interface of the openai package.
openai.api_key = OPENAI_API_KEY

# Your query text
query_text = "What is the collect stage of data maturity"

# Convert the query text to embeddings
query_embedding = embedder.embed_documents([query_text])[0]

# JSON_ARRAY_PACK_F64 expects a JSON array such as "[0.1, 0.2]", not a bare
# comma-separated list, so serialise the vector with json.dumps
query_embedding_str = json.dumps(query_embedding)

# Perform a similarity search against the embeddings.
# The embedding column must hold packed float64 vectors for DOT_PRODUCT_F64 to work.
query = """
SELECT id, text, DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(:query_vector), embedding) AS similarity
FROM my_book
ORDER BY similarity DESC
LIMIT 1
"""

# db_connection is the SQLAlchemy engine created earlier in the notebook
with db_connection.connect() as connection:
    result = connection.execute(text(query), {"query_vector": query_embedding_str}).fetchone()

if result is None:
    raise ValueError("my_book returned no rows; load the embeddings before running the search")

# Row columns are (id, text, similarity); the prompt needs the matched text, not the id
best_match_text = result[1]

# Pass the result as a prompt to Chat GPT
prompt = f"The user asked: {query_text}. The most similar text from the book is: {best_match_text}"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)

# Print the assistant's response
print(response['choices'][0]['message']['content'])


The collect stage of data maturity refers to the initial phase of data management where an organization gathers data from various sources and stores it in a centralized location. This stage is characterized by ad-hoc and unstructured data collection and limited data analysis.
