In [4]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

from datasets import load_dataset

import cassio

from PyPDF2 import PdfReader

In [2]:
ASTRA_DB_ID = ""
ASTRA_DB_API_TOKEN = ""
OPENAI_API_KEY = ""

In [5]:
pdfreader = PdfReader("maximum likelihood estimation.pdf")

In [6]:
from typing_extensions import Concatenate

In [7]:
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content


In [10]:
cassio.init(token=ASTRA_DB_API_TOKEN, database_id=ASTRA_DB_ID)

In [11]:
llm = OpenAI(api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [12]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mle",
    session=None,
    keyspace=None
)

In [15]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size = 800,
    chunk_overlap = 200,
    length_function = len
)

texts = text_splitter.split_text(raw_text)

In [19]:
astra_vector_store.add_texts(texts)
print(len(texts))
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

63


In [20]:
first_question = True
while True:
    if first_question:
        query_text = input("\n Enter your question").strip()
    else:
        query_text = input("\n What is your next question").strip()

    if query_text.lower() == "quit":
        break
    
    if query_text == "":
        continue

    first_question = False

    print(f"QUESTION: {query_text}")
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print(f"ANSWER: {answer}")

    print("FIRST DOC BY RELEVENCE")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print(f"{score}\n {doc.page_content[:20]}")

QUESTION: What is the goal of Logistic Regression?
ANSWER: The goal of Logistic Regression is to estimate the unknown parameters in the equation using maximum likelihood estimation, in order to find the set of parameters that maximizes the probability of the observed data.
FIRST DOC BY RELEVENCE
0.9242845942786552
 logi
1 i
=KX
k=0
0.9242845942786552
 logi
1 i
=KX
k=0
0.9232626795939753
 thatinwhichthelinear
0.9232626795939753
 thatinwhichthelinear
