In [1]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

from PyPDF2 import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment
# so the credentials below never have to be hardcoded in the notebook.
load_dotenv()

ASTRA_DB_APP_TOKEN = os.getenv('ASTRA_DB_APP_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ASTRA_DB_ID = os.getenv('ASTRA_DB_ID')

# os.getenv returns None for an unset variable; fail here with a readable
# message instead of letting a later client constructor fail obscurely.
_missing_settings = [
    name
    for name, value in (
        ('ASTRA_DB_APP_TOKEN', ASTRA_DB_APP_TOKEN),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('ASTRA_DB_ID', ASTRA_DB_ID),
    )
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )


In [14]:
# Open the source PDF with PyPDF2. The path is relative, so it is resolved
# against the notebook's current working directory.
pdfreader = PdfReader('data/prml_bishop.pdf')

In [15]:
from typing_extensions import Concatenate
# Extract the text of every page and concatenate it into one string.
# Pages whose extraction result is empty/None are skipped. Pages are joined
# with no separator, exactly as they come out of extract_text().
_page_texts = (page.extract_text() for page in pdfreader.pages)
# A single join avoids rebuilding the growing string once per page.
raw_text = ''.join(page_text for page_text in _page_texts if page_text)

In [18]:
# Initialize the CassIO database connection for the Astra DB instance
# identified by ASTRA_DB_ID, authenticating with the application token.
# Both values are read from the environment earlier in the notebook.
cassio.init(token=ASTRA_DB_APP_TOKEN, database_id=ASTRA_DB_ID)

In [19]:
# Create the LLM object used to generate answers.
# NOTE(review): the recorded output of this cell shows two deprecation
# warnings, presumably one per class below — confirm and plan a migration.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

# Create the OpenAI embedding object used to vectorize text for the store.
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

  warn_deprecated(
  warn_deprecated(


In [20]:
# LangChain vector store that uses the embedding object created above and the
# table "qa_mini_demo".
# NOTE(review): session and keyspace are passed as None — presumably so the
# store falls back to the connection set up by cassio.init(); confirm.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [21]:
from langchain.text_splitter import CharacterTextSplitter
# Split the raw text on newlines into chunks targeting 800 characters with a
# 200-character overlap, so that no single chunk grows the token count too
# much. The size is a target, not a hard cap: the splitter reports when one
# piece ends up longer than chunk_size.
splitter_options = dict(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)

Created a chunk of size 933, which is longer than the specified 800


In [24]:
# Display the final chunk as a quick sanity check of the split result.
texts[-1]

'validation set, 11, 32\nVapnik-Chervonenkis dimension, 344\nvariance, 20, 24, 149\nvariational inference, 315, 462, 635\nfor Gaussian mixture, 474\nfor hidden Markov model, 625\nlocal, 493\nVC dimension, seeVapnik-Chervonenkis dimen-\nsion\nvector quantization, 429vertex, seenode\nvisualization, 3\nViterbi algorithm, 415, 629\nvon Mises distribution, 108, 693\nwavelets, 139\nweak learner, 657weight decay, 10, 144, 257\nweight parameter, 227\nweight sharing, 268\nsoft, 269\nweight vector, 181\nweight-space symmetry, 231, 281\nweighted least squares, 668\nwell-determined parameters, 170whitening, 299, 568\nWishart distribution, 102, 693\nwithin-class covariance, 189Woodbury identity, 696\nwrapped distribution, 110\nYellowstone National Park, 110, 681'

In [31]:
# Insert only the first MAX_CHUNKS_TO_INSERT text chunks from the PDF into the
# vector store. The cap is explicit so it is easy to find and tune.
# NOTE(review): re-running this cell presumably inserts the same chunks again;
# confirm whether the table should be cleared first.
MAX_CHUNKS_TO_INSERT = 100
chunks_to_insert = texts[:MAX_CHUNKS_TO_INSERT]
astra_vector_store.add_texts(chunks_to_insert)

# These are chunks of PDF text (not headlines), so the message says so.
print("Inserted %i chunks." % len(chunks_to_insert))

# Wrap the store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 100 headlines.


In [32]:
# Interactive question/answer loop over the indexed chunks.
# Each round reads a question, prints the LLM answer produced through the
# index wrapper, then lists the four best-matching stored chunks with their
# scores. Typing "quit" (any letter case) ends the loop.
first_question = True
while True:
    # The prompt wording changes once a first real question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again and leave the first-question flag untouched.
        continue
    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in ranked_hits:
        # Show only the first 84 characters of each matching chunk.
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "explain in 200 words what is k means clustering"
ANSWER: "K means clustering is a popular unsupervised learning algorithm used in pattern recognition and machine learning. It is used to identify groups or clusters within a dataset based on their similarities. The goal of K means clustering is to find a predetermined number of clusters (represented by the letter "K") that can best represent the data. 

The algorithm works by first randomly selecting K points from the dataset as initial cluster centers. Then, each data point is assigned to the closest cluster center based on a distance measure, usually the Euclidean distance. The mean of the data points in each cluster is then calculated and becomes the new cluster center. This process is repeated until the cluster centers no longer change significantly, indicating that the algorithm has converged. 

The resulting clusters can be used for various purposes such as grouping similar data points together for further analysis or f