In [39]:
# Data ingestion setup: import the vector store, the embedding model, and the environment helpers.

from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
import pandas as pd

In [40]:
# Load the key/value pairs from the local .env file into the process environment,
# so that secrets such as the API key can be read with os.getenv in the next cell.
load_dotenv()

True

In [41]:
# Read the OpenAI API key from the process environment (None when it is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [42]:
# Export the key under the exact environment-variable name the OpenAI client reads.
# (The key name used here before, "OPENAI_API_KE", was a typo.)
if not OPENAI_API_KEY:
    # Fail with a clear message instead of the opaque TypeError that
    # os.environ raises when it is assigned None.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [43]:
# Define the embedding model.
# OpenAIEmbeddings is the model that converts text into vectors; the instance is
# passed to the vector store below through its `embedding` argument.
embedding = OpenAIEmbeddings()

In [44]:
# Next, we ingest the data into a vector database; here we use the Astra DB vector database.

In [45]:
# DataStax Astra: a fully managed cloud service that simplifies deploying, managing, and scaling Cassandra in the cloud. Astra reduces the operational complexity associated with managing Cassandra
# and provides flexible deployment options across multiple cloud platforms.
# DataStax: a technology company best known for its commercial development of Apache Cassandra, an open-source NoSQL database designed to handle large amounts of data across many servers.
# Astra: the cloud version of Cassandra.

In [46]:
# To store the vectors (data) in Astra, the cloud version of Cassandra, we need to configure three things: the API endpoint, an application token, and the namespace (keyspace).
# These values are shown on the database summary page: https://astra.datastax.com/org/acb8cbde-e1dc-49bc-b465-08ecaab743de/database/2776b246-bb5f-4fe0-8332-f620e00ff6d6/summary

In [None]:
# Create your own keys for both ASTRA_DB_API_ENDPOINT and ASTRA_DB_APPLICATION_TOKEN

In [47]:
# API endpoint of the Astra DB database.
# It can be overridden with the ASTRA_DB_API_ENDPOINT environment variable (e.g. in .env)
# so the notebook does not have to be edited to point at another database; when the
# variable is not set, the original endpoint is used.
ASTRA_DB_API_ENDPOINT = os.getenv(
    "ASTRA_DB_API_ENDPOINT",
    "https://2776b246-bb5f-4fe0-8332-f620e00ff6d6-us-east-2.apps.astra.datastax.com",
)

In [48]:
# The application token is a secret and must not be stored in the notebook.
# Read it from the environment instead (add ASTRA_DB_APPLICATION_TOKEN=AstraCS:... to .env).
# The value is None when the variable is not set.
# NOTE(review): the token that used to be hard-coded in this cell has been exposed in the
# notebook history and should be revoked and regenerated.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")

In [49]:
# Keyspace to use inside the database; it is passed to the vector store as `namespace`.
ASTRA_DB_KEYSPACE = "default_keyspace"

In [50]:
# Lower-case aliases for the connection settings.
# Each alias is the plain value: a trailing comma after the right-hand side would
# turn it into a 1-tuple. The token alias is spelled `token` (it was misspelled `toekn`).
api_endpoint = ASTRA_DB_API_ENDPOINT
token = ASTRA_DB_APPLICATION_TOKEN
namespace = ASTRA_DB_KEYSPACE

In [51]:
# Name of the collection in the vector database where the ingested data will be stored.
collection_name = "financebot"

In [52]:
# Build the vector store on top of the Astra DB collection; documents added to it
# are embedded with `embedding` and stored in `collection_name`.
vstore = AstraDBVectorStore(
    collection_name=collection_name,
    embedding=embedding,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)

In [53]:
# We are able to create our vector store above

In [54]:
# Next, load the data from a PDF file; the PyPDFLoader used for this requires the pypdf package.
from langchain_community.document_loaders import PyPDFLoader

In [55]:
# Location of the source PDF.
# Defaults to the original local path; set the FINANCE_PDF_PATH environment variable
# (e.g. in .env) to run the notebook on another machine without editing this cell.
pdf_path = os.getenv(
    "FINANCE_PDF_PATH",
    "D:\\Industry prep\\Gen AI\\End to End Gen AI\\Finance-Chatbot\\data\\finance_data.pdf",
)
loader = PyPDFLoader(pdf_path)

In [56]:
# Read the PDF into a list of documents (the next cell reports how many were loaded).
pages = loader.load()

In [61]:
# Number of page documents loaded from the PDF.
len(pages)

108

In [63]:
# The PDF has 108 pages, so keep only a small slice (list indices 10 to 19) to limit
# how much data is ingested.
# NOTE(review): `pages` is reassigned from itself, so running this cell a second time
# slices the already-sliced 10-item list and yields an empty list. Re-run the
# `pages = loader.load()` cell before re-running this one.
pages = pages[10:20]

In [65]:
# Inspect the raw text of the first selected page.
pages[0].page_content

'Table of Contents \n9 understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for speci fic process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ pr oduct wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fas t yield learning for new product designs \nby allowing our customers to measure components of actual produ ct layout and identify yield issues. \n• pdFasTest ® Electrical Tester – Our proprietary electrical test hardware is optimized to quickl y test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test progr ams for each CV test chip that a re tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to perform millions of electrical \nmeasurements

In [66]:
# Now I will divide this entire data into chunks and for that I will import RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [67]:
# Splitter used to break the text into chunks: chunk size 500 with an overlap of 100
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [69]:
# Print every selected page next to its position in the list.
for page_index, page in enumerate(pages):
    print(page_index, page)

0 page_content='Table of Contents \n9 understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for speci fic process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ pr oduct wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fas t yield learning for new product designs \nby allowing our customers to measure components of actual produ ct layout and identify yield issues. \n• pdFasTest ® Electrical Tester – Our proprietary electrical test hardware is optimized to quickl y test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test progr ams for each CV test chip that a re tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to perform millions of electrical

In [70]:
# The page contents are still separate, one per page. Collect them into a single
# string so the text can be split into chunks as a whole.
# Pages with empty content are skipped; page texts are concatenated with no separator.
raw_text = "".join(filter(None, (page.page_content for page in pages)))

In [71]:
# Show the combined text to check that the pages were merged.
print(raw_text)

Table of Contents 
9 understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster 
learning cycle for speci fic process modules. 
 Our Scribe CV test chips are inserted directly on customers’ pr oduct wafers to collect data about critical 
layers. 
 Our DirectProbe™ CV test chips are designed to enable ultra-fas t yield learning for new product designs 
by allowing our customers to measure components of actual produ ct layout and identify yield issues. 
• pdFasTest ® Electrical Tester – Our proprietary electrical test hardware is optimized to quickl y test our CV test 
chips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system 
offering, we provide test progr ams for each CV test chip that a re tuned to the customer’s process. This automated 
system provides parallel functional testing, thus minimizing th e time required to perform millions of electrical 
measurements to test our CV test c

In [73]:
# Split the combined text into chunks with the splitter configured above.
texts = text_splitter.split_text(raw_text)

In [74]:
# Number of chunks produced by the splitter.
len(texts)

91

In [75]:
# Look at the first chunk.
texts[0]

'Table of Contents \n9 understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for speci fic process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ pr oduct wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fas t yield learning for new product designs'

In [77]:
# Check the container type returned by split_text.
type(texts)

list

In [78]:
# Check the type of an individual chunk.
type(texts[0])

str

In [79]:
# Now convert the plain strings to Document objects, because the vector store's add_documents method used below takes Documents.
from langchain.docstore.document import Document

In [80]:
# Wrap every text chunk in a Document whose page_content is that chunk.
docs = [Document(page_content=chunk) for chunk in texts]

In [81]:
# Display the resulting list of Document objects.
docs

[Document(page_content='Table of Contents \n9 understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for speci fic process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ pr oduct wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fas t yield learning for new product designs'),
 Document(page_content='by allowing our customers to measure components of actual produ ct layout and identify yield issues. \n• pdFasTest ® Electrical Tester – Our proprietary electrical test hardware is optimized to quickl y test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test progr ams for each CV test chip that a re tuned to the customer’s process. This automated'),
 Document(page_content='system provides parallel functional testing, thus minim

In [82]:
# Store the documents in the vector store created above; the call returns the list of
# IDs assigned to the stored documents (shown in the output below).
# NOTE(review): no explicit IDs are passed, so re-running this cell presumably inserts
# the same chunks again under new IDs — confirm before re-running.
vstore.add_documents(docs)

['3a01a63df6534924a7f9fe5c81bf863d',
 '01a3b63fa37b4d81802c6fcc747810f8',
 '46b68798c0d94a25b3ab802130dbaa25',
 'f8f21204d8804cf4bd1bef8c90d9a642',
 '487371eddf654338821b91462447af3b',
 '148acb247a9d4066bce08a1b3ed8f098',
 '991d507488b942328825ad122a0f3cd5',
 'a76dc4645dda4200a757847d2ed4d026',
 '3dbabc82740d425bb1a99e0811560f66',
 '7a3879499af3442da22baf3ec8da19ee',
 '539bcda4f98544e591df616fa4629747',
 '72ef686061a54401afb05a058d07b6de',
 '551177fc5b62423fb2ff6950d5eb7496',
 '88f0eb518fc14a63b4013bfe8102bdcd',
 '99baab853e934b7c865189e32ab127d6',
 '2d25a6905ce64aadb3f2c6fe3acd6b64',
 'd96f12fa3cc9441b978bc1eb313e7498',
 '63ae920a5cc24b53ada8a86551cb7ded',
 '69a35bf2af7844f4b7a57659b1ed807e',
 '4f8f1063a3064490af7563739b373631',
 'fff83b1eea274b8192bed2fea19db3d0',
 'c535561fd5c54f35af9d5a11303e6b4a',
 '2ca91b78912b4b1ca6c82cda353f8b23',
 'fec5ef184c0e40148c3e7bf71eacce44',
 'be2d61816f5c4903a3629752d8469736',
 'a2d80bd0a99f4fbbae29f7bf14068d13',
 '6862324c99a549e08a4b166ec1a94af7',
 

### **Step 1: PDF data Extraction - PDF doc ---> Extracted doc**