# Indexing RAG (Retrival Augmented Generation)

In [5]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [6]:
from langchain_community.document_loaders import PyPDFLoader
import copy

In [7]:
loader_pdf = PyPDFLoader("../resources/Introduction_to_Data_and_Data_Science.pdf")

In [8]:
pages_pdf = loader_pdf.load()

In [9]:
pages_pdf[5]

Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-11-09T10:16:34+02:00', 'author': 'Hristina  Hristova', 'moddate': '2023-11-09T10:16:34+02:00', 'source': '../resources/Introduction_to_Data_and_Data_Science.pdf', 'total_pages': 6, 'page': 5, 'page_label': '6'}, page_content='We hope we gave you a good idea about the \nlevel of applicability of the most frequently \nused programming and software tools in the \nfield of data science. \nThank you for watching!')

In [10]:
pages_pdf_cut = copy.deepcopy(pages_pdf)

In [11]:
for i in pages_pdf_cut:
    i.page_content = " ".join(i.page_content.split())

In [12]:
pages_pdf_cut

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-11-09T10:16:34+02:00', 'author': 'Hristina  Hristova', 'moddate': '2023-11-09T10:16:34+02:00', 'source': '../resources/Introduction_to_Data_and_Data_Science.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}, page_content='Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire datas

## Indexing DOCX file

In [13]:
from langchain_community.document_loaders import Docx2txtLoader

In [14]:
loader_docx = Docx2txtLoader("../resources/Introduction_to_Data_and_Data_Science.docx")

In [15]:
pages_docx = loader_docx.load()

## Document Splitting

In [16]:
from langchain_text_splitters.character import CharacterTextSplitter

In [17]:
for i in range(len(pages_docx)):
    pages_docx[i].page_content = " ".join(pages_docx[i].page_content.split())


In [18]:
pages_docx[0].page_content

"Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individually and examine how they relate to other parts. And that’s analysis in a nutshell. One important thing to remember, however, is that you perform analyses on things that have already happened in the past. Such as using an analysis to explain how a

In [19]:
char_splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=50)

In [20]:
pages_char_split = char_splitter.split_documents(pages_docx)

In [21]:
len(pages_char_split)

21

In [22]:
pages_char_split[0].page_content

'Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both'

In [23]:
pages_char_split[1].page_content

'So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individually and examine how they relate to other parts. And that’s analysis in a nutshell'

## Document Splitting with Markdown Header Text Splitter

In [24]:
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter

In [25]:
pages_docx

[Document(metadata={'source': '../resources/Introduction_to_Data_and_Data_Science.docx'}, page_content="Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individually and examine how they relate to other parts. And that’s analysis in a nutshell. One important thing to remember, however, is that you perfor

In [26]:
loader_docx = Docx2txtLoader("../resources/Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

In [27]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", 'Course Title'), ("##", "Lecture Title")])

In [28]:
pages_md_split = markdown_splitter.split_text(pages[0].page_content)

In [29]:
pages_md_split

[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content="Alright! So…\nLet’s discuss the not-so-obvious differences\nbetween the terms analysis and analytics.\nDue to the similarity of the words, some people\nbelieve they share the same meaning, and thus\nuse them interchangeably. Technically, this\nisn’t correct. There is, in fact, a distinct\ndifference between the two. And the reason\nfor one often being used instead of the other\nis the lack of a transparent understanding\nof both.\nSo, let’s clear this up, shall we?\nFirst, we will start with analysis.\nConsider the following…\nYou have a huge dataset containing data of\nvarious types. Instead of tackling the entire\ndataset and running the risk of becoming overwhelmed,\nyou separate it into easier to digest chunks\nand study them individually and examine how\nthey relate to other parts. And that’s analysis\nin a nutshell.\nOne important thing to remember

## Text Embedding

In [30]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [31]:
loader_docx = Docx2txtLoader("../resources/Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = [("#", "Course Title"), 
                           ("##", "Lecture Title")]
)

pages_md_split = md_splitter.split_text(pages[0].page_content)

for i in range(len(pages_md_split)):
    pages_md_split[i].page_content = ' '.join(pages_md_split[i].page_content.split())
    
char_splitter = CharacterTextSplitter(
    separator = ".",
    chunk_size = 500,
    chunk_overlap  = 50
)

pages_char_split = char_splitter.split_documents(pages_md_split)

In [32]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [33]:
vector1 = embeddings.embed_query(pages_char_split[3].page_content)
vector2 = embeddings.embed_query(pages_char_split[5].page_content)
vector3 = embeddings.embed_query(pages_char_split[18].page_content)

In [34]:
vector1

[0.0037478648591786623,
 0.010548347607254982,
 0.009972748346626759,
 -0.03044857084751129,
 -0.019738538190722466,
 0.012106993235647678,
 -0.024239856749773026,
 -0.012947756797075272,
 0.01048367377370596,
 -0.027111386880278587,
 0.00476971548050642,
 0.01667298562824726,
 -0.013167648576200008,
 0.00635099783539772,
 -0.010923457331955433,
 -0.016608310863375664,
 0.036139894276857376,
 -0.0009046291233971715,
 0.020217126235365868,
 -0.023631921038031578,
 -0.04369382932782173,
 0.022584199905395508,
 -0.008698668330907822,
 -0.027447693049907684,
 -0.009972748346626759,
 0.0038060713559389114,
 0.010296118445694447,
 -0.02492540329694748,
 0.0049443356692790985,
 -0.01827690377831459,
 0.009836932644248009,
 0.005801267921924591,
 -0.007929045706987381,
 0.0005254771676845849,
 -0.009060842916369438,
 0.002866680035367608,
 0.011214490979909897,
 0.01587102748453617,
 0.015327764675021172,
 0.017643097788095474,
 0.015224285423755646,
 0.000934540934395045,
 -0.0340444520115852

In [35]:
len(vector1), len(vector2),len(vector3)

(1536, 1536, 1536)

In [36]:
np.dot(vector1, vector2)

0.8792246875132874

In [37]:
np.dot(vector1, vector3)

0.800001028609988

In [38]:
np.dot(vector2, vector3)

0.7936194098992386

In [39]:
np.linalg.norm(vector3)

1.0000000544512242

## Creating Chroma Vectorstore

In [40]:
from langchain_community.vectorstores import Chroma

In [41]:
vectorstore = Chroma.from_documents(documents=pages_char_split, embedding=embeddings, persist_directory="./test")

In [42]:
vectorstore_from_directory = Chroma(persist_directory="./test", embedding_function=embeddings)

  vectorstore_from_directory = Chroma(persist_directory="./test", embedding_function=embeddings)


## Inspecting and Managing documents in a Vectorstore

In [45]:
from langchain_core.documents import Document

In [47]:
vectorstore_from_directory.get(ids="49cfd0e2-b712-477b-b89b-93b956db667a",include=["embeddings"])

{'ids': ['49cfd0e2-b712-477b-b89b-93b956db667a'],
 'embeddings': array([[-0.00145079,  0.00294724,  0.04136246, ...,  0.00858565,
         -0.02052466, -0.00128198]]),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

In [48]:
added_document = Document(page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Analysis vs Analytics'})

In [49]:
vectorstore_from_directory.add_documents([added_document])

['f3351060-fe7c-44f5-a0d4-3166ecf43f4a']

In [52]:
vectorstore_from_directory.get("f3351060-fe7c-44f5-a0d4-3166ecf43f4a")

{'ids': ['f3351060-fe7c-44f5-a0d4-3166ecf43f4a'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Course Title': 'Introduction to Data and Data Science',
   'Lecture Title': 'Analysis vs Analytics'}]}

In [53]:
updated_document = Document(page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!', 
                            metadata={'Course Title': 'Introduction to Data and Data Science', 
                                     'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [54]:
vectorstore_from_directory.update_document(document_id = "f3351060-fe7c-44f5-a0d4-3166ecf43f4a", 
                                           document = updated_document)

In [55]:
updated_document

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!')

In [56]:
vectorstore_from_directory.delete("f3351060-fe7c-44f5-a0d4-3166ecf43f4a")

In [57]:
vectorstore_from_directory.get("f3351060-fe7c-44f5-a0d4-3166ecf43f4a")

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}