In [1]:
# Load the `dotenv` IPython extension, then run its %dotenv magic so that
# variables from a local .env file are placed in the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding model below; confirm the .env file defines it.
%load_ext dotenv
%dotenv

In [2]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter

In [3]:
# Loader for the course transcript; the path is relative to the notebook's
# working directory.
loader = Docx2txtLoader(
    file_path="Introduction_to_Data_and_Data_Science_2.docx",
)

In [4]:
# Read the .docx file; the result is indexed as a list of documents below
# (only element 0 is used).
whole_doc = loader.load()

In [5]:
# Preview the first 1000 characters of the loaded text to check that the
# markdown-style '#' / '##' headers are present before splitting on them.
whole_doc[0].page_content[:1000]

'# Data Analytics Course\n\n## Analysis vs Analytics\n\nAnalysis vs Analytics\n\nAlright! So…\nLet’s discuss the not-so-obvious differences\nbetween the terms analysis and analytics.\nDue to the similarity of the words, some people\nbelieve they share the same meaning, and thus\nuse them interchangeably. Technically, this\nisn’t correct. There is, in fact, a distinct\ndifference between the two. And the reason\nfor one often being used instead of the other\nis the lack of a transparent understanding\nof both.\nSo, let’s clear this up, shall we?\nFirst, we will start with analysis.\nConsider the following…\nYou have a huge dataset containing data of\nvarious types. Instead of tackling the entire\ndataset and running the risk of becoming overwhelmed,\nyou separate it into easier to digest chunks\nand study them individually and examine how\nthey relate to other parts. And that’s analysis\nin a nutshell.\nOne important thing to remember, however,\nis that you perform analyses on things th

In [6]:
# Split on markdown headers: '#' lines become the 'course title' metadata
# field and '##' lines become the 'module title' metadata field.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ('#', 'course title'),
        ('##', 'module title'),
    ],
)

In [7]:
# Break the transcript into one document per header section, each carrying
# its header values as metadata.
md_doc = md_splitter.split_text(
    text=whole_doc[0].page_content,
)

In [8]:
# Collapse every run of whitespace (including the hard line breaks from the
# transcript) into a single space. The documents are updated in place;
# re-running this cell leaves them unchanged.
for i in md_doc:
    words = i.page_content.split()
    i.page_content = ' '.join(words)

In [9]:
# Second-stage splitter: cut each section at '.' boundaries into chunks with
# a target size of 500 and an overlap of 50 between neighbouring chunks.
char_splitter = CharacterTextSplitter(
    separator='.',
    chunk_size=500,
    chunk_overlap=50,
)

In [10]:
# Apply the character splitter to every header section; each resulting chunk
# keeps the metadata of the section it came from.
md_char_doc = char_splitter.split_documents(
    documents=md_doc,
)

In [11]:
# Inspect the chunks together with their header metadata.
# NOTE(review): this renders every chunk; slicing (e.g. md_char_doc[:3])
# would keep the saved output short.
md_char_doc

[Document(metadata={'course title': 'Data Analytics Course', 'module title': 'Analysis vs Analytics'}, page_content='Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both'),
 Document(metadata={'course title': 'Data Analytics Course', 'module title': 'Analysis vs Analytics'}, page_content='So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individually and exa

In [12]:
# Embedding model used to vectorise both the stored chunks and the queries.
# NOTE(review): presumably authenticates via the environment populated by
# the %dotenv cell; confirm.
embedding = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)

In [30]:
# Open the Chroma store persisted under ./vector_db, using the same embedding
# model for stored chunks and for incoming queries.
vector_db_local = Chroma(
    persist_directory='./vector_db',
    embedding_function=embedding,
)

# The chunks prepared above were never written to the store in this notebook,
# so on a fresh checkout (no ./vector_db from an earlier session) every search
# below would silently return an empty list. Populate the store only when it
# holds no entries; an already-populated store is left untouched, so existing
# results do not change and no duplicate chunks are added.
# Note: populating calls the embedding model once for the whole chunk list.
if not vector_db_local.get(limit=1)['ids']:
    vector_db_local.add_documents(md_char_doc)

In [34]:
# Query used for the plain similarity search.
question = (
    'What are the popular programming languages used by Data Scientists?'
)

In [36]:
# Retrieve the 5 stored chunks whose embeddings are closest to the question.
similarity_search = vector_db_local.similarity_search(
    query=question,
    k=5,
)

In [38]:
# Show the documents returned by the similarity search.
similarity_search

[Document(id='1c9cdcd3-f066-4702-8aa3-b810795a72ae', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='What about big data? Apart from R and Python, people working in this area are often proficient in other languages like Java or Scala. These two have not been developed specifically for doing statistical analyses, however they turn out to be very useful when combining data from multiple sources. All right! Let’s finish off with machine learning. When it comes to machine learning, we often deal with big data'),
 Document(id='fee6d2f3-7028-4ba6-8112-31cb32abc434', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can ma

In [40]:
# Query used for the MMR search and for the retriever below.
question2 = (
    'What software do data scientists use?'
)

In [54]:
# Maximal-marginal-relevance search: return 3 chunks, weighting relevance
# against diversity with lambda_mult=0.5, restricted to chunks whose
# 'module title' metadata matches the given module exactly.
mmr_search = vector_db_local.max_marginal_relevance_search(
    query=question2,
    k=3,
    lambda_mult=0.5,
    filter={
        'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need',
    },
)

In [55]:
# Show the documents selected by the filtered MMR search.
mmr_search

[Document(id='fee6d2f3-7028-4ba6-8112-31cb32abc434', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end'),
 Document(id='d7780371-ba9d-412d-941c-473e01924b40', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Among the many applications we have plotted, we can say there is an increasing amount of software designed

In [58]:
# Wrap the vector store as a retriever that runs an MMR search returning 3
# documents with lambda_mult=0.7. No metadata filter is applied here.
# The variable name keeps its original spelling because later cells
# reference it.
retriver = vector_db_local.as_retriever(
    search_type='mmr',
    search_kwargs={
        'k': 3,
        'lambda_mult': 0.7,
    },
)

In [60]:
# Show the retriever's configuration (search type and search kwargs).
retriver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x121d87500>, search_type='mmr', search_kwargs={'k': 3, 'lambda_mult': 0.7})

In [62]:
# Run the retriever on the second question; the result is the list of
# documents chosen by its MMR search.
retrived_documents = retriver.invoke(
    input=question2,
)

In [64]:
# Show the documents returned by the retriever.
retrived_documents

[Document(id='fee6d2f3-7028-4ba6-8112-31cb32abc434', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end'),
 Document(id='9b3ee7ae-0c07-49eb-aea5-f5b755e61f28', metadata={'course title': 'Data Analytics Course', 'module title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='It’s actually a software framework which was designed to address the complexity of big data and its comput