#### Data loading

In [4]:
## from text document
from langchain_community.document_loaders import TextLoader

loader = TextLoader('./roman_empire_brief_history.txt', encoding='utf-8')
text_documents = loader.load()
text_documents 


[Document(page_content='The history of the Roman Empire can be divided into three distinct periods: The Period of Kings (625-510 BC), Republican Rome (510-31 BC), and Imperial Rome (31 BC – AD 476).\n\nFounding (c. 625 BC)\nRome was founded around 625 BC in the areas of ancient Italy known as Etruria and Latium. It is thought that the city-state of Rome was initially formed by Latium villagers joining together with settlers from the surrounding hills in response to an Etruscan invasion. It is unclear whether they came together in defense or as a result of being brought under Etruscan rule. Archaeological evidence indicates that a great deal of change and unification took place around 600 BC which likely led to the establishment of Rome as a true city.\n\nPeriod of Kings (625-510 BC)\nThe first period in Roman history is known as the Period of Kings, and it lasted from Rome’s founding until 510 BC. During this brief time Rome, led by no fewer than six kings, advanced both militaristical

In [6]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [16]:
## web based format
from langchain_community.document_loaders import WebBaseLoader
import bs4

## load - chunk - index the content of html
loader = WebBaseLoader(web_paths = ("https://www.britannica.com/place/Ottoman-Empire",) ,
                        bs_kwargs = dict(parse_only = bs4.SoupStrainer(
                            class_ = ("topic-paragraph", "h1", "h2")
                        )))

web_documents = loader.load()
web_documents

[Document(page_content='Ottoman Empire,  empire created by Turkish tribes in Anatolia (Asia Minor) that grew to be one of the most powerful states in the world during the 15th and 16th centuries. The Ottoman period spanned more than 600 years and came to an end only in 1922, when it was replaced by the Turkish Republic and various successor states in southeastern Europe and the Middle East. At its height the empire encompassed most of southeastern Europe to the gates of Vienna, including present-day Hungary, the Balkan region, Greece, and parts of Ukraine; portions of the Middle East now occupied by Iraq, Syria, Israel, and Egypt; North Africa as far west as Algeria; and large parts of the Arabian Peninsula. The term Ottoman is a dynastic appellation derived from Osman I (Arabic: ʿUthmān), the nomadic Turkmen chief who founded both the dynasty and the empire about 1300.The Ottoman state to 1481: the age of expansionThe first period of Ottoman history was characterized by almost continu

In [1]:
## from pdf
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("paper.pdf")
pdf_documents = loader.load()
# pdf_documents

#### chunks

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
documents = text_splitter.split_documents(pdf_documents)
# documents

### vector embeddings

In [3]:
## Vector embeddings and vector store
# from langchain_community.embeddings import HuggingFaceEmbeddings # -- deprecated?
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embedding = SentenceTransformerEmbeddings(model_name = "thenlper/gte-small")
db_chroma = Chroma.from_documents(documents, embedding)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [8]:
## vector database -- Chroma
query = "Who is the author of this paper?"
result = db_chroma.similarity_search(query)
result[0].page_content

In [7]:
## vector database -- FAISS
from langchain_community.vectorstores import FAISS
db_faiss = FAISS.from_documents(documents, embedding)

result = db_faiss.similarity_search(query)
result[0].page_content