## Data 

1. Data load
2. Split Data - Recursive text split & Character text
3. Embed text -> Embedding
4. Vector database
5. Q -> Query -> Similarity Search & as_retriever

In [18]:
import os

In [4]:
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage

llm_model = ChatGroq(
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-70b-8192"
)

NameError: name 'os' is not defined

In [48]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./data/good.txt")
loaded_data = loader.load()


In [49]:
loaded_data

[Document(metadata={'source': './data/good.txt'}, page_content='Hi I am a good girl.\n\njack and jill went up to hill\n\nhow are you')]

## Character Splitter

In [50]:
from langchain_text_splitters import CharacterTextSplitter

In [51]:
text_splitter = CharacterTextSplitter(
    separator="\n\n",         # Use single newline if that's what your text uses
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)


In [56]:
texts = text_splitter.create_documents([loaded_data[0].page_content])


In [57]:
# Print chunks
for i, doc in enumerate(texts):
    print(f"Chunk {i+1}:\n{doc.page_content}\n")

Chunk 1:
Hi I am a good girl.

jack and jill went up to hill

how are you



In [55]:
print(repr(loaded_data[0].page_content[:1000]))


'Hi I am a good girl.\n\njack and jill went up to hill\n\nhow are you'


In [58]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [59]:
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 26,
    chunk_overlap = 4
)



In [60]:
texts = recursive_splitter.create_documents([loaded_data[0].page_content])


In [61]:
# Print chunks
for i, doc in enumerate(texts):
    print(f"Chunk {i+1}:\n{doc.page_content}\n")

Chunk 1:
Hi I am a good girl.

Chunk 2:
jack and jill went up to

Chunk 3:
to hill

Chunk 4:
how are you



## Embeddings

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
chunks_of_text = ["Hi There!", "Hello!", "What's your name? ", "Bond, James Bond", "Hello, Bond!"]

In [None]:

embeddings = embedding.embed_documents(chunks_of_text)

In [12]:
len(embeddings[4])

384

## Vector Database

### Chroma

In [13]:
from langchain_community.document_loaders import TextLoader

In [47]:
from langchain_text_splitters import CharacterTextSplitter
from langchain.vectorstores import Chroma


In [17]:
import os
from langchain_groq import ChatGroq

llm_model = ChatGroq(
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-70b-8192"
)

In [48]:
loaded_docs = TextLoader('./data/good.txt').load()

In [49]:
splitter = CharacterTextSplitter(separator="\n", chunk_size=100, chunk_overlap = 10)

In [50]:
chunks_of_text = splitter.split_documents(loaded_docs)
len(chunks_of_text)

6

In [51]:
vector_db = Chroma.from_documents(
    chunks_of_text,
    embedding,
    persist_directory="chroma_db"
)
vector_db.persist()  # Save the DB (optional but good practice)

  vector_db.persist()  # Save the DB (optional but good practice)


In [71]:
question = "Which city is good for beaches?"
results = vector_db.similarity_search_with_score(question, k=3)

In [72]:
results

[(Document(metadata={'source': './data/good.txt'}, page_content='Bali has stunning beaches, water temples, and peaceful yoga retreats.'),
  1.0811179876327515),
 (Document(metadata={'source': './data/good.txt'}, page_content='Tokyo offers cherry blossoms, sushi, and river cruises blending tradition and modernity.'),
  1.3551419973373413),
 (Document(metadata={'source': './data/good.txt'}, page_content='Paris is known for the Eiffel Tower and romantic boat cruises on the Seine River.'),
  1.4880256652832031)]

### FAISS

In [2]:
from langchain_community.vectorstores import FAISS

In [3]:
from langchain_text_splitters import CharacterTextSplitter


In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./data/good.txt")
loaded_data = loader.load()


In [22]:
text_splitter =     CharacterTextSplitter(separator="\n", chunk_size = 100, chunk_overlap = 10)
chunks_of_text = text_splitter.split_documents(loaded_data)
print(len(chunks_of_text))

6


In [31]:


vector_db = FAISS.from_documents(
    chunks_of_text,
    embedding,
)
vector_db.save_local("faiss_db")  # Save the DB (optional but good practice)

In [39]:
retriever = vector_db.as_retriever(
    search_type="similarity",  # or "mmr"
    search_kwargs={
        "k": 1,
        # for mmr
        # "lambda_mult": 0.5,
        # "fetch_k": 20,
    }
    
)

In [40]:
response = retriever.invoke("which city has most delicious cuisine?")

In [41]:
print(response)

[Document(id='51e8de39-454f-4f87-9761-9d230924ecd5', metadata={'source': './data/good.txt'}, page_content='Rome is famous for the Colosseum and authentic Italian cuisine.')]


In [57]:
texts = ["AI is the future", "Cats are cute", "Generative Models create content"]
metadatas = [{"category": "tech"}, {"category" : "pets"}, {"category": "tech"}]

In [58]:

vector_store = Chroma.from_texts(texts, embedding=embedding, metadatas=metadatas)

In [64]:
retriever = vector_store.as_retriever(
    search_kwargs = {"k":2,
                     "filter": {"category":"tech"}
                     }
)

In [67]:
results = retriever.get_relevant_documents("tell me about models")  

In [69]:
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")


Content: Generative Models create content, Metadata: {'category': 'tech'}
Content: Generative Models create content, Metadata: {'category': 'tech'}
