# Part 2: Indexing

In [None]:
import bs4,os
from langchain_community.document_loaders import WebBaseLoader
# Load the blog post, restricting BeautifulSoup parsing to the elements whose
# CSS class is one of the post title, header, or content classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)

# Run the loader; the result is kept as `blog_docs` for the splitting step.
blog_docs = loader.load()

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter built via the tiktoken-encoder constructor, with a chunk size of 300
# and an overlap of 50 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=300,
)

# NOTE: despite its name, `text` holds the list of chunks returned by
# split_documents, not a single string.
text = text_splitter.split_documents(blog_docs)

In [None]:
# Number of chunks produced by the splitter.
len(text)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding`` (e.g. "cl100k_base").

    Returns:
        Length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Spot check: token count of the chunk at index 10 under the cl100k_base encoding.
num_tokens_from_string(text[10].page_content, "cl100k_base")

In [None]:
# Toy query and two candidate passages used to illustrate embedding similarity.
question = "What is overfitting?"

documents = [
    "Overfitting happens when a machine learning model learns the training data too well, including noise or random fluctuations that are specific to the training set but don't generalize well to new, unseen data.",
    "To reduce overfitting, we should use some regularization techniques and perform cross-validation to find the best hyperparameters. Additionally, reducing overfitting is not just about randomly trying different methods; it requires a thoughtful approach. Developing an understanding of when and how to use different techniques will save time and effort and improve your expertise.",
]

In [None]:
from langchain_cohere import CohereEmbeddings
# Read the Cohere key from the environment (via the `os` module imported in the
# first cell) so the secret is never written into the notebook itself.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "COHERE_API_KEY is not set; export it before running this cell."
    )

embeddings = CohereEmbeddings(
    model="embed-english-light-v3.0",
    cohere_api_key=cohere_api_key,
)

# One vector for the question, and one vector per entry in `documents`.
query_result = embeddings.embed_query(question)
documents_result = embeddings.embed_documents(documents)

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any sequence or array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm (including empty input) the similarity is undefined; 0.0 is
        returned instead of dividing by zero and propagating NaN.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the division: a zero-length direction has no angle to compare.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Score every passage against the query vector, keyed by the passage text.
score_dict = {
    passage: cosine_similarity(query_result, passage_vec)
    for passage, passage_vec in zip(documents, documents_result)
}

print("Question Cosine Similarity with different vectors:\n")
print(f"Question: {question}\n")

# Show a 40-character preview of each passage next to its score.
for doc, score in score_dict.items():
    print(f"{doc[:40]}...: {score:.4f}")

In [None]:
# Short sample sentences (plus a query) whose embeddings are compared below.
in_1 = "I love Python."
in_2 = "I love JavaScript."
in_3 = "I love Palestine."
in_4 = "Islam is true religious."
in_5 = "Islam is true and world largest religious."

# NOTE: this rebinds the notebook-level `question` set in an earlier cell.
question = "Which religious is true."

input_text_lst_sim = [in_1, in_2, in_3, in_4, in_5, question]

# One embed_query call per sentence, in list order.
_embeddings = [embeddings.embed_query(sentence) for sentence in input_text_lst_sim]

print(_embeddings)

In [None]:
import matplotlib.pyplot as plt

# Use only the first two components of each embedding as plot coordinates
# (assumes every embedding has at least 2 dimensions).
# NOTE(review): this is a raw slice, not a dimensionality reduction, so
# distances in the plot need not reflect distances in the full embedding space.
embeddings_2d = [embedding[:2] for embedding in _embeddings]

# Create the plot
plt.figure(figsize=(6, 4))
# The loop variable is `label`, not `text`: `text` is the notebook-level list of
# document chunks and must not be overwritten by this loop.
for label, (x, y) in zip(input_text_lst_sim, embeddings_2d):
    plt.scatter(x, y)
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, -10), fontsize=8)

plt.title("2D Visualization of Embeddings")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.grid(True)
plt.show()

In [None]:
from langchain_chroma import Chroma

# Build the vector store from the split blog post, persisted under ./chroma_db8.
# `from_documents` is used because the splitter returns Document objects, whereas
# `from_texts` expects plain strings.
# The chunks are re-derived from `blog_docs` so this cell does not depend on the
# notebook-level name `text` still holding the chunk list when it runs.
db = Chroma.from_documents(
    documents=text_splitter.split_documents(blog_docs),
    embedding=embeddings,
    persist_directory="./chroma_db8",
)