In [None]:
%pip install langchain langchain_openai langchain_elasticsearch faiss-cpu langchain-community --upgrade

In [2]:
import os

openai_api_key = os.getenv("OPENAI_API_KEY")

In [156]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS

In [157]:
raw_text = """
Strength training, also known as resistance training, involves using weights or other forms of resistance to build muscle strength and endurance. This type of training is essential for anyone looking to improve their physical fitness, whether they are aiming for general health, athletic performance, or muscle growth. Strength training works by targeting specific muscle groups through a variety of exercises, including squats, deadlifts, bench presses, and overhead presses. These exercises can be done using free weights, machines, resistance bands, or even bodyweight exercises, making them adaptable to any fitness level or training goal.
One of the main benefits of strength training is its ability to increase muscle mass, which in turn boosts metabolism and promotes fat loss. It also plays a key role in improving bone density, reducing the risk of osteoporosis, and enhancing joint stability. Additionally, regular strength training has been shown to improve posture, balance, and overall functional movement, making everyday tasks easier and more efficient. For athletes, strength training is an essential component of their training regimen, as it enhances power, agility, and endurance, all of which contribute to better performance in their sport.
Strength training is also crucial for injury prevention. By strengthening muscles, ligaments, and tendons, it helps to protect the body from overuse injuries, strains, and sprains. This is particularly important for individuals who engage in repetitive physical activities, such as runners or cyclists. Furthermore, strength training can improve mental health by releasing endorphins, which are natural mood boosters. It has been linked to reduced symptoms of anxiety, depression, and stress, contributing to a more positive mindset and overall well-being. Whether you're new to exercise or a seasoned athlete, incorporating strength training into your routine can offer a wide range of physical and mental health benefits, helping you achieve your fitness goals and maintain a healthy lifestyle.
"""

with open("test.txt", "w") as f:
    f.write(raw_text)

In [158]:
# Load the document, split it into chunks, embed each chunk and load it into the vector store.
raw_documents = TextLoader('test.txt').load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = FAISS.from_documents(documents, OpenAIEmbeddings())

In [159]:
db.similarity_search_with_relevance_scores("build muscle strength", k=1)

[(Document(id='35d8bb8e-16f3-4a01-a9b2-e0c3351e055d', metadata={'source': 'test.txt'}, page_content="Strength training, also known as resistance training, involves using weights or other forms of resistance to build muscle strength and endurance. This type of training is essential for anyone looking to improve their physical fitness, whether they are aiming for general health, athletic performance, or muscle growth. Strength training works by targeting specific muscle groups through a variety of exercises, including squats, deadlifts, bench presses, and overhead presses. These exercises can be done using free weights, machines, resistance bands, or even bodyweight exercises, making them adaptable to any fitness level or training goal.\nOne of the main benefits of strength training is its ability to increase muscle mass, which in turn boosts metabolism and promotes fat loss. It also plays a key role in improving bone density, reducing the risk of osteoporosis, and enhancing joint stabil

In [161]:
# Adding on extra documents directly within LangChain:
from langchain_core.documents import Document

docs = [Document(page_content='Marcell has been practicing sports and fitness for 5 years.', metadata={'source': 'Marcell Krausz'}),
        Document(page_content='Fitness is a rapidly growing industry.', metadata={'source': 'Wikipedia'}),
        ]

In [162]:
docs[0].metadata

{'source': 'Marcell Krausz'}

In [163]:
docs[1].metadata

{'source': 'Wikipedia'}

In [164]:
db.add_documents(docs)

['245bfeda-6762-427d-bbcd-6f1ae6f0cf34',
 '8df7558a-07c8-49cc-abbf-a3219b94e073']

In [165]:
db.similarity_search("Marcell", k=1)

[Document(id='245bfeda-6762-427d-bbcd-6f1ae6f0cf34', metadata={'source': 'Marcell Krausz'}, page_content='Marcell has been practicing sports and fitness for 5 years.')]

In [None]:
# `docker run -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.security.http.ssl.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.9.0`
# https://python.langchain.com/docs/how_to/indexing/

In [166]:
from langchain.indexes import SQLRecordManager, index
from langchain_elasticsearch import ElasticsearchStore

collection_name = "test_index"

embedding = OpenAIEmbeddings()

#store in the local docker elasticsearch
vectorstore = ElasticsearchStore(
    es_url="http://localhost:9200", index_name="test_index", embedding=embedding
)

In [167]:
namespace = f"elasticsearch/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()

In [168]:
updated_docs = [
    Document(
        page_content="Marcell has been practicing sports and fitness since he was a child.",
        metadata={"source": "Marcell Krausz"},
    ),
   
]

In [169]:
def _clear():
    """Hacky helper method to clear content. See the `full` mode section to to understand why it works."""
    index([], record_manager, vectorstore, cleanup="full", source_id_key="source")

In [170]:
_clear()

In [171]:
docs

[Document(metadata={'source': 'Marcell Krausz'}, page_content='Marcell has been practicing sports and fitness for 5 years.'),
 Document(metadata={'source': 'Wikipedia'}, page_content='Fitness is a rapidly growing industry.')]

In [172]:
# Indexing all of the documents:
index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [173]:
# Updating a single document:
index(
    updated_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 1}

In [174]:
# Adding on a new document:
index(
    [Document(page_content="Fitness is a rapidly growing industry.", metadata={"source": "Wikipedia - Fitness Industry"})],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [177]:
# Skipping documents because the document hash is exactly the same:
index(
    [Document(page_content="Fitness is a rapidly growing industry.", metadata={"source": "Wikipedia - Fitness Industry"})],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 0, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 0}

In [200]:
vectorstore.similarity_search("Marcell Krausz", k=1)

[Document(metadata={'source': 'Marcell Krausz'}, page_content='Marcell has been practicing sports and fitness since he was a child.')]