## Data Ingestion

In [84]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations

In [113]:
import os

data_path = "..\sagemaker_documentation"
if not os.path.exists(data_path):
    print(f"The directory {data_path} does not exist.")
else:
    print(f"The directory {data_path} exists.")

The directory ..\sagemaker_documentation exists.


In [116]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

# Relative path to go up one directory
data_path = "../sagemaker_documentation"
loader = UnstructuredMarkdownLoader(data_path)
data = loader.load()

PermissionError: [Errno 13] Permission denied: '..\\sagemaker_documentation'

In [107]:
import os
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\LENOVO\PycharmProjects\researchassistant\notebooks


In [82]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

loader = DirectoryLoader("../", glob="**/*.md", loader_cls=TextLoader)
docs = loader.load()
print(f"Number of files loaded: {len(docs)}")

Number of files loaded: 336


In [83]:
print(docs[0])

page_content='# Using the SageMaker Training and Inference Toolkits<a name="amazon-sagemaker-toolkits"></a>

The [SageMaker Training](https://github.com/aws/sagemaker-training-toolkit) and [SageMaker Inference](https://github.com/aws/sagemaker-inference-toolkit) toolkits implement the functionality that you need to adapt your containers to run scripts, train algorithms, and deploy models on SageMaker\. When installed, the library defines the following for users:
+ The locations for storing code and other resources\. 
+ The entry point that contains the code to run when the container is started\. Your Dockerfile must copy the code that needs to be run into the location expected by a container that is compatible with SageMaker\. 
+ Other information that a container needs to manage deployments for training and inference\. 

## SageMaker Toolkits Containers Structure<a name="sagemaker-toolkits-structure"></a>

When SageMaker trains a model, it creates the following file folder structure i

In [74]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Iterate over the list of documents and split the text
all_chunks = []
for doc in docs:
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunks = markdown_splitter.split_text(doc.page_content)
    all_chunks.extend(chunks)

# Now `all_chunks` contains all the individual text chunks from the documents
print(all_chunks)



In [79]:
import os
import openai

# Retrieve the OpenAI API key from the environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

# Check if the API key was successfully retrieved
if openai.api_key is None:
    raise ValueError("OpenAI API key not found. Please set the environment variable OPENAI_API_KEY.")

In [80]:
type(all_chunks)

list

In [81]:
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
from langchain.text_splitter import MarkdownTextSplitter  # Assuming you're using Markdown splitter
from langchain.docstore.document import Document

# Example: List of langchain Document objects
documents = [
    Document(page_content="## Title 1\nThis is the first chunk of text from document 1.", metadata={"source": "doc1"}),
    Document(page_content="## Title 2\nThis is the second chunk of text from document 1.", metadata={"source": "doc1"}),
    Document(page_content="## Title 3\nThis is the first chunk of text from document 2.", metadata={"source": "doc2"}),
    Document(page_content="## Title 4\nThis is the second chunk of text from document 2.", metadata={"source": "doc2"})
]

# Initialize the Markdown splitter
markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=20)

# Iterate over the list of documents and split the text
all_chunks = []
for doc in all_chunks:
    chunks = markdown_splitter.split_text(doc.page_content)  # Split the content of each document
    all_chunks.extend(chunks)  # Add the chunks to the all_chunks list

# Now `all_chunks` contains all the individual text chunks from the documents
print(all_chunks)


In [20]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained local model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = [model.encode(chunk) for chunk in chunks]

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
type(embeddings[0])

numpy.ndarray

In [25]:
import numpy as np
# Save embeddings
np.save("embeddings.npy", embeddings)

### Loading saved embeddings

In [26]:
# Load embeddings later
embeddings = np.load("embeddings.npy", allow_pickle=True)

In [31]:
import numpy as np

# Convert the list of embeddings to a NumPy array
embedding_matrix = np.vstack(embeddings).astype('float32')  # Shape: (num_chunks, embedding_dim)

In [32]:
import faiss

# Defining the dimension of the embeddings
embedding_dim = embedding_matrix.shape[1]

# Creating a FAISS index
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

print(f"Number of embeddings in the index: {index.ntotal}")

Number of embeddings in the index: 3


In [36]:
# Save the index to disk
#faiss.write_index(index, "vector_store.index")

In [37]:
# Load the index back
index = faiss.read_index("vector_store.index")
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000020649D5AEB0> >

In [40]:
header_splits

[Document(metadata={'Header 1': 'Using the SageMaker Training and Inference Toolkits<a name="amazon-sagemaker-toolkits"></a>'}, page_content='The [SageMaker Training](https://github.com/aws/sagemaker-training-toolkit) and [SageMaker Inference](https://github.com/aws/sagemaker-inference-toolkit) toolkits implement the functionality that you need to adapt your containers to run scripts, train algorithms, and deploy models on SageMaker\\. When installed, the library defines the following for users:\n+ The locations for storing code and other resources\\.\n+ The entry point that contains the code to run when the container is started\\. Your Dockerfile must copy the code that needs to be run into the location expected by a container that is compatible with SageMaker\\.\n+ Other information that a container needs to manage deployments for training and inference\\.'),
 Document(metadata={'Header 1': 'Using the SageMaker Training and Inference Toolkits<a name="amazon-sagemaker-toolkits"></a>',

In [23]:
vectorstore.save_local('vector/')

In [24]:
vectorstore= FAISS.load_local('vector/', embeddings_model, allow_dangerous_deserialization=True)

In [25]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x24d01c8c650>