In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Access your variables
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your file
with open("../data/dxfactor_full_scrape.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

chunks = splitter.split_text(full_text)
print(f"✅ Total chunks: {len(chunks)}")

✅ Total chunks: 158


In [3]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

embedding_model = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_api_key,
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed your chunks
embeddings = embedding_model.embed_documents(chunks)

print(f"✅ Embeddings shape: {len(embeddings)} vectors of length {len(embeddings[0])}")

✅ Embeddings shape: 158 vectors of length 384


In [4]:
from langchain.vectorstores import FAISS

# Create vector store
vectorstore = FAISS.from_texts(chunks, embedding_model)

In [5]:
# One folder up from the current directory
SAVE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../vectorstores"))
os.makedirs(SAVE_DIR, exist_ok=True)

OUTPUT_FILE = os.path.join(SAVE_DIR, "dxfactor")

In [6]:
# Save to local disk
vectorstore.save_local(OUTPUT_FILE)

In [7]:
vectorstore = FAISS.load_local("../vectorstores/dxfactor", embedding_model, allow_dangerous_deserialization=True)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x26e4eaf7d50>

In [8]:
retriever = vectorstore.as_retriever(search_type="similarity", k=3)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000026E4EAF7D50>, search_kwargs={})