### Load JSON data

In [None]:
import json
from tqdm import tqdm

# Parse the item catalogue from disk (text mode is open()'s default).
with open('../data/data.json', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

# Walk the items once, echoing each name while tqdm renders a progress bar.
for item in tqdm(data, desc="Processing items"):
    print(f"Processing item: {item['name']}")

# Report how many items were read.
print(f"Total items: {len(data)}")

In [None]:
# init env variables
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# override=True lets values from the file replace variables that are
# already set in the environment (python-dotenv's documented behaviour).
load_dotenv(override=True)

### Load Documents

In [None]:
import sys
# Add the parent directory to the import path so that `app_utils` can be
# imported below (presumably it lives one level above this notebook — confirm).
sys.path.append('..')
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document
from app_utils import generate_key

In [None]:
# Display the metadata of the item at index 5 (assumes at least six items were loaded).
data[5]['metadata']

In [None]:
def load_documents(item) -> list[Document]:
    """Load the file described by ``item`` and tag each document with its metadata.

    Args:
        item: Mapping with a ``"type"`` key (``"text"`` or ``"pdf"``), a
            ``"path"`` key (resolved relative to the parent directory) and an
            optional ``"metadata"`` mapping.

    Returns:
        The documents produced by the loader, each with the item's metadata
        merged into ``doc.metadata``.

    Raises:
        ValueError: If ``"path"`` is missing/empty or ``"type"`` is neither
            ``"text"`` nor ``"pdf"``.
    """
    # Named `file_type` so the builtin `type` is not shadowed.
    file_type = item.get("type")
    raw_path = item.get("path")
    if not raw_path:
        # Without this guard a missing path would silently become "../None".
        raise ValueError(f"Missing path for item of type: {file_type!r}")
    path = f'../{raw_path}'

    if file_type == "text":
        loader = TextLoader(path)
    elif file_type == "pdf":
        loader = PyPDFLoader(path)
    else:
        # Report the offending type (the path alone does not explain the failure).
        raise ValueError(f"Unsupported file type {file_type!r} for path: {path}")
    documents = loader.load()

    # Work on a copy so the caller's item is never modified; `or {}` also
    # covers an explicit null "metadata" value.
    metadata = dict(item.get("metadata") or {})

    technologies = metadata.get("technologies")
    if isinstance(technologies, list):
        # Flatten the list into a comma-separated string
        # (presumably because the vector store only accepts scalar metadata values).
        metadata["technologies"] = ", ".join(technologies)

        # Also expose each technology as its own boolean flag so that it can
        # be used in exact-match metadata filters.
        for tech in technologies:
            metadata[generate_key(tech)] = True

    for doc in documents:
        doc.metadata.update(metadata)

    return documents

# Try the loader on a single item: show its raw metadata, then the loaded documents.
sample_item = data[5]
print(sample_item['metadata'])
load_documents(sample_item)

In [None]:
# Load every item's documents into one flat list. Items without a path or a
# type are reported and skipped; loader failures are reported per item so one
# bad file does not abort the whole run.
documents = []
for item in tqdm(data, desc="Loading documents", unit="item"):
    # Test the raw values: the "../"-prefixed string is always truthy, and
    # binding a module-level `type` would shadow the builtin in later cells.
    raw_path = item.get("path")
    file_type = item.get("type")
    if not (raw_path and file_type):
        print(f"No valid path or type for item: {item.get('name')}")
        continue

    path = f'../{raw_path}'
    try:
        documents.extend(load_documents(item))
    except Exception as e:
        # Deliberate best-effort boundary: log and carry on with the next item.
        print(f"Error loading documents from {path}: {e}")

print(f"Total documents loaded: {len(documents)}")

In [None]:
# Display the merged metadata of the document at index 5 (assumes at least six documents were loaded).
documents[5].metadata

In [None]:
# Text splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1000-character chunks overlapping by 200 characters; add_start_index
# records each chunk's start offset in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Split every loaded document into chunks.
all_chunks = text_splitter.split_documents(documents)

print(f"Number of text chunks created: {len(all_chunks)}")

### Embedding and Vector Store

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

db_name = "../chroma_db"
# Single source of truth for the collection name: the reset below and the
# store creation must target the same collection.
collection_name = "freelance_data"
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Reset the collection if a database already exists, so that re-running this
# cell does not index the same chunks twice. The collection name must be
# passed explicitly: without it Chroma opens its default collection and the
# real one would be left untouched.
if os.path.exists(db_name):
    Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=db_name,
    ).delete_collection()
    print(f"Deleted existing collection '{collection_name}' from: {db_name}")

# Create (or re-create) the persistent ChromaDB collection
vectordb = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=db_name,
)

# Index the chunks; add_documents returns the ids of the stored entries
ids = vectordb.add_documents(all_chunks)
print(f"Number of documents indexed: {len(ids)}")

### Tests

In [None]:
# Smoke test: fetch the two chunks closest to a free-text query.
results = vectordb.similarity_search(
    query="Example of one AI project",
    k=2,
)
results

In [None]:
# Expose the vector store as a retriever returning the 2 most similar chunks.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Run several queries in one call. Query text is embedded verbatim, so the
# name must be spelled with a correctly encoded "ï".
retriever.batch(
    [
        "Who is he?",
        "What is Loïc's current position?",
    ],
)

In [None]:
# Filter metadata example
filtered_results = vectordb.similarity_search(
    "AI project",
    k=2,
    filter={"category": "About Me"}
)
filtered_results

In [None]:
# Build the metadata flag names for the two technologies of interest.
key1 = generate_key("AI")
key2 = generate_key("Flutter")

# Keep only chunks flagged with BOTH technologies.
filtered_results = vectordb.similarity_search(
    query="Flutter",
    filter={'$and': [{flag: True} for flag in (key1, key2)]},
    k=2,
)
filtered_results

In [None]:
# Use the LangChain Chroma API to query with filters
results = vectordb.similarity_search(
    "Exemple de projet d'IA",
    k=10,
    filter={'$or': [{'ai': True}, {'dart': True}]}
)
results