In [None]:
import getpass
import json
import os
import pickle
import shutil

import wikipediaapi

from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


### Extract Wikipedia pages

In [None]:
def extract_wikipedia_page(page_name, language='he'):
    """
    Fetch a Wikipedia article and return its content as a nested dictionary.

    Parameters:
        page_name (str): Title of the article to fetch.
        language (str): Wikipedia language edition code (defaults to 'he', Hebrew).

    Returns:
        dict: Keys "title", "summary", "url" and "sections". "sections" maps each
            section title to {"text": ..., "subsections": {...}}, nested recursively
            with the same shape.

    Raises:
        ValueError: If the requested page does not exist.
    """
    client = wikipediaapi.Wikipedia(
        language=language,
        user_agent='BenGurionBot/1.0 (noa20808@gmail.com)'
    )
    article = client.page(page_name)

    # Guard clause: stop early when the article cannot be found
    if not article.exists():
        raise ValueError(f"The page '{page_name}' was not found on Wikipedia.")

    def build_section_tree(section_list):
        # Titles are used as dict keys, so sibling sections that share a title
        # collapse into a single entry (the last one wins).
        return {
            node.title: {
                "text": node.text,
                "subsections": build_section_tree(node.sections),
            }
            for node in section_list
        }

    return {
        "title": article.title,
        "summary": article.summary,
        "url": article.fullurl,
        "sections": build_section_tree(article.sections),
    }

def save_wikipedia_page_data(page_name, language='he', output_dir="./data"):
    """
    Saves the extracted Wikipedia page data as a JSON file.

    The file is written to "<output_dir>/<page_name with spaces replaced by
    underscores>_wiki.json", encoded as UTF-8 with non-ASCII characters kept
    as-is (no \\u escapes).

    Parameters:
        page_name (str): The name of the Wikipedia page.
        language (str): The language code of Wikipedia (default is 'he' for Hebrew).
        output_dir (str): The directory where the JSON file will be saved.
            An empty string means the current working directory.

    Raises:
        ValueError: Propagated from extract_wikipedia_page when the page does not exist.
    """
    # Extract the Wikipedia page data
    data = extract_wikipedia_page(page_name, language)

    # Define the output file path
    output_path = os.path.join(output_dir, f"{page_name.replace(' ', '_')}_wiki.json")

    # Create the parent directory of the final path. It is skipped when the path
    # has no directory component (output_dir=""), because os.makedirs("") raises
    # FileNotFoundError.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the data to a JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Wiki JSON saved to {output_path}")

In [None]:
# Fetch the Hebrew Wikipedia article on David Ben-Gurion and store it as JSON
# in the default output directory.
save_wikipedia_page_data(page_name='דוד בן-גוריון')

Wiki JSON saved to ./data/ben_gurion_wiki.json


In [None]:
def split_text(text, max_words=260, overlap=35):
    """
    Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start (max_words - overlap) words apart, so each chunk
    repeats the last `overlap` words of the previous one. Splitting stops as
    soon as a chunk reaches the end of the text, so no trailing chunk made up
    only of already-covered words is produced.

    Parameters:
        text (str): The text to split.
        max_words (int): Maximum number of words per chunk (must be > 0).
        overlap (int): Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < max_words).

    Returns:
        list[str]: The chunks, each with its words joined by single spaces.
            Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If max_words or overlap is out of range. A non-positive
            step would never advance through the text, and a negative overlap
            would silently skip words.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer.")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words.")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # This chunk already contains the last word; another window would only
        # repeat words that are in this chunk.
        if start + max_words >= len(words):
            break
    return chunks

def wiki_json_to_chunks(json_path, source_id="wikipedia_json", max_words=260, overlap=35, start_idx=0):
    """
    Turn a saved Wikipedia JSON file into a flat list of chunked Documents.

    The "sections" tree of the JSON file is walked depth-first: the text of a
    section is chunked first, then its subsections, then the next sibling.
    Only the "sections" entry is read from the file.

    Parameters:
        json_path (str): Path of the JSON file to read (UTF-8).
        source_id (str): Value stored under the "source" metadata key.
        max_words (int): Passed through to split_text.
        overlap (int): Passed through to split_text.
        start_idx (int): "idx" metadata value given to the first chunk; every
            following chunk gets the next integer.

    Returns:
        list[Document]: One Document per chunk with metadata keys "source",
            "section" (titles joined with " > " from the top-level section
            down) and "idx".
    """
    with open(json_path, encoding='utf-8') as json_file:
        payload = json.load(json_file)

    def iter_titled_chunks(section_map, parent_title=None):
        # Yields (breadcrumb title, chunk text) pairs in depth-first order
        for title, content in section_map.items():
            breadcrumb = f"{parent_title} > {title}" if parent_title else title
            body = content.get("text", "").strip()
            for piece in split_text(body, max_words, overlap):
                yield breadcrumb, piece
            yield from iter_titled_chunks(content.get("subsections", {}), breadcrumb)

    titled_chunks = iter_titled_chunks(payload["sections"])
    return [
        Document(
            page_content=piece,
            metadata={
                "source": source_id,
                "section": breadcrumb,
                "idx": idx
            }
        )
        for idx, (breadcrumb, piece) in enumerate(titled_chunks, start=start_idx)
    ]

In [None]:
# Load the existing chunks from the pickle file so the new Wikipedia chunks can
# continue their "idx" numbering after the highest index already in use.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a chunks.pkl that you produced yourself.
with open('./data/chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

# default=-1 makes the numbering start at 0 when there are no existing chunks
# (max() on an empty sequence would otherwise raise ValueError).
max_existing_idx = max((doc.metadata["idx"] for doc in chunks), default=-1)

wiki_docs = wiki_json_to_chunks(
    # Must be the file written by save_wikipedia_page_data('דוד בן-גוריון'), which
    # names it "<output_dir>/<page name with spaces replaced by underscores>_wiki.json".
    json_path="./data/דוד_בן-גוריון_wiki.json",
    start_idx=max_existing_idx + 1
)


### Add the newly extracted Wikipedia chunks to the vector database (FAISS index)

In [None]:
# Do not hard-code the key in the notebook. Use the OPENAI_API_KEY that is
# already set in the environment, and only prompt for it (without echoing)
# when it is missing. Assigning a placeholder here would overwrite a valid key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Kept for backward compatibility with cells that read `api_key`
api_key = os.environ["OPENAI_API_KEY"]

In [None]:
def create_augmented_vectorstore_from_json(
    original_index_path,
    output_index_path,
    wiki_chunks,
    embedding_model=None
):
    """
    Copy an existing FAISS index directory and add new documents to the copy.

    The original index is never modified: it is copied to output_index_path,
    the copy is loaded, wiki_chunks are added to it, and the copy is saved
    back to output_index_path.

    Parameters:
        original_index_path (str): Directory of the existing FAISS index.
        output_index_path (str): Directory to create for the augmented index.
            Must not exist yet.
        wiki_chunks (list): Documents to add to the copied index.
        embedding_model: Embeddings object used to load the index and embed
            the new documents. When None (the default), an
            OpenAIEmbeddings(model="text-embedding-ada-002") instance is
            created at call time.

    Raises:
        FileExistsError: If output_index_path already exists.
    """
    # 1. Copy the original index (refusing to overwrite an existing one)
    if os.path.exists(output_index_path):
        raise FileExistsError(f"{output_index_path} already exists. Choose a new path.")

    # Build the default model lazily. A default-argument expression is evaluated
    # once, when the function is defined, which would create the OpenAI client
    # as soon as this cell runs and share that one instance across all calls.
    if embedding_model is None:
        embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

    shutil.copytree(original_index_path, output_index_path)

    try:
        # 2. Load the new index
        # NOTE: allow_dangerous_deserialization=True opts in to loading pickled
        # data from the index directory; only use it on indexes you trust.
        vectorstore = FAISS.load_local(output_index_path, embeddings=embedding_model, allow_dangerous_deserialization=True)

        # 3. Add to the new index
        vectorstore.add_documents(wiki_chunks)
        vectorstore.save_local(output_index_path)
    except BaseException:
        # The directory was created by this call, so it is safe to remove it.
        # Leaving a half-built copy behind would make the next attempt fail
        # with FileExistsError. BaseException also covers a kernel interrupt.
        shutil.rmtree(output_index_path, ignore_errors=True)
        raise

    print(f"✅ Augmented vectorstore created at {output_index_path} with {len(wiki_chunks)} wiki chunks.")

2025-05-09 15:07:02.889800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746792422.912760 1102184 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746792422.919102 1102184 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746792422.935036 1102184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746792422.935051 1102184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746792422.935053 1102184 computation_placer.cc:177] computation placer alr

In [None]:
# Build the augmented copy of the index. The call raises FileExistsError if
# the output directory already exists.
create_augmented_vectorstore_from_json(
    original_index_path="./faiss_index_openai",
    output_index_path="./faiss_index_openai_copy",
    wiki_chunks=wiki_docs,
)

✅ Augmented vectorstore created at ./faiss_alephbert_index_copy with 125 wiki chunks.


In [9]:
# Append the Wikipedia chunks to the previously loaded chunks and persist the
# combined list as a pickle file.
combined_chunks = chunks + wiki_docs

with open('./data/combined_chunks.pkl', mode='wb') as pkl_file:
    pickle.dump(combined_chunks, pkl_file)