In [None]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from dotenv import load_dotenv
from langchain.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector

# Load .env first (overriding existing environment variables) so the API key
# is in the environment *before* the embeddings client is constructed.
load_dotenv(override=True)

# If the key is in neither .env nor the environment, prompt for it instead of
# failing with an opaque TypeError from assigning None into os.environ.
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass.getpass("Enter your Google API key: ")
# Confirm presence only; never echo the secret into saved notebook output.
print("Google API Key loaded.")

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    # Set PGVECTOR_CONNECTION to keep real credentials out of the notebook;
    # the fallback is the original local development connection string.
    connection=os.getenv(
        "PGVECTOR_CONNECTION",
        "postgresql+psycopg://postgres:password@localhost:5432/clapnq",
    ),
)
# Uncomment to drop the collection before re-ingesting from scratch.
# vector_store.delete_collection()

In [None]:
def remove_wikipedia_footer(text: str) -> str:
    """Drop the trailing Wikipedia boilerplate from a page's plain text.

    Each known footer marker is checked in turn; whenever one is present,
    everything from its first occurrence onward is discarded.

    Args:
        text: Plain-text page content.

    Returns:
        The content preceding the footer markers, with leading and trailing
        whitespace stripped. Text without any marker is only stripped.
    """
    boilerplate_starts = (
        "Retrieved from",
        "Categories :",
        "External links ( edit )",
        "Text is available under the Creative Commons",
        "By using this site",
        "About Wikipedia",
    )
    body = text
    for boundary in boilerplate_starts:
        # partition() hands back the whole string as its head when the
        # boundary is absent, so no separate membership test is required.
        body, _, _ = body.partition(boundary)
    return body.strip()

import re

def remove_inline_templates(text: str) -> str:
    """Strip Wikipedia editing artefacts that leak into plain-text dumps.

    Removes three kinds of parenthetical noise:
      * ``( edit )`` section links (any internal whitespace),
      * ``( Learn how and when to ... )`` maintenance hints,
      * any single parenthetical mentioning ``template message``.

    Args:
        text: Plain-text page content.

    Returns:
        The text with those parentheticals removed. Surrounding whitespace is
        left as-is.
    """
    # Remove '( edit )' and other template garbage
    text = re.sub(r"\(\s*edit\s*\)", "", text)
    text = re.sub(r"\( Learn how and when to.*?\)", "", text)
    # The prefix must not cross another parenthesis: a plain `.*?` here would
    # start at an earlier, unrelated "(" on the same line and delete all the
    # legitimate text between it and the template parenthetical.
    text = re.sub(r"\([^()\n]*?template message.*?\)", "", text)
    return text

def clean_wikipedia_plaintext(text: str) -> str:
    """Run the full cleanup pipeline on a Wikipedia plain-text dump.

    The footer is cut off first, then inline template artefacts are removed,
    and finally whitespace is normalised.

    Args:
        text: Raw plain-text page content.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    cleaned = remove_inline_templates(remove_wikipedia_footer(text))

    # Whitespace passes, applied in this order: runs of newlines become one
    # newline, then any run of two or more whitespace characters (which can
    # include a newline) becomes a single space.
    whitespace_passes = (
        (r"\n{2,}", "\n"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in whitespace_passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 4000
CHUNK_OVERLAP = 150

# The splitter's default separators are used. A Wiki-oriented list such as
# ["\n\n", "\n", ".", " "] could be supplied through `separators` instead.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# chunks = text_splitter.split_documents(docs)

In [None]:
#load json file
import json
from langchain.schema import Document
file_path = 'rag_to_be_tested-Data/clapnq_dev_answerable_orig.jsonl'

# Read the JSONL explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding, and skip blank lines (e.g. a trailing newline)
# that would otherwise make json.loads raise.
with open(file_path, "r", encoding="utf-8") as f:
    json_lines = [json.loads(line) for line in f if line.strip()]

total_docs = len(json_lines)

# For each record: clean the page text, wrap it in a Document carrying the
# record's metadata, split it into chunks, and store the usable chunks.
for index, item in enumerate(json_lines, start=1):
    raw_text = item["document_plaintext"]
    cleaned_text = clean_wikipedia_plaintext(raw_text)
    docs = Document(
        page_content=cleaned_text,
        metadata={
            "title": item.get("document_title", ""),
            "url": item.get("document_url", ""),
            "example_id": item.get("example_id", ""),
            "language": item.get("language", "")
        }
    )

    chunks = text_splitter.split_documents([docs])

    # Keep only chunks longer than 30 characters that contain at least one
    # alphabetic character.
    documents = [
        chunk for chunk in chunks
        if len(chunk.page_content) > 30 and any(c.isalpha() for c in chunk.page_content)
    ]

    # Skip the store call when every chunk was filtered out; there is nothing
    # to embed or insert for this record.
    if documents:
        vector_store.add_documents(documents)

    # Report after the insert so "processed" is accurate, with a 1-based count.
    print(f'Document {index} out of total {total_docs} processed.')
