In [2]:
import pandas as pd

json_dev = pd.read_json("data/dev-v2.0.json")
json_train = pd.read_json("data/train-v2.0.json")

In [3]:
def data_extraction(json_data):
    extracted_data = []

    # Iterate through each item in the 'data' column
    for item in json_data['data']:
        title = item['title']  # Access 'title' within the item
        for paragraph in item['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Assuming we take the first answer provided for each question
                if qa['answers']:
                    answer_text = qa['answers'][0]['text']
                    answer_start = qa['answers'][0]['answer_start']
                    answer_end = answer_start + len(answer_text)
                else:
                    # Handle cases where there might be no answers (e.g., for is_impossible=True)
                    answer_text = None
                    answer_start = None
                    answer_end = None

                extracted_data.append({
                    'title': title,
                    'context': context,
                    'question': question,
                    'answer': answer_text,
                    'answer_start': answer_start,
                    'answer_end': answer_end
                })

    return pd.DataFrame(extracted_data)

In [4]:
dev_columns = data_extraction(json_dev)
train_columns = data_extraction(json_train)

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def create_recursive_chunks(df, chunk_size=256, chunk_overlap=32):
    """
    Splits the unique contexts of a DataFrame into semantic chunks using
    the recommended RecursiveCharacterTextSplitter.

    Args:
        df (pd.DataFrame): DataFrame containing 'title' and 'context' columns.
        chunk_size (int): The target size for each chunk in characters.
        chunk_overlap (int): The number of characters to overlap between chunks.

    Returns:
        pd.DataFrame: A new DataFrame with columns ['title', 'context_chunk', 'chunk_id'].
    """
    # 1. Initialize the RecursiveCharacterTextSplitter
    # This splitter attempts to split text based on a prioritized list of separators.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # This list of separators is key. It tries to split by paragraph,
        # then by sentence, then by space, ensuring chunks are as
        # semantically coherent as possible.
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len
    )

    chunked_data = []
    
    # 2. Process only unique contexts to avoid redundant work
    unique_contexts_df = df.drop_duplicates(subset=['title', 'context']).reset_index(drop=True)

    # 3. Iterate over each unique document
    for index, row in unique_contexts_df.iterrows():
        title = row['title']
        context = row['context']
        
        # 4. Use the splitter to create chunks from the context
        chunks = splitter.split_text(context)

        # 5. Create a new record for each chunk
        for i, chunk_text in enumerate(chunks):
            chunked_data.append({
                'title': title,
                'context_chunk': chunk_text,
                'chunk_id': f"{index}_{i}" # A unique ID for tracking each chunk
            })

    return pd.DataFrame(chunked_data)

# --- Parameters and Execution ---
CHUNK_SIZE = 256
CHUNK_OVERLAP = 32

# Call the function to create the new chunked DataFrame
df_chunked_for_retrieval = create_recursive_chunks(dev_columns, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# --- Display the results ---
print(f"Total chunks created for retrieval: {len(df_chunked_for_retrieval)}\n")
print("Sample of the new chunked DataFrame:")
print(df_chunked_for_retrieval)


  from .autonotebook import tqdm as notebook_tqdm



Total chunks created for retrieval: 5612

Sample of the new chunked DataFrame:
        title                                      context_chunk chunk_id
0     Normans  The Normans (Norman: Nourmands; French: Norman...      0_0
1     Normans  . They were descended from Norse ("Norman" com...      0_1
2     Normans  . Through generations of assimilation and mixi...      0_2
3     Normans  . The distinct cultural and ethnic identity of...      0_3
4     Normans  The Norman dynasty had a major political, cult...      1_0
...       ...                                                ...      ...
5607    Force  . According to the Second law of thermodynamic...   1202_2
5608    Force  The pound-force has a metric counterpart, less...   1203_0
5609    Force  . The kilogram-force leads to an alternate, bu...   1203_1
5610    Force  . The kilogram-force is not a part of the mode...   1203_2
5611    Force  . Other arcane units of force include the sthè...   1203_3

[5612 rows x 3 columns]


In [6]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
import os

# --- 1. Choose and Load Embedding Model ---
print("Loading sentence transformer model: all-MiniLM-L6-v2...")
model_name = 'all-MiniLM-L6-v2'
# We recommend using a GPU for encoding, so specify 'cuda' if available
# If not, 'cpu' will be used automatically.
model = SentenceTransformer(model_name, device='cuda' if 'cuda' in os.environ.get('CUDA_VISIBLE_DEVICES', '') else 'cpu')
print("Model loaded successfully.")


# --- 2. Encode All Chunks ---
# Extract the text chunks to be encoded
chunks_to_encode = df_chunked_for_retrieval['context_chunk'].tolist()

print(f"Encoding {len(chunks_to_encode)} chunks...")
# The model.encode() method converts text into a NumPy array of embeddings
# show_progress_bar=True provides a visual indicator of progress.
embeddings = model.encode(chunks_to_encode, show_progress_bar=True)
print("Encoding complete.")
print(f"Shape of embeddings array: {embeddings.shape}")


# --- 3. Store Embeddings in NumPy Array (already done by model.encode) ---
# The 'embeddings' variable is already the NumPy array you need.


# --- 4. Build FAISS Index ---
print("Building FAISS index...")
# Get the dimensionality of the embeddings
embedding_dim = embeddings.shape[1]

# For 'all-MiniLM-L6-v2', embeddings are normalized.
# This means Inner Product (IP) is equivalent to Cosine Similarity.
# IndexFlatIP is a fast index for this purpose.
index = faiss.IndexFlatIP(embedding_dim)

# Add the embeddings to the index
index.add(embeddings)
print(f"FAISS index built. Total vectors in index: {index.ntotal}")


# --- 5. Save the Artifacts ---
print("Saving artifacts to disk...")

# Create directories if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('data', exist_ok=True)

# Save embeddings array
with open('models/embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)
print("- Saved embeddings to models/embeddings.pkl")

# Save FAISS index
faiss.write_index(index, 'models/faiss.index')
print("- Saved FAISS index to models/faiss.index")

# Save metadata (the chunked DataFrame)
# This is crucial for mapping search results back to the original text.
df_chunked_for_retrieval.to_pickle('data/metadata.pkl')
print("- Saved metadata to data/metadata.pkl")

print("\nAll steps completed successfully!")


Loading sentence transformer model: all-MiniLM-L6-v2...
Model loaded successfully.
Encoding 5612 chunks...


Batches: 100%|██████████| 176/176 [00:37<00:00,  4.65it/s]

Encoding complete.
Shape of embeddings array: (5612, 384)
Building FAISS index...
FAISS index built. Total vectors in index: 5612
Saving artifacts to disk...
- Saved embeddings to models/embeddings.pkl
- Saved FAISS index to models/faiss.index
- Saved metadata to data/metadata.pkl

All steps completed successfully!





In [7]:
import os
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Any

def create_and_save_faiss_index(
    chunks: List[str],
    metadata: List[Dict[str, Any]],
    model_name: str = 'all-MiniLM-L6-v2',
    embeddings_path: str = 'models/embeddings.pkl',
    faiss_index_path: str = 'models/faiss.index',
    metadata_path: str = 'data/metadata.pkl'
) -> None:
    """
    Encodes text chunks, builds a FAISS index for cosine similarity,
    and saves the embeddings, FAISS index, and metadata.

    Args:
        chunks (List[str]): A list of text strings to be embedded.
        metadata (List[Dict[str, Any]]): A list of dictionaries, where each dictionary
                                          corresponds to the metadata for a chunk.
        model_name (str): The name of the SentenceTransformer model to use.
        embeddings_path (str): The file path to save the embeddings NumPy array.
        faiss_index_path (str): The file path to save the FAISS index.
        metadata_path (str): The file path to save the metadata.
    """

    # 1. Choose embedding model
    print(f"Loading embedding model: {model_name}...")
    model = SentenceTransformer(model_name)
    print("Model loaded successfully.")

    # Create directories if they don't exist
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)

    # 2. Encode all chunks
    print(f"Encoding {len(chunks)} chunks...")
    embeddings = model.encode(chunks, show_progress_bar=True)
    print("Chunks encoded.")

    # 3. Store embeddings in NumPy array
    # Ensure embeddings are float32, which is standard for FAISS
    embeddings_np = np.array(embeddings).astype('float32')
    print(f"Embeddings shape: {embeddings_np.shape}")

    # Get embedding dimension
    d = embeddings_np.shape[1]

    print("Normalizing embeddings for cosine similarity...")
    faiss.normalize_L2(embeddings_np)
    print("Embeddings normalized.")

    print(f"Building FAISS IndexFlatL2 with dimension {d}...")
    index = faiss.IndexFlatL2(d)

    num_vectors = embeddings_np.shape[0]
    index.add(x=embeddings_np)

    print(f"FAISS index built with {index.ntotal} vectors.")
    # 5. Save:
    # 5.1. Save embeddings.pkl
    print(f"Saving embeddings to {embeddings_path}...")
    with open(embeddings_path, 'wb') as f:
        pickle.dump(embeddings_np, f)
    print("Embeddings saved.")

    # 5.2. Save faiss.index
    print(f"Saving FAISS index to {faiss_index_path}...")
    faiss.write_index(index, faiss_index_path)
    print("FAISS index saved.")

    # 5.3. Save metadata.pkl
    print(f"Saving metadata to {metadata_path}...")
    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)
    print("Metadata saved.")

    print("\nProcess completed successfully!")

if __name__ == "__main__":
    import pickle

    # Define the file path
    file_path = 'data/metadata.pkl'

    # Open the file in binary read mode and load the data
    with open(file_path, 'rb') as file:
        metadata = pickle.load(file)

    # Now 'loaded_list' is a Python list containing the data from the .pkl file
    print(type(metadata))
    print(metadata)

    create_and_save_faiss_index(
        chunks=df_chunked_for_retrieval['context_chunk'].tolist(),
        metadata=metadata
    )

    # --- Verification (Optional) ---
    print("\n--- Verifying saved files ---")
    try:
        # Load embeddings
        with open('models/embeddings.pkl', 'rb') as f:
            loaded_embeddings = pickle.load(f)
        print(f"Loaded embeddings shape: {loaded_embeddings.shape}")

        # Load FAISS index
        loaded_index = faiss.read_index('models/faiss.index')
        print(f"Loaded FAISS index total vectors: {loaded_index.ntotal}")

        # Load metadata
        with open('data/metadata.pkl', 'rb') as f:
            loaded_metadata = pickle.load(f)
        print(f"Loaded metadata count: {len(loaded_metadata)}")

        # Example search (optional)
        query_text = "What is AI?"
        query_embedding = SentenceTransformer('all-MiniLM-L6-v2').encode([query_text]).astype('float32')
        faiss.normalize_L2(query_embedding) # Normalize query embedding as well

        k = 3 # Number of nearest neighbors to retrieve
        distances, indices = loaded_index.search(query_embedding, k)

        print(f"\nTop {k} results for query: '{query_text}'")
        for i in range(k):
            print(f"  Rank {i+1}: Chunk ID {loaded_metadata[indices[0][i]]['id']} (Distance: {distances[0][i]:.4f})")
            print(f"    Text: {df_chunked_for_retrieval['context_chunk'].tolist()[indices[0][i]]}")
            print(f"    Metadata: {loaded_metadata[indices[0][i]]}")

    except Exception as e:
        print(f"Error during verification: {e}")



<class 'pandas.core.frame.DataFrame'>
        title                                      context_chunk chunk_id
0     Normans  The Normans (Norman: Nourmands; French: Norman...      0_0
1     Normans  . They were descended from Norse ("Norman" com...      0_1
2     Normans  . Through generations of assimilation and mixi...      0_2
3     Normans  . The distinct cultural and ethnic identity of...      0_3
4     Normans  The Norman dynasty had a major political, cult...      1_0
...       ...                                                ...      ...
5607    Force  . According to the Second law of thermodynamic...   1202_2
5608    Force  The pound-force has a metric counterpart, less...   1203_0
5609    Force  . The kilogram-force leads to an alternate, bu...   1203_1
5610    Force  . The kilogram-force is not a part of the mode...   1203_2
5611    Force  . Other arcane units of force include the sthè...   1203_3

[5612 rows x 3 columns]
Loading embedding model: all-MiniLM-L6-v2...
Mode

Batches: 100%|██████████| 176/176 [00:49<00:00,  3.56it/s]


Chunks encoded.
Embeddings shape: (5612, 384)
Normalizing embeddings for cosine similarity...
Embeddings normalized.
Building FAISS IndexFlatL2 with dimension 384...
FAISS index built with 5612 vectors.
Saving embeddings to models/embeddings.pkl...
Embeddings saved.
Saving FAISS index to models/faiss.index...
FAISS index saved.
Saving metadata to data/metadata.pkl...
Metadata saved.

Process completed successfully!

--- Verifying saved files ---
Loaded embeddings shape: (5612, 384)
Loaded FAISS index total vectors: 5612
Loaded metadata count: 5612

Top 3 results for query: 'What is AI?'
Error during verification: np.int64(183)


In [9]:
import pandas as pd

chunks_file = df_chunked_for_retrieval['context_chunk']
chunks_file.to_csv('output_pandas.csv', index=False)