In [None]:
import json

# ===============================================
# A. CONFIGURATION
# ===============================================

# --- File and Chunking Settings ---
# Relative path of the JSON content list that the loading step opens.
FILE_PATH = 'TBS_Handbook-2022_content_list.json'
# Soft limit passed to the chunker as `max_size`: a chunk is flushed *before*
# a block that would push it past this size, but a single oversized block is
# never split, so individual chunks can still exceed it.
MAX_CHUNK_SIZE = 500  # Target maximum characters per chunk
# Passed to the chunker as `overlap`: the tail of a size-flushed chunk is
# copied to the start of the next chunk (not applied across page breaks).
CHUNK_OVERLAP = 50    # Number of characters to overlap between chunks

# ===============================================
# B. DATA LOADING AND ERROR HANDLING
# ===============================================

print(f"--- 1. Loading Data from {FILE_PATH} ---")
# Stays an empty list on any failure so the rest of the notebook can still run
# and report "no chunks" instead of raising a NameError.
json_blocks = []
try:
    # Explicit UTF-8: JSON is UTF-8 by specification, while the default text
    # encoding of open() is platform dependent and can mis-decode (or fail on)
    # the non-ASCII characters the handbook text contains.
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        # Load and parse the JSON file into a Python list of dictionaries
        loaded_content = json.load(f)
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{FILE_PATH}'. Check file name and path in Colab.")
except json.JSONDecodeError as exc:
    print("❌ ERROR: Failed to parse JSON content. The file may be corrupt.")
    # Line/column of the failure makes a damaged file much easier to locate.
    print(f"   Details: {exc}")
else:
    # The chunking step indexes and iterates the result as a list of block
    # dictionaries, so reject any other top-level JSON value up front.
    if isinstance(loaded_content, list):
        json_blocks = loaded_content
        print(f"✅ Success: Loaded {len(json_blocks)} blocks.")
    else:
        print(f"❌ ERROR: Expected a JSON list of blocks, got {type(loaded_content).__name__}.")

# ===============================================
# C. CHUNKING AND AGGREGATION FUNCTION
# ===============================================

def aggregate_blocks_into_chunks(blocks, max_size, overlap):
    """Aggregate small text/table blocks into larger chunks with character overlap.

    Blocks are appended to a running chunk, joined by newlines. The running
    chunk is flushed when the page index changes, or when adding the next
    block would push it past ``max_size``. Only in the size-triggered case is
    the tail of the flushed chunk carried over as overlap.

    Args:
        blocks: Sequence of dictionaries. Recognised keys are ``type``
            (``'text'`` or ``'table'``), ``text``, ``table_body`` and
            ``page_idx``. Blocks of any other type, tables without
            ``table_body`` and blocks with blank content are skipped.
        max_size: Target maximum number of characters per chunk. This is a
            soft limit: a single block longer than ``max_size`` is kept whole,
            and the carried-over overlap is added on top of the new block.
        overlap: Number of trailing characters of a size-flushed chunk to
            repeat at the start of the next chunk. Values <= 0 disable overlap.

    Returns:
        A list of ``{'text_chunk': str, 'page_idx_start': page}`` dictionaries
        in document order. A chunk never spans two page indices.
    """
    if not blocks:
        return []

    # A non-positive overlap means "no overlap". Without this clamp a zero
    # overlap would slice ``current_chunk[-0:]``, which is the *entire* chunk,
    # duplicating every flushed chunk into its successor.
    overlap = max(overlap, 0)

    final_chunks = []
    current_chunk = ""
    # Initialize page index tracking with a safe default or the first block's page
    current_page_idx = blocks[0].get('page_idx', -1)

    for block in blocks:
        # 1. Extract Content: Handle both 'text' and 'table' types.
        # ``.get`` keeps a block without a 'type' key from aborting the whole run.
        block_type = block.get('type')
        content = ""

        if block_type == 'text':
            # ``or ''`` also covers an explicit JSON null in the 'text' field.
            content = block.get('text') or ''
        elif block_type == 'table' and 'table_body' in block:
            # Format tables clearly for the embedding model to recognize structure
            table_data = block.get('table_body', '')
            content = f"\n\n[TABLE START, Page {block.get('page_idx')}]\n{table_data}\n[TABLE END]\n\n"

        if not content.strip():
            continue  # Skip empty blocks

        # 2. Check for Page Break: A page break is a natural semantic boundary
        page_idx = block.get('page_idx', current_page_idx)
        if page_idx != current_page_idx and current_chunk:
            # Finalize the current chunk before the page change
            final_chunks.append({
                'text_chunk': current_chunk.strip(),
                'page_idx_start': current_page_idx,
            })
            current_chunk = ""  # Start clean on the new page (no overlap across pages)

        current_page_idx = page_idx

        # 3. Aggregation and Size Check (The Core Logic)
        content_to_add = content.strip()

        # Flush first if adding the new content would exceed the max size.
        # ``current_chunk`` is non-empty here only if it belongs to the same
        # page, so ``current_page_idx`` is still the right page to record.
        if current_chunk and len(current_chunk) + len(content_to_add) > max_size:
            final_chunks.append({
                'text_chunk': current_chunk.strip(),
                'page_idx_start': current_page_idx,
            })

            # Carry the tail of the flushed chunk into the next one.
            overlap_length = min(overlap, len(current_chunk))
            if overlap_length > 0:
                overlap_text = current_chunk[-overlap_length:].lstrip()
            else:
                overlap_text = ""

            # Start the new chunk with the overlap + the new content
            current_chunk = overlap_text + "\n" + content_to_add
        else:
            # Continue building the current chunk
            current_chunk += "\n" + content_to_add

    # 4. Save the final remaining chunk
    if current_chunk.strip():
        final_chunks.append({
            'text_chunk': current_chunk.strip(),
            'page_idx_start': current_page_idx,
        })

    return final_chunks

# Execute the chunking function
final_chunks = aggregate_blocks_into_chunks(json_blocks, MAX_CHUNK_SIZE, CHUNK_OVERLAP)

# Extract the list of strings for the embedding model input.
# Bound unconditionally (an empty list when nothing was chunked) so later
# cells never hit a NameError or silently reuse a stale list from an earlier run.
texts_to_embed = [chunk['text_chunk'] for chunk in final_chunks]

# ===============================================
# D. OUTPUT AND PREPARATION FOR EMBEDDING
# ===============================================

print("\n--- 2. Chunking Results ---")
print(f"Total chunks generated: {len(final_chunks)}")
if final_chunks:
    print("\nExample Chunk (First 200 characters):")
    print("--------------------------------------------------")
    print(final_chunks[0]['text_chunk'][:200] + "...")
    print("--------------------------------------------------")

    print("\n--- 3. Preparation for Vector Embeddings ---")
    print(f"List length ready for embedding (Step 2): {len(texts_to_embed)}")
    print(f"First text element: '{texts_to_embed[0][:50]}...'")
else:
    print("No chunks generated. Please resolve the file loading error in section B.")

--- 1. Loading Data from TBS_Handbook-2022_content_list.json ---
✅ Success: Loaded 282 blocks.

--- 2. Chunking Results ---
Total chunks generated: 155

Example Chunk (First 200 characters):
--------------------------------------------------
Educating to Lead
Ministry of Higher Education and Scientific Research University of Tunis
Tunis Business School
“Educating Future Leaders and Managers for a Global Economy”
SCHOOL HANDBOOK
Version: S...
--------------------------------------------------

--- 3. Preparation for Vector Embeddings ---
List length ready for embedding (Step 2): 155
First text element: 'Educating to Lead
Ministry of Higher Education and...'


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# 1. Define the model
# 'all-MiniLM-L6-v2' is fast, efficient, and generates 384-dimensional vectors.
# If you need higher precision, use 'all-mpnet-base-v2' (768-dim, but slower).
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# The chunk list must come from the chunking cell. Fail early with a clear
# message instead of falling through to a NameError (or to a stale
# `texts_to_embed` left over from an earlier run) further down.
if not globals().get('final_chunks'):
    raise RuntimeError(
        "'final_chunks' is missing or empty. Run the chunking cell successfully first."
    )

# Rebuilt from `final_chunks` so texts, metadata rows and (later) embedding
# rows are guaranteed to share the same order.
texts_to_embed = [chunk['text_chunk'] for chunk in final_chunks]

print("--- 1. Loading Model and Data ---")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print(f"Model loaded: {EMBEDDING_MODEL_NAME}")
print(f"Total texts to encode: {len(texts_to_embed)}")

# 2. Generate the embeddings
print("\n--- 2. Generating Embeddings (This may take a moment) ---")
# The encode method converts the list of strings into a NumPy array of vectors
embeddings = model.encode(texts_to_embed, show_progress_bar=True)

print("✅ Embedding generation complete.")
print(f"Shape of the embeddings array: {embeddings.shape}")
# The shape is (Number of Chunks, Dimensionality), e.g., (150, 384)

# 3. Combine Data and Embeddings for Storage
# It's vital to keep the vectors linked to their original text and metadata.

# Create a DataFrame to hold all the data; row i of the frame corresponds to
# row i of `embeddings` because both were derived from `final_chunks` in order.
data_df = pd.DataFrame(final_chunks)
data_df['embedding'] = list(embeddings)  # Add the vector as a new column

print("\n--- 3. Combined Data Structure ---")
print(f"Final DataFrame Columns: {list(data_df.columns)}")
print("First chunk's embedding (first 5 dimensions):")
# `.iloc[0]` selects the first row by position, independent of index labels.
print(data_df['embedding'].iloc[0][:5])

--- 1. Loading Model and Data ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded: all-MiniLM-L6-v2
Total texts to encode: 155

--- 2. Generating Embeddings (This may take a moment) ---


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Embedding generation complete.
Shape of the embeddings array: (155, 384)

--- 3. Combined Data Structure ---
Final DataFrame Columns: ['text_chunk', 'page_idx_start', 'embedding']
First chunk's embedding (first 5 dimensions):
[-0.040027    0.10171638  0.01052209  0.02566179 -0.00840907]


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import numpy as np
import faiss
import pickle
import os

# --- Configuration (Based on your previous steps) ---
# Expected dimensionality (384 for 'all-MiniLM-L6-v2'). This is only a default:
# once embeddings are available, D is taken from the array itself so that
# switching the embedding model cannot desynchronise the index dimension.
D = 384  # Vector Dimensionality (Check your model's output shape)
# ---------------------------------------------------

print("--- 1. Preparing Vectors for Indexing ---")

# Ensure embeddings are a NumPy array of type float32 (FAISS requirement)
# 'embeddings' should be the NumPy array output from model.encode()
if 'embeddings' not in locals():
    print("❌ ERROR: 'embeddings' array not found. Please run Step 2 (Embedding Generation) first.")
    # Fall back to an empty array so the size check below skips the build.
    embeddings = np.array([])

if embeddings.size > 0:
    embeddings = embeddings.astype('float32')

    # Use the real vector width rather than the hard-coded default.
    # Assumes `embeddings` is 2-D (n_chunks, dim), as produced by encoding a list.
    D = embeddings.shape[1]

    # 2. Initialize the FAISS Index
    print("--- 2. Building FAISS Index (IndexFlatL2) ---")

    # IndexFlatL2 is a simple, brute-force index perfect for starting out.
    # It measures similarity using Euclidean distance (L2).
    index = faiss.IndexFlatL2(D)

    # 3. Add the vectors to the index
    index.add(embeddings)

    print("✅ FAISS Index built successfully.")
    print(f"Total vectors added to the index: {index.ntotal}")

    # 4. Save the Index and Metadata

    # Save the FAISS index itself
    faiss_index_file = 'handbook_faiss_index.bin'
    faiss.write_index(index, faiss_index_file)
    print(f"\nSaved FAISS index to: {faiss_index_file}")

    # Save the metadata (text chunks, page numbers, etc.) separately.
    # We use pickle to save the DataFrame (excluding the large embedding vector column)
    metadata_file = 'handbook_metadata.pkl'

    # We don't need to save the embedding column in the metadata file, as it's in FAISS.
    # Row order is preserved on purpose: retrieval maps a FAISS result id to
    # the metadata row at the same position.
    metadata_to_save = data_df.drop(columns=['embedding'])
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata_to_save, f)

    print(f"Saved metadata to: {metadata_file}")

else:
    print("Index build skipped because the 'embeddings' array is empty.")

--- 1. Preparing Vectors for Indexing ---
--- 2. Building FAISS Index (IndexFlatL2) ---
✅ FAISS Index built successfully.
Total vectors added to the index: 155

Saved FAISS index to: handbook_faiss_index.bin
Saved metadata to: handbook_metadata.pkl


In [None]:
# --- Example Retrieval Code ---

# 1. Load the model and index
retrieval_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
retrieval_index = faiss.read_index(faiss_index_file)
# NOTE: pickle.load executes arbitrary code embedded in the file. That is fine
# for the metadata file written by this notebook, but never load a pickle
# obtained from an untrusted source.
with open(metadata_file, 'rb') as f:
    retrieval_metadata = pickle.load(f)

# 2. Define the Query
user_query = "What are the finance courses available at tunis business school ?"
K = 3 # Number of top results to retrieve

# 3. Encode the query
query_vector = retrieval_model.encode([user_query]).astype('float32')

# 4. Search the index
# D = Distances (smaller means more similar), I = Indices (row numbers in your metadata)
# NOTE: this rebinds `D`, which the indexing cell uses for the vector
# dimensionality; that cell assigns D again before using it.
D, I = retrieval_index.search(query_vector, K)

print(f"\n--- 4. Retrieval Results for: '{user_query}' ---")
print(f"Top {K} results found:")

# 5. Display the results
for i, index_id in enumerate(I[0]):
    # FAISS pads the result with -1 when the index holds fewer than K vectors.
    # Skip those slots: `.iloc[-1]` would silently show the *last* chunk as a hit.
    if index_id < 0:
        continue

    # Use the index_id to look up the metadata (text chunk and page number)
    chunk = retrieval_metadata.iloc[index_id]

    print(f"\nResult {i+1} (Page: {chunk['page_idx_start']}):")
    print(f"Similarity Score (L2 Distance): {D[0][i]:.4f}")
    print(f"Content:\n{chunk['text_chunk'][:]}...") # Prints the full chunk text (the slice is unbounded)


--- 4. Retrieval Results for: 'What are the finance courses available at tunis business school ?' ---
Top 3 results found:

Result 1 (Page: 4):
Similarity Score (L2 Distance): 0.6493
Content:
ns to succeed in the evolving international arena.
Tunis Business School offers a Bachelor of Science in Business Administration (Higher Education Law about the recognition of the Bachelor of Science Degree Diploma $ { \mathbf { n } } ^ { \circ } 2 0 1 7 - 3 8 $ of Mai 2, 2017, and Government decree $\mathsf { n } ^ { \circ } 2 0 1 9  – 1 6 2$ of February 18, 2019). It is a 4-year study program. The curriculum covers the following specializations: accounting, business analytics, finance, information technology, International Business Economics, and marketing....

Result 2 (Page: 2):
Similarity Score (L2 Distance): 0.8003
Content:
TABLE OF CONTENTS
DISCLAIMER.
TABLE OF CONTENTS . 3
1. ABOUT TUNIS BUSINESS SCHOOL ....

Result 3 (Page: 28):
Similarity Score (L2 Distance): 0.8091
Content:
dents (All 