In [1]:
import os
import shutil
import time
import numpy as np
import sys
from IPython.display import display, Markdown, HTML

In [2]:
# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..','src'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)

# Developed components
from document_processor import DocumentProcessor
from vector_db import ChromaVectorStore
from retrieval import ChromaRetrievalSystem
from question_generator import QuestionGenerator

In [3]:
# Create a clean test directory for ChromaDB
test_dir = "test_integration_data"
if os.path.exists(test_dir):
    shutil.rmtree(test_dir)
os.makedirs(test_dir, exist_ok=True)

### Initialize components
---

In [4]:
# Initialize the document processor
document_processor = DocumentProcessor(
    embedding_model="all-MiniLM-L6-v2",
    chunk_size=300,  # Adjust chunk size as needed
    chunk_overlap=50
)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

In [5]:
# Initialize ChromaDB vector store
vector_store = ChromaVectorStore(
    collection_name="test_collection",
    persist_directory=test_dir
)

Collection error: Collection test_collection does not exist.. Creating new collection.
Created new collection 'test_collection'


In [6]:
# Initialize retrieval system
retrieval_system = ChromaRetrievalSystem(
    vector_store=vector_store
)

In [7]:
question_generator = QuestionGenerator(
    retrieval_system=retrieval_system,
    use_local_llm=True,
    use_ollama=False
)

### Process Documents
---

In [8]:
def process_document(file_path):
    """Process a document and display results."""
    display(Markdown(f"### Processing: {os.path.basename(file_path)}"))
    
    start_time = time.time()
    chunks = document_processor.process_document(file_path)
    processing_time = time.time() - start_time
    
    display(Markdown(f"**Extracted {len(chunks)} chunks in {processing_time:.2f} seconds**"))
    
    # Display sample chunk info
    if chunks:
        display(Markdown("#### Sample chunk:"))
        display(Markdown(f"**Content (excerpt):**\n\n{chunks[0]['content'][:200]}..."))
        display(Markdown(f"**Embedding shape:** {chunks[0]['embedding'].shape}"))
        
        # Display metadata
        display(Markdown("**Metadata:**"))
        for key, value in chunks[0]['metadata'].items():
            display(Markdown(f"- {key}: {value}"))
    
    return chunks

In [9]:
document_paths = [
    "test_files/test.pdf",
    "test_files/test.pptx"
]

all_chunks = []

for doc_path in document_paths:
    if os.path.exists(doc_path):
        chunks = process_document(doc_path)
        if chunks:
            all_chunks.extend(chunks)
    else:
        display(Markdown(f"**Warning:** File not found: {doc_path}"))

display(Markdown(f"### Total chunks: {len(all_chunks)}"))

### Processing: test.pdf

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


**Extracted 1122 chunks in 19.88 seconds**

#### Sample chunk:

**Content (excerpt):**

Creativity & Innovation
AHL3300 – AHL5300
By Cristian Zaelzer Ph.D.
7 to 9:50 PM – 200 Wilbrod 104
...

**Embedding shape:** (384,)

**Metadata:**

- source: test.pdf

- chunk_id: 0

- topics: ['innovation diversity', 'inclusion dei', 'zaelzer ph', 'person assigned', 'root causes']

- page_number: 1

### Processing: test.pptx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

**Extracted 1419 chunks in 15.46 seconds**

#### Sample chunk:

**Content (excerpt):**

CSI 5180: Virtual Assistants
Presentation 1: Speech Recognition

Group 13: Ovais Azeem (300112311) | Mico Ellerich Comia (300218564)
...

**Embedding shape:** (384,)

**Metadata:**

- source: test.pptx

- chunk_id: 0

- topics: ['speech ability', 'csi 5180', 'disease association', 'terminology term', 'filterbank https']

- page_number: 1

### Total chunks: 2541

### Store in Vector DB
---

In [10]:
if all_chunks:
    display(Markdown(f"### Adding {len(all_chunks)} document chunks to ChromaDB"))
    
    start_time = time.time()
    vector_store.add_documents(all_chunks)
    add_time = time.time() - start_time
    
    collection_size = vector_store.get_collection_size()
    display(Markdown(f"**Added all chunks in {add_time:.2f} seconds**"))
    display(Markdown(f"**ChromaDB collection size:** {collection_size} documents"))
    
    # Get available topics
    topics = vector_store.get_topics()
    display(Markdown(f"**Available topics:** {', '.join(topics)}"))
else:
    display(Markdown("**No chunks extracted. Check your document paths.**"))

### Adding 2541 document chunks to ChromaDB

Added 2541 documents to ChromaDB collection


**Added all chunks in 0.70 seconds**

**ChromaDB collection size:** 2541 documents

**Available topics:** csi 5180, disease association, filterbank https, inclusion dei, innovation diversity, person assigned, root causes, speech ability, terminology term, zaelzer ph

In [11]:
all_chunks

[{'content': 'Creativity & Innovation\nAHL3300 – AHL5300\nBy Cristian Zaelzer Ph.D.\n7 to 9:50 PM – 200 Wilbrod 104\n',
  'embedding': array([-9.53956693e-02,  3.99918072e-02, -6.58368766e-02, -6.64338097e-02,
         -7.17964098e-02, -6.42651343e-04,  5.26451990e-02, -1.72004327e-02,
          4.54280190e-02,  1.62602775e-02, -1.53229311e-02, -6.78617926e-03,
          7.37974867e-02, -6.22203872e-02, -4.45873439e-02,  1.72301307e-02,
         -9.61308274e-03, -4.72911075e-02, -1.08293463e-02, -9.65554565e-02,
         -1.76905468e-02, -2.64533553e-02,  7.59862736e-02,  1.01086881e-03,
          1.20010301e-02,  9.34622288e-02,  3.43770832e-02,  3.89150791e-02,
          1.01546897e-02, -6.81270808e-02,  2.71897130e-02,  9.39174667e-02,
          7.75138140e-02, -1.03847742e-01,  8.52706730e-02,  6.03158697e-02,
         -3.68625335e-02,  1.48357172e-02,  3.97826731e-02,  3.97400260e-02,
          9.09775961e-03, -2.60952301e-02, -2.90281642e-02,  3.05921007e-02,
         -2.45232973

### Test Search in Vector DB
---

In [12]:
test_queries = [
    "What is SCAMPER?",
    "What should leaders possess?",
    "What is LogMel?",
    "What is the project scope?"
]

# Function to display search results nicely
def display_search_results(query, results):
    display(Markdown(f"### Query: '{query}'"))
    
    if not results:
        display(Markdown("*No results found*"))
        return
        
    for i, result in enumerate(results):
        display(Markdown(f"#### Result {i+1} (Score: {result.get('similarity', 0):.4f})"))
        display(Markdown(f"**Source:** {result['metadata'].get('source', 'Unknown')}, " +
                          f"Page/Slide: {result['metadata'].get('page_number', 'Unknown')}"))
        display(Markdown(f"**Content:**\n\n{result['content'][:300]}..."))
        display(Markdown("---"))

In [13]:
# Run searches
for query in test_queries:
    start_time = time.time()
    results = retrieval_system.retrieve(query, top_k=3)
    search_time = time.time() - start_time
    
    display(Markdown(f"## Search Results ({search_time:.2f} seconds)"))
    display_search_results(query, results)

## Search Results (0.04 seconds)

### Query: 'What is SCAMPER?'

#### Result 1 (Score: 0.5056)

**Source:** test.pdf, Page/Slide: 11

**Content:**

AMPER model, originally a game aimed at fostering imagination in adolescents, forces your team 
to view a problem through seven filters: substitute, combine, adapt, modify, put to another use, eliminate, 
and reverse. The SCAMPER method is ideal when you start from an existing product to change or ...

---

#### Result 2 (Score: 0.4368)

**Source:** test.pdf, Page/Slide: 11

**Content:**

 curb the problem before it can reoccur by asking the question “Why?” over and over 
until it can no longer be answered. Once you reach this stage, you have arrived at the root cause of the 
issue. Check Miro's 5 Whys templates.
SCAMPER model
The SCAMPER model, originally a game aimed at fostering ...

---

#### Result 3 (Score: 0.4350)

**Source:** test.pdf, Page/Slide: 14

**Content:**

 will need to research current solutions to their problem.
4.
Open the SCAMPER model template in Miro to work with your 
teammates.
5.
Use the following 30 minutes using these methods to brainstorm.
Activity
Structured ideation 
processes prevent chaos.
...

---

## Search Results (0.01 seconds)

### Query: 'What should leaders possess?'

#### Result 1 (Score: 0.5548)

**Source:** test.pdf, Page/Slide: 7

**Content:**

Leaders should model 
curiosity and openness.
A good leader effectively 
communicates their vision, 
actively listens to their team, 
fosters strong relationships, 
empowers individuals, takes 
accountability for outcomes, 
demonstrates integrity, and 
inspires others to achieve goals 
by leading ...

---

#### Result 2 (Score: 0.5418)

**Source:** test.pdf, Page/Slide: 8

**Content:**


A Good 
Leader’s 
Actions
•Accountability:
Takes responsibility for both 
successes and failures.
•Visionary Leadership:
Sets a clear direction and 
inspires the team to achieve 
long-term goals.
•Adaptability:
Responds effectively to 
changing circumstances and 
challenges.
•Positive ...

---

#### Result 3 (Score: 0.5359)

**Source:** test.pdf, Page/Slide: 9

**Content:**

1.
Professor will allocate individuals in each group as leaders based 
on their participation in previous classes.
2.
. Each person assigned as a leader will evaluate 
herself/himself/themselves using the previous guidance on 
desirable leadership features and follow those guidelines as 
behaviour ...

---

## Search Results (0.01 seconds)

### Query: 'What is LogMel?'

#### Result 1 (Score: 0.4253)

**Source:** test.pptx, Page/Slide: 2

**Content:**

logy
First Term
Second Term
...

---

#### Result 2 (Score: 0.4253)

**Source:** test.pptx, Page/Slide: 5

**Content:**

logy
First Term
Second Term
...

---

#### Result 3 (Score: 0.4253)

**Source:** test.pptx, Page/Slide: 9

**Content:**

logy
First Term
Second Term
...

---

## Search Results (0.01 seconds)

### Query: 'What is the project scope?'

#### Result 1 (Score: 0.5101)

**Source:** test.pdf, Page/Slide: 16

**Content:**

S.M.A.R.T. Goals
Example of a SMART project goal:
When setting project goals, consider:
Communication:
Clearly communicate goals to all team 
members and stakeholders to ensure 
alignment and understanding.
Flexibility:
Be prepared to adjust goals based on changing 
circumstances or new ...

---

#### Result 2 (Score: 0.4793)

**Source:** test.pdf, Page/Slide: 21

**Content:**

Deliver a prototype for:
Project 1. A device that captures colour from real life and makes it 
available for digital applications.
Project 2. An anti-thief system to avoid the robbery of cars in Ontario.
Project 3. An easy-to-use app to help you replace imported products 
with Canadian-made ...

---

#### Result 3 (Score: 0.4739)

**Source:** test.pdf, Page/Slide: 17

**Content:**

1.
Set your S.M.A.R.T. Goals for the project.
2.
You can use a template here in, S.M.A.R.T. Goals Template. 
Activity
S.M.A.R.T. Goals
...

---

### Get Topics
---

In [14]:
topics = vector_store.get_topics()
topics

['csi 5180',
 'disease association',
 'filterbank https',
 'inclusion dei',
 'innovation diversity',
 'person assigned',
 'root causes',
 'speech ability',
 'terminology term',
 'zaelzer ph']

In [15]:
if topics and len(topics) > 0:
    display(Markdown("## Topic Filtering Test"))
    test_topic = topics[0]
    display(Markdown(f"**Filtering by topic:** {test_topic}"))
    
    query = "verification"  # Generic query
    filtered_results = retrieval_system.retrieve(
        query,
        top_k=3,
        filter_topics=[test_topic]
    )
    
    display_search_results(query, filtered_results)
    
    # Verify topic filtering worked - check if topic is in the comma-separated string
    if filtered_results:
        all_correct = True
        for result in filtered_results:
            # Get topics as a list from the comma-separated string
            result_topics = [t.strip() for t in result['metadata'].get('topics', '').split(',')]
            if test_topic not in result_topics:
                all_correct = False
                break
                
        if all_correct:
            display(Markdown("✅ **Topic filtering verified:** All results contain the selected topic"))
        else:
            display(Markdown("❌ **Topic filtering issue:** Some results don't contain the selected topic"))
    else:
        display(Markdown("❌ **No results found:** The topic filtering may be too restrictive"))
else:
    display(Markdown("*No topics available for filtering test*"))

## Topic Filtering Test

**Filtering by topic:** csi 5180

### Query: 'verification'

#### Result 1 (Score: 0.4479)

**Source:** test.pptx, Page/Slide: 18

**Content:**

ta
...

---

#### Result 2 (Score: 0.4389)

**Source:** test.pptx, Page/Slide: 27

**Content:**

 a number of signs ]
...

---

#### Result 3 (Score: 0.4389)

**Source:** test.pptx, Page/Slide: 27

**Content:**

a number of signs ]
...

---

✅ **Topic filtering verified:** All results contain the selected topic

### Question Generation
---

In [21]:
# Test the question generation
question_data = question_generator.generate_question(topics=topics, 
                                                     question_type="multiple-choice", 
                                                     difficulty="medium")

# Display the generated question
print("Generated Question:", question_data.get("question"))
print("Options:", question_data.get("options"))
print("Answer:", question_data.get("answer"))
print("Explanation:", question_data.get("explanation"))

Device set to use mps
Question validation failed on attempt 1: Multiple-choice answer must be one of A, B, C, D
Device set to use mps
Question validation failed on attempt 2: Multiple-choice answer must be one of A, B, C, D
Device set to use mps
Question validation failed on attempt 3: Multiple-choice answer must be one of A, B, C, D


Generated Question: What is the difference between a filterbank and a bandpass filter?
Options: [{'text': 'Both filters have a cutoff frequency.', 'answer': 'C'}, {'text': 'The filterbank has more bands than a bandpass filter.', 'answer': 'D'}, {'text': 'Filterbanks can be used to model complex systems such as human speech.', 'answer': 'A'}, {'text': 'Both filters have a fixed number of bands.', 'answer': 'B'}]
Answer: None
Explanation: In both filterbanks and bandpass filters, the signal is filtered by a set of passband and stopband frequencies. However, in a filterbank, there are additional bands called “filters” that allow for more control over the filtering process. A filterbank has more bands than a bandpass filter because it allows for more complex filtering functions.


In [20]:
# Test the question generation
question_data = question_generator.generate_question(topics, 
                                                     question_type="free-text", 
                                                     difficulty="medium")

# Display the generated question
print("Generated Question:", question_data.get("question"))
print("Options:", question_data.get("options"))
print("Answer:", question_data.get("answer"))
print("Explanation:", question_data.get("explanation"))

Device set to use mps
Question validation failed on attempt 1: Free-text question needs a substantial model answer
Device set to use mps


Generated Question: What are some common symptoms of MND and how do they differ from those of ALS?
Options: None
Answer: MND is characterized by progressive weakness and wasting of muscles, while ALS is associated with progressive paralysis and difficulty speaking.
Explanation: None
