Step 1: Installing Required Libraries

In [3]:
!pip install faiss-cpu requests pypdf2 sentence-transformers nltk numpy -q
!pip install --upgrade nltk -q

Step 2: Importing necessary libraries

In [4]:
import os

import faiss
import nltk
import numpy as np
from google.colab import files
from nltk.tokenize import sent_tokenize
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

Step 3: Downloading and configuring NLTK resources

In [5]:
# Fetch the NLTK resources this notebook relies on: the Punkt sentence
# tokenizer data ('punkt' / 'punkt_tab') and the stop-word corpus.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Register the download directory with NLTK's search path. The membership
# check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry each time.
if "/root/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/root/nltk_data")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Step 4: Initialize embedding model

In [6]:
# Shared sentence-embedding model. It is loaded once here and then read as a
# global by create_faiss_index (vector dimension), retrieve_context (query
# embedding) and main (document embeddings), so documents and queries are
# always embedded with the same model.
EMBEDDING_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Step 5: Defining PDF Processing Functions and Chunk Parameters

In [7]:
# ================= PDF loading and chunking =================
def load_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Pages for which the reader yields no text are skipped; the remaining
    page texts are joined with newlines and surrounding whitespace is
    removed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or "" if anything goes wrong while reading
        (the error is printed instead of raised).
    """
    try:
        page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
        # Each non-empty page contributes its text followed by a newline.
        combined = "".join(page_text + "\n" for page_text in page_texts if page_text)
    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Error reading {os.path.basename(pdf_path)}: {str(e)}")
        return ""
    return combined.strip()

def process_pdfs(uploaded_files, chunk_size=1000, overlap=200, min_chunk_chars=100):
    """Split uploaded PDFs into overlapping text chunks with source labels.

    Each file's text is cut into windows of ``chunk_size`` characters; every
    window starts ``chunk_size - overlap`` characters after the previous one.
    Windows that, once stripped, are not longer than ``min_chunk_chars`` are
    dropped. A one-line summary per processed file is printed at the end.

    Args:
        uploaded_files: Mapping whose keys are the PDF file names. Each key
            is handed to ``load_pdf``; the mapped values are not used.
        chunk_size: Window length in characters.
        overlap: Number of characters shared by consecutive windows. Must be
            smaller than ``chunk_size``.
        min_chunk_chars: A stripped chunk must be strictly longer than this
            to be kept.

    Returns:
        ``(documents, sources)``: two parallel lists, the chunk texts and a
        ``"<filename>:chunk<n>"`` label for each (``n`` counts kept chunks
        per file, starting at 1).

    Raises:
        ValueError: If ``overlap >= chunk_size``. The window would never
            advance, which previously resulted in an endless loop.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    documents = []
    sources = []
    file_details = []

    # Only the file name is needed: load_pdf reads the file by that name.
    for filename, _content in uploaded_files.items():
        text = load_pdf(filename)
        if not text:
            print(f"Skipped {filename} - no text extracted")
            continue

        # Slide a fixed-size window over the text, advancing by `step`.
        chunk_count = 0
        for start in range(0, len(text), step):
            chunk = text[start:start + chunk_size].strip()
            if len(chunk) > min_chunk_chars:  # Skip small chunks
                chunk_count += 1
                documents.append(chunk)
                sources.append(f"{filename}:chunk{chunk_count}")

        file_details.append(f"📄 {filename} ({chunk_count} chunks)")

    print("\n".join(file_details))
    return documents, sources

Step 6: Defining NLP functions to process queries

In [8]:
def format_response(query, context, max_sentences=5):
    """Build a readable answer from retrieved context.

    The context is whitespace-normalised and split into sentences with
    ``sent_tokenize``. Each sentence is scored by how many distinct query
    keywords occur in it (case-insensitive substring match); the
    ``max_sentences`` best-scoring sentences are returned as a bulleted list.
    If no sentence matches any keyword, the first 1000 characters of the
    context are returned instead.

    Args:
        query: The user's question. It is echoed verbatim in the answer.
        context: Text to draw the answer from.
        max_sentences: Maximum number of bullet points to include.

    Returns:
        The formatted answer string.
    """
    # Punctuation stripped from the ends of query tokens, so that a question
    # such as "grading policy?" still matches the word "policy".
    edge_punctuation = "?!.,;:\"'()[]{}"

    # Clean and structure text
    clean_context = " ".join(context.split())
    sentences = sent_tokenize(clean_context)

    # Distinct, lower-cased query keywords; tokens that were pure punctuation
    # become "" and must be dropped because "" is a substring of everything.
    keywords = {token.strip(edge_punctuation) for token in query.lower().split()}
    keywords.discard("")

    # Score every sentence by the number of keywords it contains
    scored_sentences = []
    for sent in sentences:
        lowered = sent.lower()
        score = sum(1 for word in keywords if word in lowered)
        if score > 0:
            scored_sentences.append((score, sent))

    # Highest score first; the sort is stable, so ties keep document order
    scored_sentences.sort(key=lambda pair: pair[0], reverse=True)
    top_sentences = [sent for _, sent in scored_sentences[:max_sentences]]

    # Build natural response
    response = f"Based on the syllabus documents, here's what I found about '{query}':\n\n"

    if top_sentences:
        response += "• " + "\n• ".join(top_sentences)
    else:
        response += "While not explicitly mentioned, here's relevant information:\n"
        response += clean_context[:1000] + ("..." if len(clean_context) > 1000 else "")

    return response

Step 7: Defining FAISS operations for similarity search

In [9]:
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding the given embeddings.

    The index dimension is taken from the global ``EMBEDDING_MODEL``; the
    embeddings are converted to a float32 NumPy array before being added.

    Args:
        embeddings: Embedding vectors to index.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    flat_index = faiss.IndexFlatL2(EMBEDDING_MODEL.get_sentence_embedding_dimension())
    vectors = np.array(embeddings).astype('float32')
    flat_index.add(vectors)
    return flat_index

def retrieve_context(query, index, documents, sources, top_k=5):
    """Return the document chunks closest to the query.

    The query is embedded with the global ``EMBEDDING_MODEL`` and looked up
    in ``index``; hits are mapped back to ``documents`` / ``sources`` by
    position.

    Args:
        query: The user's question.
        index: Search index whose ``search(vectors, k)`` returns
            ``(distances, indices)``.
        documents: Chunk texts, in the order they were added to the index.
        sources: Source labels parallel to ``documents``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``{"content": ..., "source": ...}`` dicts in the order the
        index returned them. Empty if there is nothing to search.
    """
    # Never request more neighbours than there are documents.
    k = min(top_k, len(documents))
    if k <= 0:
        return []

    query_embedding = EMBEDDING_MODEL.encode([query])
    _distances, indices = index.search(query_embedding.astype('float32'), k)

    results = []
    for idx in indices[0]:
        idx = int(idx)
        # FAISS fills missing neighbours with -1. Without the lower bound,
        # -1 would silently select documents[-1] as a bogus hit.
        if 0 <= idx < len(documents):
            results.append({
                "content": documents[idx],
                "source": sources[idx]
            })
    return results

Step 8: Main Function & Querying

In [None]:
def main():
    """Run the interactive Syllabus Assistant.

    Workflow:
        1. Ask the user to upload PDF files (Colab upload widget).
        2. Chunk the PDFs with ``process_pdfs``.
        3. Embed the chunks with ``EMBEDDING_MODEL`` and index them with
           ``create_faiss_index``.
        4. Loop: read a question, retrieve the closest chunks with
           ``retrieve_context``, and print the answer produced by
           ``format_response`` together with its sources.

    The loop ends when the user types 'exit' or 'quit', or when input is
    closed or interrupted. Returns None.
    """
    banner = "=" * 60

    print(banner)
    print("📚 Syllabus Assistant: Ask questions about your course documents!")
    print(banner)

    # File upload
    print("\n👉 Please upload your syllabus PDF files:")
    uploaded = files.upload()
    if not uploaded:
        print("No files uploaded. Exiting.")
        return

    # Process documents
    documents, sources = process_pdfs(uploaded)
    if not documents:
        print("No valid documents processed. Exiting.")
        return

    # Generate embeddings
    print("\n🔮 Generating document embeddings...")
    embeddings = EMBEDDING_MODEL.encode(documents, show_progress_bar=True)

    # Create FAISS index
    print("\n📂 Creating search index...")
    index = create_faiss_index(embeddings)

    # Q&A Interface
    print("\n" + banner)
    print("💬 Ask questions about your curriculum (type 'exit' to quit)")
    print(banner)

    while True:
        try:
            query = input("\nQuestion: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupted cell is treated like 'exit'
            # instead of surfacing a traceback.
            break

        if query.lower() in ('exit', 'quit'):
            break
        if not query:
            # Nothing to search for; prompt again rather than embedding "".
            continue

        # Retrieve relevant context
        results = retrieve_context(query, index, documents, sources)
        if not results:
            print("No relevant information found.")
            continue

        # Format natural response
        combined_context = "\n".join(
            f"[From {res['source']}]\n{res['content']}" for res in results
        )
        answer = format_response(query, combined_context)

        # De-duplicate sources while keeping retrieval order (a set would
        # print them in arbitrary order).
        unique_sources = dict.fromkeys(res['source'] for res in results)

        # Display results
        print("\n" + banner)
        print(answer)
        print("\n📚 Sources:")
        print("\n".join(unique_sources))
        print(banner)

    print("\nThank you for using the Syllabus Assistant! 🎓")

# Run the application
# Entry point: start the interactive assistant when this code is executed as
# the main module (which is the case when the cell is run in the notebook, as
# the recorded output below shows). Importing the code from elsewhere defines
# the functions without launching the upload prompt.
if __name__ == "__main__":
    main()

ðŸ“š Syllabus Assistant: Ask questions about your course documents!

ðŸ‘‰ Please upload your syllabus PDF files:


Saving CHY1005_INTRODUCTION-TO-COMPUTATIONAL-CHEMISTRY_LTP_1.0_1_CHY1005-Introduction to Computational Chemistry-LTP.pdf to CHY1005_INTRODUCTION-TO-COMPUTATIONAL-CHEMISTRY_LTP_1.0_1_CHY1005-Introduction to Computational Chemistry-LTP (2).pdf
Saving HUM1004_IT-AND-SOCIETY_LT_1.0_1_Information Technology and Society.pdf to HUM1004_IT-AND-SOCIETY_LT_1.0_1_Information Technology and Society (2).pdf
Saving HUM1012_LOGIC-AND-LANGUAGE-STRUCTURE_LT_1.0_1_HUM1012_Logic and Language Structure-LT.pdf to HUM1012_LOGIC-AND-LANGUAGE-STRUCTURE_LT_1.0_1_HUM1012_Logic and Language Structure-LT (2).pdf
ðŸ“„ CHY1005_INTRODUCTION-TO-COMPUTATIONAL-CHEMISTRY_LTP_1.0_1_CHY1005-Introduction to Computational Chemistry-LTP (2).pdf (5 chunks)
ðŸ“„ HUM1004_IT-AND-SOCIETY_LT_1.0_1_Information Technology and Society (2).pdf (4 chunks)
ðŸ“„ HUM1012_LOGIC-AND-LANGUAGE-STRUCTURE_LT_1.0_1_HUM1012_Logic and Language Structure-LT (2).pdf (5 chunks)

ðŸ”® Generating document embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


ðŸ“‚ Creating search index...

ðŸ’¬ Ask questions about your syllabi (type 'exit' to quit)

Question: Course name of HUM1012

Based on the syllabus documents, here's what I found about 'Course name of HUM1012':

â€¢ [From HUM1012_LOGIC-AND-LANGUAGE-STRUCTURE_LT_1.0_1_HUM1012_Logic and Language Structure-LT (2).pdf:chunk1] Course Code HUM 1012 Logic and Language Structure Course Type LT Credits 3 Course Objectives: ï‚· Equip the students with symbolic language , which would further help them in coding sentential information.
â€¢ ï‚· Understanding and knowledge acquired form the course will enable students to take up further courses based on natural language processing in term of machine.
â€¢ Module Description Hours SO 1 Introduction to N atural Language Processing : Human language, models, ambiguity, processing paradigms, p [From HUM1004_IT-AND-SOCIETY_LT_1.0_1_Information Technology and Society (2).pdf:chunk1] Course Code: HUM1004 Information Technology and Society LT 3 Objectives To