In [None]:
# Install open-source packages only
!pip install -q sentence-transformers chromadb pypdf langchain langchain-community faiss-cpu transformers torch numpy pandas

# Verify installations
import subprocess
import sys

# Run pip through the interpreter that executes this kernel. A bare `pip`
# is resolved via PATH and may report packages of a different Python
# installation than the one the notebook actually imports from.
result = subprocess.run(
    [sys.executable, '-m', 'pip', 'list'],
    capture_output=True,
    text=True,
)

if result.returncode != 0:
    # Surface the reason instead of silently printing an empty package list.
    print(f"pip list failed: {result.stderr.strip()}")

print("Installed packages:")
for line in result.stdout.splitlines():
    # Substring match on the package names this notebook depends on.
    if any(pkg in line for pkg in ['sentence-transformers', 'chromadb', 'langchain', 'faiss']):
        print(f"  {line.strip()}")

In [None]:
# Import all required libraries
import os
import json
import time
import numpy as np
from typing import List, Dict, Any
from datetime import datetime

# PDF processing
from pypdf import PdfReader

# Vector stores and embeddings (all open-source)
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Document handling
from langchain.schema import Document

# Progress tracking
from tqdm.notebook import tqdm

print("All libraries imported successfully!")

In [None]:
# Create sample research paper content (simulating a real PDF).
# The text is kept in memory as `sample_research_content`. The triple-quoted
# literal begins and ends with a newline, and both are part of the stored string.
sample_research_content = """
Title: Deep Learning for Natural Language Processing: A Comprehensive Survey

Abstract: This paper presents a comprehensive survey of deep learning techniques applied to natural language processing tasks. We examine the evolution from traditional statistical methods to modern neural architectures, focusing on transformer-based models and their applications.

1. Introduction
Natural Language Processing (NLP) has undergone a revolutionary transformation with the advent of deep learning. Traditional rule-based systems have given way to neural networks that learn patterns from vast amounts of text data.

2. Background: Traditional NLP Methods
Before deep learning, NLP relied heavily on:
- Bag-of-words models
- N-gram language models
- Hidden Markov Models (HMMs)
- Conditional Random Fields (CRFs)

3. Neural Network Foundations
Deep learning in NLP builds upon several key neural architectures:
- Recurrent Neural Networks (RNNs) for sequential data
- Long Short-Term Memory (LSTM) networks for long dependencies
- Convolutional Neural Networks (CNNs) for local patterns

4. Transformer Architecture
The transformer model, introduced in "Attention is All You Need," revolutionized NLP through:
- Self-attention mechanisms
- Parallel processing capabilities
- Scalability to large datasets

5. Large Language Models
Modern LLMs like BERT, GPT, and T5 demonstrate:
- Few-shot learning capabilities
- Transfer learning effectiveness
- Emergent behaviors at scale

6. Applications and Future Directions
Current applications include machine translation, question answering, and text generation. Future research focuses on efficiency, interpretability, and reducing computational requirements.
"""

# Save the sample as a plain-text file (a stand-in for a real PDF).
# The encoding is given explicitly so the file does not depend on the
# platform's locale default.
with open('sample_research_paper.txt', 'w', encoding='utf-8') as f:
    f.write(sample_research_content)

print("Sample research paper created!")
print("File: sample_research_paper.txt")
print(f"Size: {len(sample_research_content)} characters")

In [None]:
class PDFProcessor:
    """Loads PDF or plain-text files and splits their text into chunked documents.

    Args:
        chunk_size: Target maximum number of characters per chunk.
        chunk_overlap: Number of characters repeated between neighbouring
            chunks so context is not lost at chunk borders.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Paragraph -> line -> sentence boundaries first. The trailing
            # " " and "" are fallbacks so that text without any of the
            # earlier separators can still be cut down to chunk_size.
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
        )

    def load_pdf_content(self, file_path: str) -> str:
        """Read the full text of a PDF (via pypdf) or of a UTF-8 text file.

        Args:
            file_path: Path to the file. A ``.pdf`` suffix (any letter case)
                selects PDF extraction; anything else is read as text.

        Returns:
            The extracted text, or ``""`` if anything goes wrong. The error
            is printed rather than raised, so callers must check for an
            empty result.
        """
        try:
            if file_path.lower().endswith('.pdf'):
                reader = PdfReader(file_path)
                # `or ""` guards against pages for which no text could be
                # extracted; each page is terminated with a newline.
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
            else:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

            print(f"✅ Loaded {len(text)} characters from {file_path}")
            return text
        except Exception as e:
            # Deliberate best-effort: report and return an empty string.
            print(f"Error loading file: {e}")
            return ""

    def create_documents(self, text: str, source_name: str) -> List[Document]:
        """Split ``text`` into chunks and wrap each chunk in a ``Document``.

        Args:
            text: The raw text to split.
            source_name: Label stored in every chunk's ``source`` metadata.

        Returns:
            One ``Document`` per chunk, in order. Metadata carries
            ``source``, ``chunk_id`` (0-based position), ``chunk_size``
            (characters in the chunk) and ``timestamp`` (ISO-8601 string).
        """
        chunks = self.text_splitter.split_text(text)

        # One timestamp for the whole batch, so all chunks created by this
        # call can be recognised as belonging together.
        created_at = datetime.now().isoformat()

        documents = [
            Document(
                page_content=chunk,
                metadata={
                    "source": source_name,
                    "chunk_id": i,
                    "chunk_size": len(chunk),
                    "timestamp": created_at,
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        print(f"Created {len(documents)} document chunks")
        return documents

# Initialize processor
processor = PDFProcessor()

# Load the sample paper written by the earlier cell. A relative path keeps
# the notebook runnable on a fresh kernel on any machine.
text = processor.load_pdf_content("sample_research_paper.txt")

# Process our sample research paper. load_pdf_content returns "" on failure,
# in which case we fall back to the in-memory copy of the same content.
documents = processor.create_documents(text or sample_research_content, "research_paper_survey")

# Show first few chunks
print("\n📋 First 3 chunks:")
for i, doc in enumerate(documents[:3]):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Size: {doc.metadata['chunk_size']} characters")

In [None]:
class EmbeddingManager:
    """Handles text embeddings using open-source sentence-transformers models.

    Args:
        model_name: Name of the SentenceTransformer model to load. Defaults
            to the small, fast 'all-MiniLM-L6-v2'.

    Attributes:
        model_name: The model name that was loaded.
        model: The loaded SentenceTransformer instance.
        embedding_dim: Vector size as reported by the loaded model.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        # Ask the model for its dimension instead of hard-coding it, so the
        # value stays correct when a different model is chosen.
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        print(f"Loaded embedding model: {model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Convert texts to embedding vectors.

        Args:
            texts: The strings to embed. A single bare string is treated as
                a one-element list so the result always has one row per text.

        Returns:
            Whatever ``self.model.encode`` returns for the list of texts;
            ``len()`` of it is reported as the number of embeddings.
        """
        if isinstance(texts, str):
            # encode() on a bare string yields a single vector whose len()
            # is the dimension, which would be misreported as a count below.
            texts = [texts]

        print("Creating embeddings ...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Created {len(embeddings)} embeddings")
        return embeddings

# Initialize embedding manager
embed_manager = EmbeddingManager()

# Test with a simple example: embed a few sentences, then rank them against
# a query so the smoke test actually exercises semantic similarity.
test_texts = ["Machine learning is amazing", "Deep learning uses neural networks", "AI transforms the world"]
test_query = "neural networks"
embeddings = embed_manager.create_embeddings(test_texts)
print(f"Created {len(embeddings)} embeddings for test texts")

# Cosine similarity between the query vector and every sentence vector.
# Norms are divided out explicitly rather than assuming unit-length output.
test_query_embedding = embed_manager.create_embeddings([test_query])[0]
test_similarities = (embeddings @ test_query_embedding) / (
    np.linalg.norm(embeddings, axis=1) * np.linalg.norm(test_query_embedding)
)

print(f"Similarity to query '{test_query}':")
for test_sentence, test_score in sorted(
    zip(test_texts, test_similarities), key=lambda pair: pair[1], reverse=True
):
    print(f"  {test_score:.3f}  {test_sentence}")

In [None]:
class VectorStoreManager:
    """Manages an in-memory Chroma collection of embedded document chunks.

    Args:
        embedding_manager: Object whose ``create_embeddings(texts)`` returns
            one vector per text; used for both documents and queries.
        collection_name: Name of the Chroma collection to (re)create.
    """

    def __init__(self, embedding_manager: EmbeddingManager, collection_name: str = "research_papers"):
        self.embedding_manager = embedding_manager
        self.collection_name = collection_name

        # Create in-memory Chroma client
        self.client = chromadb.Client()

        # Drop any collection of the same name so construction always ends
        # with a freshly created, empty collection.
        try:
            self.client.delete_collection(name=collection_name)
            print(f"Deleted existing collection '{collection_name}'")
        except Exception:
            # Nothing to delete. NOTE(review): the concrete exception type
            # appears to differ between chromadb releases, hence the broad
            # catch. Unlike a bare `except:`, KeyboardInterrupt still propagates.
            pass

        # Create the (now guaranteed absent) collection
        self.collection = self.client.create_collection(
            name=collection_name,
            metadata={"description": "Academic paper chunks"}
        )

    def add_documents(self, documents: List[Document]):
        """Embed ``documents`` and append them to the collection.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
                An empty list is a no-op.
        """
        if not documents:
            print("No documents to add")
            return

        print("🔄 Adding documents to vector store...")
        # Prepare data
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        # Continue numbering after what is already stored. Restarting at
        # doc_0 on every call would collide with IDs from earlier calls.
        first_id = self.collection.count()
        ids = [f"doc_{first_id + offset}" for offset in range(len(documents))]
        # Create embeddings
        embeddings = self.embedding_manager.create_embeddings(texts)
        # Add to collection
        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=metadatas,
            ids=ids
        )
        print(f"Added {len(documents)} documents to vector store")

    def search(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Return the stored chunks most similar to ``query``.

        Args:
            query: Free-text search string.
            n_results: Maximum number of hits; capped at the number of
                stored chunks.

        Returns:
            A list of dicts with keys ``content``, ``metadata`` and
            ``distance``, in the order returned by the collection. Empty if
            the collection is empty or ``n_results`` is not positive.
        """
        # Never ask the index for more rows than it holds.
        n_results = min(n_results, self.collection.count())
        if n_results <= 0:
            return []

        # Create query embedding
        query_embedding = self.embedding_manager.create_embeddings([query])[0]
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results
        )
        # Results are nested per query; we sent exactly one query, so [0].
        return [
            {'content': content, 'metadata': metadata, 'distance': distance}
            for content, metadata, distance in zip(
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0],
            )
        ]

# Initialize vector store
vector_store = VectorStoreManager(embed_manager)
vector_store.add_documents(documents)

# Test search
search_results = vector_store.search("What is deep learning?", n_results=2)
print("\nSearch Results")

# enumerate(..., 1) already yields 1-based ranks, so `i` is printed as-is.
# Adding 1 again would label the first hit "Result 2".
for i, result in enumerate(search_results, 1):
    print(f"\n--- Result {i} ---")
    print(f"Content: {result['content'][:200]}")
    print(f"Distance: {result['distance']:.3f}")

In [None]:
from transformers import pipeline

class OpenSourceLLM:
    """Open-source language model for context-grounded Q&A.

    Args:
        model_name: Model identifier passed to the ``text2text-generation``
            pipeline as both model and tokenizer. Defaults to Flan-T5-small,
            chosen for educational purposes.
    """

    def __init__(self, model_name: str = "google/flan-t5-small"):
        self.model_name = model_name
        print("Loading open-source language model...")
        self.qa_pipeline = pipeline(
            "text2text-generation",
            model=model_name,
            tokenizer=model_name,
        )
        print(f"Loaded model: {model_name}")

    def generate_answer(
        self,
        question: str,
        context: str,
        max_length: int = 200,
        do_sample: bool = True,
    ) -> str:
        """Generate an answer to ``question`` using ``context``.

        Args:
            question: The user's question.
            context: Text the prompt instructs the model to base its answer on.
            max_length: Forwarded to the pipeline as ``max_length``.
            do_sample: Forwarded to the pipeline as ``do_sample``. The
                default keeps sampling on; pass ``False`` when answers must
                be repeatable across runs.

        Returns:
            The ``generated_text`` of the first pipeline result.
        """
        # Create prompt (continuation lines are flush-left on purpose:
        # any indentation here would become part of the prompt text).
        prompt = f"""Answer the question based on the context provided.
Context: {context}
Question: {question}
Answer:"""

        # Generate response
        response = self.qa_pipeline(prompt, max_length=max_length, do_sample=do_sample)
        return response[0]['generated_text']

# Load the question-answering model once for the rest of the notebook
llm = OpenSourceLLM()

# Smoke test: one question answered from a single hand-written context sentence
test_question, test_context = (
    "What is deep learning?",
    "Deep learning is a subset of machine learning that uses neural networks with multiple layers.",
)
test_answer = llm.generate_answer(question=test_question, context=test_context)
print("\n🔍 Test Answer: {}".format(test_answer))

In [None]:
class ResearchAssistant:
    """Complete RAG system for research papers: retrieve chunks, then answer.

    Args:
        vector_store: Object whose ``search(question, n_results=...)``
            returns dicts with ``content`` and ``metadata`` keys.
        llm: Object whose ``generate_answer(question, context)`` returns
            the answer text.

    Attributes:
        conversation_history: Every result dict returned by
            ``ask_question``, in call order.
    """

    # Answer returned when retrieval yields nothing, so the model is never
    # asked to answer "from context" with an empty context.
    NO_CONTEXT_ANSWER = "No relevant context was found for this question."

    def __init__(self, vector_store: VectorStoreManager, llm: OpenSourceLLM):
        self.vector_store = vector_store
        self.llm = llm
        self.conversation_history = []

    def ask_question(self, question: str, n_contexts: int = 3) -> Dict[str, Any]:
        """Answer ``question`` from the most relevant stored chunks.

        Args:
            question: The question to answer.
            n_contexts: Number of chunks to request from the vector store.

        Returns:
            A dict with keys ``question``, ``answer``, ``contexts_used``
            (number of chunks retrieved), ``processing_time`` (seconds,
            rounded to 2 decimals) and ``sources`` (metadata of each chunk).
            The same dict is appended to ``conversation_history``.
        """
        print(f"Processing: {question}")

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Step 1: Find relevant contexts
        relevant_docs = self.vector_store.search(question, n_results=n_contexts)

        if relevant_docs:
            # Step 2: Combine contexts, blank line between chunks
            combined_context = "\n\n".join(doc['content'] for doc in relevant_docs)

            # Step 3: Generate answer
            answer = self.llm.generate_answer(question, combined_context)
        else:
            answer = self.NO_CONTEXT_ANSWER

        processing_time = time.perf_counter() - start_time

        # Store conversation
        result = {
            "question": question,
            "answer": answer,
            "contexts_used": len(relevant_docs),
            "processing_time": round(processing_time, 2),
            "sources": [doc['metadata'] for doc in relevant_docs]
        }

        self.conversation_history.append(result)

        return result

# Wire the retriever and the generator together into one RAG assistant
assistant = ResearchAssistant(vector_store=vector_store, llm=llm)

print(
    "Research Assistant is ready!",
    "You can now ask questions about the research paper.",
    sep="\n",
)

In [None]:
# Questions a learner might ask about the sample paper
educational_questions = [
    "What is the difference between traditional NLP and deep learning NLP?",
    "Can you explain what a transformer is in simple terms?",
    "What are the main applications of deep learning in NLP?",
    "How do neural networks help with language understanding?",
    "What comes before deep learning in NLP history?",
]

# Banner: title line followed by a 50-character rule
print("🎓 Educational Questions & Answers:", "=" * 50, sep="\n")

for question in educational_questions:
    print(f"\nQuestion: {question}")
    result = assistant.ask_question(question)
    # Answer, timing and number of retrieved chunks, one per line
    print(
        f"Answer: {result['answer']}",
        f"Processing time: {result['processing_time']}s",
        f"Sources used: {result['contexts_used']} chunks",
        sep="\n",
    )

In [None]:
# Recap of what the notebook covered, one item per output line
print(
    "🎉 Congratulations! You've built a complete GenAI system!",
    "\nWhat you learned:",
    "How to process PDF documents into searchable chunks",
    "Using open-source embedding models (no API keys!)",
    "Building in-memory vector databases with ChromaDB",
    "Creating Q&A systems with open-source language models",
    "Adding educational features for better learning",
    sep="\n",
)

# Suggested follow-ups
print(
    "\n Next steps to explore:",
    "1. Try with your own PDF research papers",
    "2. Experiment with different embedding models",
    "3. Add conversation memory for follow-up questions",
    "4. Create a web interface using Streamlit",
    "5. Try larger open-source models like Llama-2",
    sep="\n",
)

# Persist every question/answer record of this session as indented JSON
with open('learning_session.json', 'w') as f:
    json.dump(assistant.conversation_history, f, indent=2)

print("\nConversation history saved to 'learning_session.json'")