In [None]:
# =============================================================================
# CELL 1: SETUP AND INSTALLATIONS
# =============================================================================

# Import libraries
import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
import pandas as pd
import json
from datasets import Dataset
import gradio as gr
import os
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import PyPDF2
import docx
from io import StringIO
import zipfile
import requests
from google.colab import files, drive
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


# Fetch the NLTK resources requested by this notebook (downloaded one by one,
# in the same order as before).
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Choose the compute device once; the classes defined in later cells read this
# module-level `device` value.
_cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_available else torch.device('cpu')
print(f"Using device: {device}")
if _cuda_available:
    # Report which GPU was found and how much memory it has, in GiB.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU Memory: {_gpu_properties.total_memory / 1024**3:.1f} GB")

In [None]:
# =============================================================================
# CELL 2: COLAB-OPTIMIZED DATA PROCESSOR
# =============================================================================

class ColabDataProcessor:
    """Turn uploaded documents (or built-in sample text) into chatbot training text.

    Typical flow: obtain raw text with ``upload_and_process_files`` or
    ``use_sample_data``, split it into sentences with ``clean_and_preprocess``,
    then build "Human: ... / Assistant: ..." strings with
    ``create_training_data``.
    """

    # Substrings that mark a sentence as being about a data-structures topic.
    TOPIC_KEYWORDS = ('array', 'list', 'stack', 'queue', 'tree', 'graph', 'algorithm', 'complexity')

    # (question prefix, phrase that must occur in the sentence) pairs used to
    # synthesise direct question/answer examples. Order matters: the first
    # matching phrase wins.
    QA_PATTERNS = (
        ("What is", "is a"),
        ("How does", "works by"),
        ("Explain", "can be explained as"),
        ("Define", "is defined as"),
    )

    # Sentences longer than this (in characters) are not used for
    # conversational pairs.
    MAX_SENTENCE_CHARS = 300

    # Sentences must be strictly longer than this (in characters) to be kept
    # by ``clean_and_preprocess``.
    MIN_SENTENCE_CHARS = 20

    def __init__(self):
        # Not written or read by any method of this class; kept for
        # compatibility with external code that may use it.
        self.processed_data = []
        # Most recent raw text produced by an upload or by the sample data.
        self.raw_text = ""

    def upload_and_process_files(self):
        """Prompt for a Colab file upload and extract text from every file.

        File types are recognised by extension, case-insensitively
        (``.pdf``, ``.docx``, ``.txt``). Unsupported files and files that
        yield no text are reported and skipped.

        Returns:
            The extracted texts concatenated, each followed by a blank line.
            An empty string when nothing usable was uploaded. The same value
            is stored in ``self.raw_text``.
        """
        print("📁 Upload your data structures book (PDF, DOCX, or TXT)")
        print("Supported formats: .pdf, .docx, .txt")

        uploaded = files.upload()

        extracted = []
        for filename, content in uploaded.items():
            print(f"Processing: {filename}")

            # Compare extensions in lower case so "BOOK.PDF" is accepted too.
            lowered_name = filename.lower()
            if lowered_name.endswith('.pdf'):
                text = self.extract_from_pdf(content)
            elif lowered_name.endswith('.docx'):
                text = self.extract_from_docx(content)
            elif lowered_name.endswith('.txt'):
                # Undecodable bytes become U+FFFD instead of aborting the
                # whole upload with UnicodeDecodeError.
                text = content.decode('utf-8', errors='replace')
            else:
                print(f"Unsupported file format: {filename}")
                continue

            # A failed extraction returns ""; skipping it keeps the combined
            # result empty (falsy) when no file produced any text.
            if not text.strip():
                print(f"No text could be extracted from: {filename}")
                continue

            extracted.append(text + "\n\n")

        all_text = "".join(extracted)
        self.raw_text = all_text
        return all_text

    def extract_from_pdf(self, pdf_content):
        """Extract the text of every page of a PDF given as raw bytes.

        Args:
            pdf_content: The PDF file contents as bytes.

        Returns:
            The page texts, each followed by a newline, or "" if the PDF
            could not be read (the error is printed, not raised).
        """
        from io import BytesIO

        try:
            # Read from memory: no temporary file is created, so nothing can
            # be left behind in the working directory when parsing fails.
            pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_content))
            # A page may yield no text (e.g. a scanned image); treat a falsy
            # result as empty so the concatenation cannot fail.
            return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
        except Exception as e:
            # Best-effort by design: one unreadable file must not abort the upload.
            print(f"Error extracting PDF: {e}")
            return ""

    def extract_from_docx(self, docx_content):
        """Extract the paragraph text of a DOCX document given as raw bytes.

        Args:
            docx_content: The DOCX file contents as bytes.

        Returns:
            The paragraph texts, each followed by a newline, or "" if the
            document could not be read (the error is printed, not raised).
        """
        from io import BytesIO

        try:
            # Read from memory instead of round-tripping through a temp file.
            doc = docx.Document(BytesIO(docx_content))
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            # Best-effort by design: one unreadable file must not abort the upload.
            print(f"Error extracting DOCX: {e}")
            return ""

    def use_sample_data(self):
        """Return built-in sample data-structures text and store it in ``self.raw_text``."""
        sample_text = """
        Chapter 1: Arrays
        An array is a collection of elements stored in contiguous memory locations. Arrays provide constant-time access to elements using indices.

        Key characteristics of arrays:
        - Fixed size in most programming languages
        - Elements are of the same data type
        - Zero-based indexing
        - Random access capability

        Time complexities:
        - Access: O(1)
        - Search: O(n) for unsorted, O(log n) for sorted
        - Insertion: O(n)
        - Deletion: O(n)

        Chapter 2: Linked Lists
        A linked list is a linear data structure where elements are stored in nodes. Each node contains data and a reference to the next node.

        Types of linked lists:
        - Singly linked list
        - Doubly linked list
        - Circular linked list

        Advantages:
        - Dynamic size
        - Efficient insertion and deletion at the beginning
        - Memory efficient for sparse data

        Time complexities:
        - Access: O(n)
        - Search: O(n)
        - Insertion: O(1) at head, O(n) at arbitrary position
        - Deletion: O(1) at head, O(n) at arbitrary position

        Chapter 3: Stacks
        A stack is a Last-In-First-Out (LIFO) data structure. Elements are added and removed from the same end called the top.

        Basic operations:
        - Push: Add element to top
        - Pop: Remove element from top
        - Peek/Top: View top element without removing
        - isEmpty: Check if stack is empty

        Applications:
        - Function call management
        - Expression evaluation and syntax parsing
        - Undo operations in applications
        - Browser history navigation

        Chapter 4: Queues
        A queue is a First-In-First-Out (FIFO) data structure. Elements are added at the rear and removed from the front.

        Basic operations:
        - Enqueue: Add element to rear
        - Dequeue: Remove element from front
        - Front: View front element
        - Rear: View rear element

        Types:
        - Simple queue
        - Circular queue
        - Priority queue
        - Double-ended queue (deque)

        Chapter 5: Trees
        A tree is a hierarchical data structure consisting of nodes connected by edges. Each tree has a root node and subtrees.

        Binary Tree properties:
        - Each node has at most two children
        - Left and right subtrees are also binary trees
        - Height of tree affects performance

        Tree traversals:
        - Inorder: Left, Root, Right
        - Preorder: Root, Left, Right
        - Postorder: Left, Right, Root
        - Level order: Breadth-first traversal

        Chapter 6: Graphs
        A graph is a collection of vertices connected by edges. Graphs can represent networks, relationships, and connections.

        Types:
        - Directed vs Undirected
        - Weighted vs Unweighted
        - Cyclic vs Acyclic

        Representations:
        - Adjacency matrix
        - Adjacency list

        Common algorithms:
        - Depth-First Search (DFS)
        - Breadth-First Search (BFS)
        - Dijkstra's shortest path
        - Minimum spanning tree algorithms
        """

        self.raw_text = sample_text
        return sample_text

    def clean_and_preprocess(self, text):
        """Normalise raw text and split it into usable sentences.

        All whitespace runs (including newlines) are collapsed to a single
        space, so headings and bullet lines end up merged into neighbouring
        sentences. Characters other than word characters, whitespace and a
        fixed set of punctuation are removed.

        Args:
            text: Raw text to clean.

        Returns:
            A list of stripped sentences that are longer than
            ``MIN_SENTENCE_CHARS`` characters.
        """
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"\'`\-]', '', text)

        # Split into sentences, then drop the very short ones
        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        return [sentence for sentence in stripped if len(sentence) > self.MIN_SENTENCE_CHARS]

    def create_training_data(self, sentences):
        """Build "Human: ... / Assistant: ..." training strings from sentences.

        Two kinds of examples are produced, in this order:

        1. Conversational pairs: for each pair of adjacent sentences where
           both are at most ``MAX_SENTENCE_CHARS`` long and the first
           mentions a topic keyword, the lower-cased first sentence becomes
           a "Tell me about ..." prompt and the second is the answer.
        2. Direct Q&A: for each sentence of more than three words containing
           one of the ``QA_PATTERNS`` phrases, the question is the pattern's
           prefix followed by the sentence's first three words, and the
           answer is the sentence itself.

        Args:
            sentences: A list of sentence strings.

        Returns:
            A list of training strings.
        """
        training_data = []

        # Conversational pairs from adjacent sentences.
        for current_sentence, next_sentence in zip(sentences, sentences[1:]):
            # Skip if sentences are too long
            if (len(current_sentence) > self.MAX_SENTENCE_CHARS
                    or len(next_sentence) > self.MAX_SENTENCE_CHARS):
                continue

            lowered = current_sentence.lower()
            if any(keyword in lowered for keyword in self.TOPIC_KEYWORDS):
                training_data.append(f"Human: Tell me about {lowered}\nAssistant: {next_sentence}")

        # Direct Q&A pairs.
        for sentence in sentences:
            words = sentence.split()
            if len(words) <= 3:
                continue

            lowered = sentence.lower()
            for question_start, answer_start in self.QA_PATTERNS:
                if answer_start in lowered:
                    # The "concept" is simply the first three words of the sentence.
                    concept = " ".join(words[:3])
                    question = f"{question_start} {concept}?"
                    training_data.append(f"Human: {question}\nAssistant: {sentence}")
                    break

        return training_data

In [None]:

# =============================================================================
# CELL 3: GPU-OPTIMIZED MODEL TRAINER
# =============================================================================

class ColabGPUTrainer:
    """Load, tokenize for, fine-tune and save a causal language model.

    The module-level ``device`` decides between CUDA-specific settings
    (half-precision loading, mixed-precision training, pinned memory) and
    plain CPU settings.
    """

    def __init__(self, model_name="microsoft/DialoGPT-small"):
        """Remember which checkpoint to load; nothing is downloaded yet.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = device

    def load_model(self):
        """Load the tokenizer and model named by ``self.model_name``.

        On CUDA the weights are loaded in float16 and moved to the device;
        on CPU they are loaded in float32.

        Returns:
            True on success. False if the tokenizer or model could not be
            loaded (the error is printed), in which case ``self.tokenizer``
            and ``self.model`` are reset to None.
        """
        print(f"🚀 Loading {self.model_name} on {self.device}")
        use_cuda = self.device.type == 'cuda'

        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

            # Add padding token
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # The model is placed on the device explicitly below, so no
            # device_map is requested here.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if use_cuda else torch.float32,
            )
        except (OSError, ValueError) as exc:
            # Callers test the boolean result, so report the failure through
            # it instead of letting the exception escape.
            print(f"❌ Could not load {self.model_name}: {exc}")
            self.tokenizer = None
            self.model = None
            return False

        if use_cuda:
            self.model = self.model.to(self.device)

        print("✅ Model loaded successfully!")
        return True

    def prepare_dataset_gpu(self, training_data):
        """Tokenize training strings and split them into train/validation sets.

        Args:
            training_data: A list of training strings.

        Returns:
            A ``(train_dataset, eval_dataset)`` tuple; 10% of the examples
            go to the evaluation split.

        Raises:
            ValueError: If fewer than two examples are given, since both
                splits need at least one example.
        """
        print(f"📊 Preparing {len(training_data)} training examples...")

        if len(training_data) < 2:
            raise ValueError(
                "Need at least 2 training examples to build train and validation splits"
            )

        # Convert to dataset
        dataset = Dataset.from_pandas(pd.DataFrame({'text': training_data}))

        def tokenize_batch(examples):
            return self.tokenizer(
                examples['text'],
                truncation=True,
                padding=True,
                max_length=256,  # Reduced for T4 memory
            )

        tokenized_dataset = dataset.map(
            tokenize_batch,
            batched=True,
            remove_columns=['text']
        )

        # Split train/validation
        splits = tokenized_dataset.train_test_split(test_size=0.1)

        return splits['train'], splits['test']

    def train_model_gpu(self, train_dataset, eval_dataset, output_dir="./dsbot_model"):
        """Fine-tune ``self.model`` and save the model and tokenizer.

        Args:
            train_dataset: Tokenized training split.
            eval_dataset: Tokenized evaluation split.
            output_dir: Directory for checkpoints and the final model.

        Returns:
            ``output_dir``.
        """
        use_cuda = self.device.type == 'cuda'

        # Mixed-precision (fp16) training expects full-precision master
        # weights, but load_model loads float16 weights on CUDA, so upcast
        # before training. This is a no-op for a float32 model.
        self.model = self.model.float()

        # Training arguments optimized for T4
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=2,  # Reduced for faster training
            per_device_train_batch_size=4,  # Optimal for T4
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=2,
            warmup_steps=100,
            logging_steps=25,
            save_steps=250,
            eval_steps=250,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            dataloader_pin_memory=use_cuda,  # Pinned memory only helps with a GPU
            fp16=use_cuda,  # Mixed precision needs a GPU; stay in fp32 on CPU
            report_to="none",  # The string "none" disables wandb and other integrations
            eval_strategy="steps"  # Named evaluation_strategy in older transformers
        )

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Initialize trainer
        # NOTE(review): newer transformers releases may expect this argument
        # as `processing_class` instead of `tokenizer` - confirm against the
        # installed version.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )

        print("🎯 Starting training...")
        print(f"📈 Training on {len(train_dataset)} examples")
        print(f"📊 Validating on {len(eval_dataset)} examples")

        # Train the model
        trainer.train()

        # Save the model
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        print(f"✅ Training completed! Model saved to {output_dir}")
        return output_dir


In [None]:
# =============================================================================
# CELL 4: COLAB CHATBOT INTERFACE
# =============================================================================

class DSBotColabInterface:
    """Gradio chat front-end around a causal language model and its tokenizer.

    The module-level ``device`` decides whether the model and the prompt
    tensors are moved to CUDA.
    """

    # Number of most recent (user, bot) exchanges replayed in the prompt.
    MAX_HISTORY_TURNS = 3
    # Prompt budget in tokens; over-long prompts lose their OLDEST tokens.
    MAX_PROMPT_TOKENS = 400
    # Replies longer than this many characters are cut and suffixed with "...".
    MAX_RESPONSE_CHARS = 500
    # Shown when the model produces no usable text.
    FALLBACK_RESPONSE = "I'm not sure how to respond to that. Could you ask about data structures or algorithms?"

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer.

        Args:
            model: A model exposing ``generate`` and ``to``.
            tokenizer: The tokenizer matching ``model``.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        # Not used by the methods of this class; kept for compatibility.
        self.conversation_history = []

        # Move model to GPU if available
        if self.device.type == 'cuda':
            self.model = self.model.to(self.device)

    @staticmethod
    def _build_prompt(user_input, history=None):
        """Format the latest message plus recent exchanges as a prompt.

        Args:
            user_input: The new user message.
            history: Optional sequence of ``(user, bot)`` pairs, oldest first.

        Returns:
            The last three exchanges as "Human: ...\\nAssistant: ...\\n"
            lines, followed by the new message and a trailing "Assistant:".
        """
        recent = (history or [])[-DSBotColabInterface.MAX_HISTORY_TURNS:]
        context = "".join(f"Human: {turn[0]}\nAssistant: {turn[1]}\n" for turn in recent)
        return f"{context}Human: {user_input}\nAssistant:"

    @staticmethod
    def _clean_response(generated_text):
        """Reduce newly generated text to a single assistant reply.

        The text is cut at the first "Human:" or "Assistant:" marker, so
        turns the model invents after its own answer are discarded. The
        result is stripped and capped at ``MAX_RESPONSE_CHARS`` characters.

        Args:
            generated_text: Decoded text of the newly generated tokens only.

        Returns:
            The cleaned reply, possibly empty.
        """
        reply = generated_text
        for marker in ("Human:", "Assistant:"):
            reply = reply.split(marker, 1)[0]
        reply = reply.strip()

        limit = DSBotColabInterface.MAX_RESPONSE_CHARS
        if len(reply) > limit:
            reply = reply[:limit] + "..."
        return reply

    def generate_response(self, user_input, history=None):
        """Generate the bot's reply to ``user_input``.

        Args:
            user_input: The new user message.
            history: Optional sequence of ``(user, bot)`` pairs used as context.

        Returns:
            The reply text, a fallback sentence when the model produced
            nothing usable, or an apology containing the error message if
            generation raised. This method does not raise.
        """
        try:
            formatted_input = self._build_prompt(user_input, history)

            # Tokenize without truncation, then keep the END of the prompt.
            # Cutting on the right would drop the newest message and the
            # trailing "Assistant:" cue.
            inputs = self.tokenizer.encode(formatted_input, return_tensors='pt')
            inputs = inputs[:, -self.MAX_PROMPT_TOKENS:]

            if self.device.type == 'cuda':
                inputs = inputs.to(self.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    # Single unpadded sequence: every position is a real token.
                    attention_mask=torch.ones_like(inputs),
                    max_new_tokens=150,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    top_p=0.9,
                    repetition_penalty=1.1
                )

            # The output holds the prompt followed by the continuation, so
            # decode only the tokens after the prompt instead of searching
            # the decoded string for the last "Assistant:".
            new_tokens = outputs[0][inputs.shape[-1]:]
            response = self._clean_response(
                self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            )

            return response if response else self.FALLBACK_RESPONSE

        except Exception as e:
            # UI boundary: show the problem in the chat instead of breaking the app.
            return f"Sorry, I encountered an error: {str(e)}"

    def create_gradio_interface(self):
        """Build the Gradio Blocks chat UI and wire up its event handlers.

        Returns:
            The ``gr.Blocks`` interface, not yet launched.
        """

        def chat_fn(message, history):
            """Append one exchange; returns (cleared textbox value, new history)."""
            # Work on a copy so the history object passed in is never mutated.
            history = list(history or [])
            if not message.strip():
                return "", history

            response = self.generate_response(message, history)
            history.append([message, response])
            return "", history

        def clear_fn():
            """Reset the chat; values are ordered for outputs=[chatbot, msg]."""
            return [], ""

        # Create interface
        with gr.Blocks(
            title="DSBot - Data Structures Chatbot",
            theme=gr.themes.Soft(),
            css="""
            .gradio-container {
                max-width: 800px !important;
                margin: auto !important;
            }
            .chat-message {
                padding: 10px !important;
                margin: 5px !important;
            }
            """
        ) as interface:

            gr.Markdown(
                """
                # 🤖 DSBot - Data Structures Chatbot

                Welcome to DSBot! I'm your AI assistant for data structures and algorithms.
                Ask me about arrays, linked lists, trees, graphs, sorting algorithms, and more!

                **🚀 Powered by T4 GPU | 🧠 Fine-tuned on Data Structures Content**
                """
            )

            chatbot = gr.Chatbot(
                height=400,
                show_label=False,
                container=True,
                bubble_full_width=False
            )

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask me about data structures, algorithms, or implementations...",
                    container=False,
                    scale=4
                )
                send_btn = gr.Button("Send", scale=1, variant="primary")

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat", scale=1)

            # Example questions
            gr.Markdown("### 💡 Try these example questions:")
            example_questions = [
                "What is a binary search tree?",
                "How does quicksort work?",
                "Explain hash table collisions",
                "What's the time complexity of merge sort?",
                "How do you implement a stack?",
                "What's the difference between DFS and BFS?"
            ]

            # Two example buttons per column.
            with gr.Row():
                for start in range(0, len(example_questions), 2):
                    with gr.Column():
                        for question in example_questions[start:start + 2]:
                            example_btn = gr.Button(
                                question,
                                size="sm",
                                variant="secondary"
                            )
                            # Bind the question as a default argument; a plain
                            # closure would see only the last loop value.
                            # Clicking fills the textbox and resets the chat.
                            example_btn.click(
                                lambda x=question: (x, []),
                                outputs=[msg, chatbot]
                            )

            # Event handlers
            msg.submit(chat_fn, [msg, chatbot], [msg, chatbot])
            send_btn.click(chat_fn, [msg, chatbot], [msg, chatbot])
            clear_btn.click(clear_fn, outputs=[chatbot, msg])

            gr.Markdown(
                """
                ---
                **Tips:**
                - Ask specific questions about data structures or algorithms
                - Request code implementations or examples
                - Inquire about time/space complexity analysis
                - Get explanations of concepts from your textbook
                """
            )

        return interface

    def launch_interface(self, share=True):
        """Build and launch the Gradio interface.

        Args:
            share: Passed through to ``launch`` as its ``share`` argument.
        """
        interface = self.create_gradio_interface()
        print("🚀 Launching DSBot interface...")
        interface.launch(share=share, debug=False, height=600)

In [None]:
# =============================================================================
# CELL 5: MAIN APPLICATION CONTROLLER
# =============================================================================

class DSBotColabApp:
    """Main application controller for Colab.

    Ties together the data processor, the trainer and the chat interface.
    Both entry points return True on success and False when a step could
    not be completed.
    """

    # Fewest examples that still allow a train/validation split.
    MIN_TRAINING_EXAMPLES = 2

    def __init__(self):
        self.data_processor = ColabDataProcessor()
        self.trainer = ColabGPUTrainer()
        # Created once a model is available (see _launch_interface).
        self.interface = None

    def _launch_interface(self):
        """Wrap the trainer's current model in a chat interface and launch it."""
        self.interface = DSBotColabInterface(
            self.trainer.model,
            self.trainer.tokenizer
        )
        self.interface.launch_interface()

    def quick_demo(self):
        """Launch the chat interface on the base model, without fine-tuning.

        Returns:
            True if the interface was launched, False if the model could
            not be loaded.
        """
        print("🚀 Setting up DSBot Demo...")

        # Load pre-trained model
        if not self.trainer.load_model():
            print("❌ Failed to load model!")
            return False

        self._launch_interface()
        return True

    def full_training_pipeline(self, use_uploaded_data=True):
        """Collect text, build training data, fine-tune, then launch the chat UI.

        Args:
            use_uploaded_data: When True, prompt for a file upload;
                otherwise use the built-in sample text.

        Returns:
            True when training finished and the interface was launched.
            False when no text was available, too few training examples
            could be built, or the base model failed to load.
        """
        print("🎯 Starting DSBot Full Training Pipeline...")

        # Step 1: Get training data
        if use_uploaded_data:
            print("📁 Please upload your book files...")
            raw_text = self.data_processor.upload_and_process_files()
        else:
            print("📚 Using sample data structures content...")
            raw_text = self.data_processor.use_sample_data()

        # Whitespace-only text (e.g. every extraction failed) counts as no data.
        if not raw_text or not raw_text.strip():
            print("❌ No text data available!")
            return False

        print(f"📊 Extracted {len(raw_text)} characters of text")

        # Step 2: Preprocess data
        print("🔄 Preprocessing text...")
        sentences = self.data_processor.clean_and_preprocess(raw_text)
        training_data = self.data_processor.create_training_data(sentences)

        print(f"✅ Created {len(training_data)} training examples")

        # Stop before loading the model when a train/validation split is impossible.
        if len(training_data) < self.MIN_TRAINING_EXAMPLES:
            print("❌ Not enough training examples to train on. Please provide more text data.")
            return False

        if len(training_data) < 10:
            print("⚠️ Warning: Very few training examples. Consider using more text data.")

        # Step 3: Load and train model
        print("🤖 Loading base model...")
        if not self.trainer.load_model():
            print("❌ Failed to load base model!")
            return False

        print("📈 Preparing datasets...")
        train_dataset, eval_dataset = self.trainer.prepare_dataset_gpu(training_data)

        print("🎯 Training model...")
        self.trainer.train_model_gpu(train_dataset, eval_dataset)

        # Step 4: Create interface with trained model
        print("🚀 Setting up trained model interface...")
        self._launch_interface()

        print("✅ DSBot training and deployment completed!")
        return True


In [None]:
# =============================================================================
# CELL 6: USAGE EXAMPLES AND EXECUTION
# =============================================================================

def run_dsbot_demo():
    """Create a fresh ``DSBotColabApp`` and run its quick demo.

    Returns:
        Whatever ``DSBotColabApp.quick_demo`` returns.
    """
    return DSBotColabApp().quick_demo()

def run_dsbot_training(use_sample_data=False):
    """Create a fresh ``DSBotColabApp`` and run its full training pipeline.

    Args:
        use_sample_data: When true, the pipeline is told not to use
            uploaded data; when false (default), it is told to use it.

    Returns:
        Whatever ``DSBotColabApp.full_training_pipeline`` returns.
    """
    pipeline_app = DSBotColabApp()
    # The pipeline's flag is the inverse of this function's flag.
    wants_upload = not use_sample_data
    return pipeline_app.full_training_pipeline(use_uploaded_data=wants_upload)

In [None]:
# =============================================================================
# EXECUTION INSTRUCTIONS
# =============================================================================

# Usage banner shown when the cell runs: one argument per output line, with
# empty strings standing for the blank lines.
print(
    "=" * 60,
    "🤖 DSBot - Data Structures Chatbot for Google Colab",
    "=" * 60,
    "",
    "Choose your option:",
    "1. Quick Demo (uses pre-trained model)",
    "2. Full Training (with your book upload)",
    "3. Training with Sample Data (for testing)",
    "",
    "📋 Usage:",
    "• For quick demo: run_dsbot_demo()",
    "• For full training: run_dsbot_training()",
    "• For sample data training: run_dsbot_training(use_sample_data=True)",
    "",
    "💡 Tips for T4 GPU:",
    "• Training typically takes 15-30 minutes",
    "• Use batch size 4 for optimal memory usage",
    "• Mixed precision (fp16) is enabled for speed",
    "• Model will be saved in ./dsbot_model/",
    "",
    "🚀 Ready to start! Run one of the functions above.",
    sep="\n",
)