<a href="https://colab.research.google.com/github/maruf4461/Comparative-analysis-of-RAG-performance-on-Open-Source-LLM_openDB/blob/main/01_Environment_Setup_Complete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Complete RAG Research Implementation
# ==========================================

# CELL 1: GPU Check and Drive Mount

In [5]:
import torch
import os
import sys
from google.colab import drive
import subprocess

print("🔍  Checking GPU availability...")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name()
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅  GPU Available: {gpu_name}")
    print(f"📊  GPU Memory: {gpu_memory:.1f} GB")

    if gpu_memory < 15:
        print("⚠️  Warning: Less than 15GB GPU memory. Will use quantization for larger models.")
else:
    print("❌  No GPU available. Please enable GPU in Runtime > Change runtime type")
    print("   Go to Runtime > Change runtime type > Hardware accelerator > GPU")

# Mount Google Drive
print("\n📂  Mounting Google Drive...")
drive.mount('/content/drive')

# Create comprehensive project structure
project_dir = '/content/drive/MyDrive/RAG_Research_Complete'
directories = [
    'data/raw/msmarco',
    'data/raw/natural_questions',
    'data/raw/squad',
    'data/raw/hotpotqa',
    'data/processed/chunks',
    'data/processed/embeddings',
    'models/llama2_7b',
    'models/llama2_13b',
    'models/mistral_7b',
    'models/codellama_7b',
    'models/llama3_8b',
    'results/experiments',
    'results/analysis',
    'results/plots',
    'results/tables',
    'src/models',
    'src/evaluation',
    'src/data_processing',
    'configs',
    'logs',
    'checkpoints',
    'python_files'
]

for dir_path in directories:
    full_path = os.path.join(project_dir, dir_path)
    os.makedirs(full_path, exist_ok=True)

print("✅  Project structure created!")
print(f"📁  Project directory: {project_dir}")


🔍  Checking GPU availability...
✅  GPU Available: Tesla T4
📊  GPU Memory: 15.8 GB

📂  Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅  Project structure created!
📁  Project directory: /content/drive/MyDrive/RAG_Research_Complete


# CELL 2: Install All Required Dependencies

In [6]:
def install_package(package):
    """Install package with progress tracking"""
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        return True
    except subprocess.CalledProcessError:
        return False

# Core ML packages
core_packages = [
    "transformers>=4.35.0",
    "accelerate>=0.24.0",
    "bitsandbytes>=0.41.0",
    "sentencepiece>=0.1.99",
    "torch>=2.0.0",
    "datasets>=2.14.0"
]

# RAG specific packages
rag_packages = [
    "sentence-transformers>=2.2.2",
    "chromadb>=0.4.15",
    "faiss-cpu>=1.7.4",
    "langchain>=0.0.330",
    "tiktoken>=0.5.1"
]

# Evaluation packages
eval_packages = [
    "rouge-score>=0.1.2",
    "bert-score>=0.3.13",
    "sacrebleu>=2.3.1",
    "nltk>=3.8.1",
    "spacy>=3.7.0"
]

# Data processing packages
data_packages = [
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "scipy>=1.10.0",
    "scikit-learn>=1.3.0",
    "tqdm>=4.65.0"
]

# Visualization packages
viz_packages = [
    "matplotlib>=3.7.0",
    "seaborn>=0.12.0",
    "plotly>=5.17.0",
    "kaleido>=0.2.1"
]

# Statistics packages
stats_packages = [
    "statsmodels>=0.14.0",
    "pingouin>=0.5.3",
    "scipy>=1.10.0"
]

# Utility packages
util_packages = [
    "python-dotenv>=1.0.0",
    "wandb>=0.16.0",
    "huggingface_hub>=0.17.0",
    "psutil>=5.9.0",
    "requests>=2.31.0"
]

all_packages = core_packages + rag_packages + eval_packages + data_packages + viz_packages + stats_packages + util_packages

print("🔧  Installing all required packages...")
print("=" * 60)

failed_packages = []
for i, package in enumerate(all_packages):
    print(f"📦  [{i+1}/{len(all_packages)}] Installing {package.split('>=')[0]}...")
    if not install_package(package):
        failed_packages.append(package)
        print(f"❌  Failed: {package}")
    else:
        print(f"✅  Success: {package.split('>=')[0]}")

print("\n" + "=" * 60)
if failed_packages:
    print(f"❌  Failed packages ({len(failed_packages)}):")
    for pkg in failed_packages:
        print(f"   - {pkg}")
else:
    print("✅  All packages installed successfully!")

# Download required NLTK data
import nltk
print("\n📚  Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
print("✅  NLTK data downloaded")


🔧  Installing all required packages...
📦  [1/33] Installing transformers...
✅  Success: transformers
📦  [2/33] Installing accelerate...
✅  Success: accelerate
📦  [3/33] Installing bitsandbytes...
✅  Success: bitsandbytes
📦  [4/33] Installing sentencepiece...
✅  Success: sentencepiece
📦  [5/33] Installing torch...
✅  Success: torch
📦  [6/33] Installing datasets...
✅  Success: datasets
📦  [7/33] Installing sentence-transformers...
✅  Success: sentence-transformers
📦  [8/33] Installing chromadb...
✅  Success: chromadb
📦  [9/33] Installing faiss-cpu...
✅  Success: faiss-cpu
📦  [10/33] Installing langchain...
✅  Success: langchain
📦  [11/33] Installing tiktoken...
✅  Success: tiktoken
📦  [12/33] Installing rouge-score...
✅  Success: rouge-score
📦  [13/33] Installing bert-score...
✅  Success: bert-score
📦  [14/33] Installing sacrebleu...
✅  Success: sacrebleu
📦  [15/33] Installing nltk...
✅  Success: nltk
📦  [16/33] Installing spacy...
✅  Success: spacy
📦  [17/33] Installing pandas...
✅  Suc

# CELL 3: Create Utility Classes

In [7]:
import json
import pickle
import time
from datetime import datetime
from typing import Dict, List, Any, Optional
import pandas as pd

class ProjectUtils:
    """Comprehensive utility class for RAG research project"""

    def __init__(self, project_dir: str = '/content/drive/MyDrive/RAG_Research_Complete'):
        self.project_dir = project_dir
        self.logs = []

    def log(self, message: str, level: str = "INFO"):
        """Log messages with timestamp"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {level}: {message}"
        self.logs.append(log_entry)
        print(log_entry)

        # Save to log file
        log_file = os.path.join(self.project_dir, 'logs', 'experiment.log')
        with open(log_file, 'a') as f:
            f.write(log_entry + '\n')

    def save_data(self, data: Any, filepath: str, format: str = 'json'):
        """Save data in various formats"""
        full_path = os.path.join(self.project_dir, filepath)
        os.makedirs(os.path.dirname(full_path), exist_ok=True)

        try:
            if format == 'json':
                with open(full_path, 'w') as f:
                    json.dump(data, f, indent=2, default=str)
            elif format == 'pickle':
                with open(full_path, 'wb') as f:
                    pickle.dump(data, f)
            elif format == 'csv':
                data.to_csv(full_path, index=False)
            elif format == 'parquet':
                data.to_parquet(full_path, index=False)

            self.log(f"Saved data to {filepath}")
            return True
        except Exception as e:
            self.log(f"Failed to save {filepath}: {e}", "ERROR")
            return False

    def load_data(self, filepath: str, format: str = 'json'):
        """Load data in various formats"""
        full_path = os.path.join(self.project_dir, filepath)

        try:
            if format == 'json':
                with open(full_path, 'r') as f:
                    return json.load(f)
            elif format == 'pickle':
                with open(full_path, 'rb') as f:
                    return pickle.load(f)
            elif format == 'csv':
                return pd.read_csv(full_path)
            elif format == 'parquet':
                return pd.read_parquet(full_path)
        except Exception as e:
            self.log(f"Failed to load {filepath}: {e}", "ERROR")
            return None

    def clear_gpu_memory(self):
        """Clear GPU memory"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.log("GPU memory cleared")

    def get_system_info(self):
        """Get comprehensive system information"""
        import psutil

        info = {
            'timestamp': datetime.now().isoformat(),
            'gpu_available': torch.cuda.is_available(),
            'gpu_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
            'cpu_count': psutil.cpu_count(),
            'memory_total_gb': psutil.virtual_memory().total / 1e9,
            'memory_available_gb': psutil.virtual_memory().available / 1e9,
            'disk_free_gb': psutil.disk_usage('/content').free / 1e9
        }

        if torch.cuda.is_available():
            info['gpu_name'] = torch.cuda.get_device_name()
            info['gpu_memory_gb'] = torch.cuda.get_device_properties(0).total_memory / 1e9
            info['gpu_memory_allocated_gb'] = torch.cuda.memory_allocated() / 1e9

        return info

    def save_checkpoint(self, data: Dict[str, Any], name: str):
        """Save experiment checkpoint"""
        timestamp = int(time.time())
        checkpoint_data = {
            'timestamp': timestamp,
            'datetime': datetime.now().isoformat(),
            'system_info': self.get_system_info(),
            'data': data
        }

        filepath = f"checkpoints/{name}_{timestamp}.json"
        return self.save_data(checkpoint_data, filepath, 'json')

    def load_latest_checkpoint(self, name_pattern: str):
        """Load most recent checkpoint matching pattern"""
        checkpoint_dir = os.path.join(self.project_dir, 'checkpoints')
        if not os.path.exists(checkpoint_dir):
            return None

        files = [f for f in os.listdir(checkpoint_dir)
                if name_pattern in f and f.endswith('.json')]

        if not files:
            return None

        # Sort by timestamp in filename
        latest_file = max(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))
        return self.load_data(f"checkpoints/{latest_file}", 'json')

# Initialize utils
utils = ProjectUtils()
utils.log("Project utilities initialized")

[2025-06-24 10:12:17] INFO: Project utilities initialized


# CELL 4: Test Setup and Save Configuration

In [9]:
# Test the setup
system_info = utils.get_system_info()
utils.log("System information collected")

# Save system configuration
config = {
    'project_name': 'RAG_Research_Complete',
    'setup_timestamp': datetime.now().isoformat(),
    'system_info': system_info,
    'models_to_evaluate': [
        'meta-llama/Llama-2-7b-chat-hf',
        'meta-llama/Llama-2-13b-chat-hf',
        'mistralai/Mistral-7B-Instruct-v0.1',
        'codellama/CodeLlama-7b-Instruct-hf',
        'meta-llama/Meta-Llama-3-8B-Instruct'
    ],
    'datasets': [
        'ms_marco',
        'natural_questions',
        'squad_v2',
        'hotpot_qa'
    ],
    'rag_configurations': [
        'basic_rag',
        'enhanced_rag',
        'optimized_rag'
    ],
    'evaluation_metrics': [
        'rouge_l',
        'bleu',
        'bert_score',
        'recall_at_k',
        'mrr',
        'ndcg'
    ]
}

utils.save_data(config, 'configs/project_config.json')

# Print setup summary
print("\n" + "="*80)
print("🎯  RAG RESEARCH PROJECT SETUP COMPLETE")
print("="*80)
print(f"📁  Project Directory: {utils.project_dir}")
print(f"🖥   GPU: {system_info.get('gpu_name', 'Not available')}")
print(f"💾  GPU Memory: {system_info.get('gpu_memory_gb', 0):.1f} GB")
print(f"🧠  RAM: {system_info['memory_total_gb']:.1f} GB")
print(f"💿  Disk Free: {system_info['disk_free_gb']:.1f} GB")
print(f"📊  Models to test: {len(config['models_to_evaluate'])}")
print(f"📚  Datasets: {len(config['datasets'])}")
print(f"⚙️   RAG configs: {len(config['rag_configurations'])}")
print(f"📈  Metrics: {len(config['evaluation_metrics'])}")
print("\n✅  Ready to proceed to Phase 2: Data Preparation")
print("="*80)

[2025-06-24 10:12:27] INFO: System information collected
[2025-06-24 10:12:27] INFO: Saved data to configs/project_config.json

🎯  RAG RESEARCH PROJECT SETUP COMPLETE
📁  Project Directory: /content/drive/MyDrive/RAG_Research_Complete
🖥   GPU: Tesla T4
💾  GPU Memory: 15.8 GB
🧠  RAM: 13.6 GB
💿  Disk Free: 75.6 GB
📊  Models to test: 5
📚  Datasets: 4
⚙️   RAG configs: 3
📈  Metrics: 6

✅  Ready to proceed to Phase 2: Data Preparation
