<a href="https://colab.research.google.com/github/maruf4461/Rag_primary/blob/main/02_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

02_Data_Preparation.ipynb

# CELL 1: Setup and Imports

In [None]:
from google.colab import drive
import sys
import os

# Mount drive and setup paths
drive.mount('/content/drive')

# Register the project source directory only once: re-running this cell must
# not keep appending duplicate entries to sys.path.
SRC_DIR = '/content/drive/MyDrive/RAG_Research/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# colab_utils is imported only after the path above is registered.
from colab_utils import ColabUtils
utils = ColabUtils()

# Check runtime (kept as the last expression so the cell displays its result)
utils.get_runtime_info()

Mounted at /content/drive
🖥️  Runtime Info:
   GPU Memory: 0.0/0.0 GB
   RAM: 1.2/13.6 GB
💾 Available disk space: 70.19 GB
   Disk: 70.2 GB free


{'gpu_total': 0,
 'gpu_used': 0,
 'ram_total': 13.608370176,
 'ram_used': 1.17116928}

# CELL 2: Download Datasets

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

def download_and_prepare_datasets():
    """Fetch small QA samples and store them as CSV files via ``utils.save_to_drive``.

    Three datasets are attempted in order:

    1. Natural Questions (first 1000 training rows, at most 500 kept),
    2. SQuAD 2.0 (first 1000 training rows),
    3. a two-row hand-written test set.

    A failure in step 1 or 2 is printed and does not stop the remaining
    steps. Nothing is returned.

    Relies on ``load_dataset``, ``pd``, ``tqdm`` and the notebook-level
    ``utils`` object being available in the kernel.
    """
    print("📥 Downloading datasets...")

    # --- 1. Natural Questions (smaller subset for Colab) ---
    print("1. Natural Questions...")
    try:
        nq_dataset = load_dataset("natural_questions", split="train[:1000]")  # Small subset

        nq_records = []
        for sample in tqdm(nq_dataset):
            annotations = sample['annotations']
            # NOTE(review): this keeps rows whose first yes/no label is set;
            # presumably the intent was "has a short answer" — confirm against
            # the dataset schema.
            if annotations['yes_no_answer'][0] == -1:
                continue
            nq_records.append({
                'question': sample['question']['text'],
                'context': sample['document']['html'],  # raw HTML, cleaned later
                # NOTE(review): 'text' may be a list rather than a string in
                # this schema — verify before relying on the saved column.
                'answer': annotations['short_answers'][0]['text'] if annotations['short_answers'] else ""
            })

        # Limit for Colab
        nq_df = pd.DataFrame(nq_records[:500])
        utils.save_to_drive(nq_df, "data/raw/natural_questions.csv")
        print(f"   ✅ Natural Questions: {len(nq_df)} samples")

    except Exception as err:
        print(f"   ❌ Error with Natural Questions: {err}")

    # --- 2. SQuAD 2.0 (more manageable) ---
    print("2. SQuAD 2.0...")
    try:
        squad_dataset = load_dataset("squad_v2", split="train[:1000]")

        # An empty answer list becomes an empty answer string.
        squad_records = [
            {
                'question': sample['question'],
                'context': sample['context'],
                'answer': sample['answers']['text'][0] if sample['answers']['text'] else ""
            }
            for sample in tqdm(squad_dataset)
        ]

        squad_df = pd.DataFrame(squad_records)
        utils.save_to_drive(squad_df, "data/raw/squad_v2.csv")
        print(f"   ✅ SQuAD 2.0: {len(squad_df)} samples")

    except Exception as err:
        print(f"   ❌ Error with SQuAD: {err}")

    # --- 3. Small hand-written test dataset ---
    print("3. Creating test dataset...")
    test_rows = [
        (
            'What is the capital of France?',
            'France is a country in Europe. Paris is the capital and largest city of France.',
            'Paris',
        ),
        (
            'Who wrote Romeo and Juliet?',
            'William Shakespeare was an English playwright and poet. He wrote many famous plays including Romeo and Juliet.',
            'William Shakespeare',
        ),
    ]

    test_df = pd.DataFrame(test_rows, columns=['question', 'context', 'answer'])
    utils.save_to_drive(test_df, "data/raw/test_dataset.csv")
    print(f"   ✅ Test dataset: {len(test_df)} samples")

# Run the download.
# NOTE(review): a later cell defines another function with this same name; once
# that cell runs, the name refers to the later definition, not this one.
download_and_prepare_datasets()


📥 Downloading datasets...
1. Natural Questions...
   ❌ Error with Natural Questions: Invalid pattern: '**' can only be an entire path component
2. SQuAD 2.0...
   ❌ Error with SQuAD: Invalid pattern: '**' can only be an entire path component
3. Creating test dataset...
✅ Saved to: /content/drive/MyDrive/RAG_Research/data/raw/test_dataset.csv
   ✅ Test dataset: 2 samples


In [None]:
import pandas as pd
import requests
import json
from tqdm import tqdm
import os

def _iter_squad_records(squad_json):
    """Yield one question/context/answer dict per QA pair of a SQuAD v1.1 JSON payload."""
    for article in squad_json['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                answers = qa['answers']
                yield {
                    'question': qa['question'],
                    'context': context,
                    # Use the first listed answer; empty string when none is given.
                    'answer': answers[0]['text'] if answers else ""
                }


def download_and_prepare_datasets(max_squad_samples=1000, request_timeout=30):
    """Download and prepare datasets for RAG evaluation - Manual download approach.

    Three CSV files are written through ``utils.save_to_drive``:

    * ``data/raw/squad.csv`` - QA pairs taken from the SQuAD v1.1 training
      JSON, fetched over HTTP,
    * ``data/raw/knowledge_dataset.csv`` - hand-written general-knowledge pairs,
    * ``data/raw/additional_qa.csv`` - a few extra topic-specific pairs.

    Errors in the SQuAD and additional-data steps are printed and do not
    abort the function. The closing summary lists only the files that were
    actually written in this call.

    Args:
        max_squad_samples: Maximum number of SQuAD QA pairs to keep.
            ``None`` keeps every pair in the downloaded file.
        request_timeout: Seconds to wait for the SQuAD download before
            giving up, so a stalled connection cannot hang the cell.

    Returns:
        None.

    Relies on ``requests``, ``pd``, ``tqdm`` and the notebook-level ``utils``
    object being available in the kernel.
    """
    from itertools import islice

    print("📥 Downloading datasets...")

    # Counts stay None when the corresponding file was not written.
    squad_count = None
    additional_count = None

    # Method 1: Download the SQuAD 1.1 training JSON directly from the SQuAD explorer site
    print("1. SQuAD dataset (manual download)...")
    try:
        url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"

        print("   Downloading SQuAD train data...")
        response = requests.get(url, timeout=request_timeout)
        # Fail here on an HTTP error instead of trying to parse an error page as JSON.
        response.raise_for_status()
        squad_json = response.json()

        # Stop reading as soon as the limit is reached (limit for Colab).
        squad_data = list(tqdm(
            islice(_iter_squad_records(squad_json), max_squad_samples),
            total=max_squad_samples,
        ))

        squad_df = pd.DataFrame(squad_data, columns=['question', 'context', 'answer'])
        utils.save_to_drive(squad_df, "data/raw/squad.csv")
        print(f"   ✅ SQuAD: {len(squad_df)} samples")
        squad_count = len(squad_df)

    except Exception as e:
        print(f"   ❌ Error downloading SQuAD: {e}")

    # Method 2: Create a larger synthetic dataset based on common knowledge
    print("2. Creating expanded knowledge dataset...")

    knowledge_data = [
        {
            'question': 'What is the capital of France?',
            'context': 'France is a country in Europe. Paris is the capital and largest city of France. It is located in the north-central part of the country and serves as the political, economic, and cultural center. Paris has a population of over 2 million people.',
            'answer': 'Paris'
        },
        {
            'question': 'Who wrote Romeo and Juliet?',
            'context': 'William Shakespeare was an English playwright and poet who lived from 1564 to 1616. He wrote many famous plays including Romeo and Juliet, Hamlet, Macbeth, and Othello during the Elizabethan era. Romeo and Juliet is a tragedy about young star-crossed lovers.',
            'answer': 'William Shakespeare'
        },
        {
            'question': 'What is machine learning?',
            'context': 'Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed. It uses algorithms to analyze data, identify patterns, and make predictions or decisions.',
            'answer': 'A subset of artificial intelligence that enables computers to learn from data'
        },
        {
            'question': 'What is the largest planet in our solar system?',
            'context': 'Jupiter is the largest planet in our solar system. It is a gas giant with a mass more than twice that of all other planets combined. Jupiter has a distinctive Great Red Spot, which is a giant storm, and has over 80 known moons.',
            'answer': 'Jupiter'
        },
        {
            'question': 'Who painted the Mona Lisa?',
            'context': 'Leonardo da Vinci was an Italian Renaissance artist, inventor, and scientist who lived from 1452 to 1519. He painted the famous Mona Lisa portrait between 1503 and 1519. The painting is now housed in the Louvre Museum in Paris and is considered one of the most famous artworks in the world.',
            'answer': 'Leonardo da Vinci'
        },
        {
            'question': 'What is photosynthesis?',
            'context': 'Photosynthesis is the process by which plants and other organisms convert light energy, usually from the sun, into chemical energy. This process occurs in chloroplasts and produces glucose and oxygen from carbon dioxide and water using chlorophyll.',
            'answer': 'The process by which plants convert light energy into chemical energy'
        },
        {
            'question': 'When did World War II end?',
            'context': 'World War II was a global conflict that lasted from 1939 to 1945. The war in Europe ended on May 8, 1945, when Germany surrendered to the Allied forces. The war in the Pacific ended on September 2, 1945, after Japan surrendered following the atomic bombings of Hiroshima and Nagasaki.',
            'answer': '1945'
        },
        {
            'question': 'What is DNA?',
            'context': 'DNA (Deoxyribonucleic acid) is a molecule that carries genetic information in living organisms. It consists of two strands forming a double helix structure and contains instructions for the development, functioning, and reproduction of all known life forms.',
            'answer': 'A molecule that carries genetic information'
        },
        {
            'question': 'What is the speed of light?',
            'context': 'The speed of light in a vacuum is approximately 299,792,458 meters per second, often rounded to 300,000 kilometers per second. This is a fundamental physical constant denoted by the letter c. Nothing with mass can travel faster than the speed of light.',
            'answer': '299,792,458 meters per second'
        },
        {
            'question': 'Who invented the telephone?',
            'context': 'Alexander Graham Bell is credited with inventing the telephone in 1876. Bell was a Scottish-American inventor and scientist who was working on improving the telegraph when he developed the first practical telephone. He received the first US patent for the telephone on March 7, 1876.',
            'answer': 'Alexander Graham Bell'
        },
        {
            'question': 'What is the chemical formula for water?',
            'context': 'Water is a chemical compound consisting of two hydrogen atoms and one oxygen atom. Its chemical formula is H2O. Water is essential for all known forms of life and covers about 71% of Earth\'s surface.',
            'answer': 'H2O'
        },
        {
            'question': 'What is gravity?',
            'context': 'Gravity is a fundamental force of nature that attracts objects with mass toward each other. On Earth, gravity gives weight to physical objects and causes them to fall toward the ground when dropped. The strength of gravity depends on the mass of the objects and the distance between them.',
            'answer': 'A fundamental force that attracts objects with mass toward each other'
        },
        {
            'question': 'Who was the first person to walk on the moon?',
            'context': 'Neil Armstrong was the first person to walk on the moon. He stepped onto the lunar surface on July 20, 1969, during the Apollo 11 mission. His famous words upon stepping onto the moon were "That\'s one small step for man, one giant leap for mankind."',
            'answer': 'Neil Armstrong'
        },
        {
            'question': 'What is the boiling point of water?',
            'context': 'The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at standard atmospheric pressure (1 atmosphere or 101.325 kPa). At this temperature, water changes from liquid to gas (steam). The boiling point can vary with altitude and pressure.',
            'answer': '100 degrees Celsius'
        },
        {
            'question': 'What is the smallest unit of matter?',
            'context': 'An atom is the smallest unit of ordinary matter that forms a chemical element. Atoms consist of a nucleus containing protons and neutrons, surrounded by electrons. Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms.',
            'answer': 'Atom'
        },
        {
            'question': 'Who wrote "To Kill a Mockingbird"?',
            'context': 'Harper Lee wrote the novel "To Kill a Mockingbird," which was published in 1960. The book won the Pulitzer Prize for Fiction in 1961 and became a classic of modern American literature. It deals with issues of racial injustice and moral growth.',
            'answer': 'Harper Lee'
        },
        {
            'question': 'What is the largest ocean on Earth?',
            'context': 'The Pacific Ocean is the largest ocean on Earth, covering about one-third of the planet\'s surface. It extends from the Arctic to the Southern Ocean and is bounded by Asia and Australia on the west and the Americas on the east.',
            'answer': 'Pacific Ocean'
        },
        {
            'question': 'What is artificial intelligence?',
            'context': 'Artificial Intelligence (AI) is the simulation of human intelligence in machines that are programmed to think and learn like humans. AI systems can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation.',
            'answer': 'The simulation of human intelligence in machines'
        },
        {
            'question': 'What is the periodic table?',
            'context': 'The periodic table is a tabular arrangement of chemical elements, ordered by their atomic number (number of protons). Elements with similar properties are grouped together in columns called groups or families. It was first developed by Dmitri Mendeleev in 1869.',
            'answer': 'A tabular arrangement of chemical elements ordered by atomic number'
        },
        {
            'question': 'What causes seasons on Earth?',
            'context': 'Seasons on Earth are caused by the tilt of Earth\'s axis as it orbits the sun. The Earth\'s axis is tilted at about 23.5 degrees. When a hemisphere is tilted toward the sun, it experiences summer, and when tilted away, it experiences winter.',
            'answer': 'The tilt of Earth\'s axis as it orbits the sun'
        }
    ]

    knowledge_df = pd.DataFrame(knowledge_data)
    utils.save_to_drive(knowledge_df, "data/raw/knowledge_dataset.csv")
    print(f"   ✅ Knowledge dataset: {len(knowledge_df)} samples")

    # Method 3: More synthetic QA data, grouped by topic
    print("3. Attempting to get more QA data...")
    try:
        # You can add more datasets here by downloading from other sources
        # For now, we'll create more synthetic data

        # Science questions
        science_qa = [
            ("What is photosynthesis?", "Photosynthesis is the process used by plants to convert sunlight into energy.", "The process used by plants to convert sunlight into energy"),
            ("What is an ecosystem?", "An ecosystem is a community of living organisms interacting with their physical environment.", "A community of living organisms interacting with their environment"),
            ("What is evolution?", "Evolution is the process by which species change over time through natural selection.", "The process by which species change over time"),
        ]

        # History questions
        history_qa = [
            ("When did the American Civil War end?", "The American Civil War ended in 1865 when the Confederate forces surrendered.", "1865"),
            ("Who was the first President of the United States?", "George Washington was the first President of the United States, serving from 1789 to 1797.", "George Washington"),
            ("When did World War I begin?", "World War I began in 1914 and lasted until 1918.", "1914"),
        ]

        # Geography questions
        geography_qa = [
            ("What is the longest river in the world?", "The Nile River is generally considered the longest river in the world, flowing through northeastern Africa.", "The Nile River"),
            ("What is the highest mountain in the world?", "Mount Everest is the highest mountain in the world, located in the Himalayas.", "Mount Everest"),
            ("What is the largest country by area?", "Russia is the largest country in the world by land area, covering over 17 million square kilometers.", "Russia"),
        ]

        additional_data = [
            {'question': question, 'context': context, 'answer': answer}
            for question, context, answer in science_qa + history_qa + geography_qa
        ]

        if additional_data:
            additional_df = pd.DataFrame(additional_data)
            utils.save_to_drive(additional_df, "data/raw/additional_qa.csv")
            print(f"   ✅ Additional QA dataset: {len(additional_df)} samples")
            additional_count = len(additional_df)

    except Exception as e:
        print(f"   ❌ Error creating additional data: {e}")

    # Report real counts, and only for files written during this call.
    print("\n🎯 Dataset preparation complete!")
    print("Available datasets:")
    print(f"- knowledge_dataset.csv: {len(knowledge_df)} comprehensive QA pairs")
    if additional_count is not None:
        print(f"- additional_qa.csv: {additional_count} topic-specific QA pairs")
    if squad_count is not None:
        print(f"- squad.csv: {squad_count} SQuAD v1.1 QA pairs")
    print("\nYou now have enough data to proceed with your RAG system!")

# Run the download.
# NOTE(review): this call uses the definition from this cell, which replaces
# the same-named function defined in the earlier download cell.
download_and_prepare_datasets()

📥 Downloading datasets...
1. SQuAD dataset (manual download)...
   Downloading SQuAD train data...


  0%|          | 2/442 [00:00<00:00, 1504.95it/s]

✅ Saved to: /content/drive/MyDrive/RAG_Research/data/raw/squad.csv
   ✅ SQuAD: 1000 samples
2. Creating expanded knowledge dataset...
✅ Saved to: /content/drive/MyDrive/RAG_Research/data/raw/knowledge_dataset.csv
   ✅ Knowledge dataset: 20 samples
3. Attempting to get more QA data...
✅ Saved to: /content/drive/MyDrive/RAG_Research/data/raw/additional_qa.csv
   ✅ Additional QA dataset: 9 samples

🎯 Dataset preparation complete!
Available datasets:
- knowledge_dataset.csv: 20 comprehensive QA pairs
- additional_qa.csv: 9 topic-specific QA pairs
- squad.csv: Downloaded SQuAD data (if successful)

You now have enough data to proceed with your RAG system!





# CELL 3: Text Chunking Functions

In [None]:
import re
from typing import List

class TextChunker:
    """Word-based text chunker with overlap, sized for Colab memory constraints.

    Text is split on whitespace into words; each chunk holds up to
    ``chunk_size`` words and consecutive chunks share ``overlap`` words.
    Chunks whose joined text is not longer than ``min_chunk_chars``
    characters are dropped, so a very short document produces no chunks.

    ``process_dataset`` relies on ``pd`` and ``tqdm`` being imported elsewhere
    in the notebook; this class does not import them itself.
    """

    # Column order of the frame returned by process_dataset.
    _OUTPUT_COLUMNS = ['original_id', 'chunk_id', 'question', 'chunk_text', 'answer']

    def __init__(self, chunk_size=512, overlap=50, min_chunk_chars=50):
        """Store and validate the chunking parameters.

        Args:
            chunk_size: Maximum number of words per chunk (must be positive).
            overlap: Number of words shared between consecutive chunks
                (must satisfy ``0 <= overlap < chunk_size``).
            min_chunk_chars: Chunks with at most this many characters are
                discarded.

        Raises:
            ValueError: If the sizes would make the chunk window stand still,
                move backwards, or skip words.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.min_chunk_chars = min_chunk_chars

    def clean_text(self, text: str) -> str:
        """Strip HTML tags and unusual characters, then normalise whitespace.

        Missing values (``None`` or a float NaN, e.g. an empty CSV cell)
        yield an empty string; other non-string values are converted with
        ``str()`` first.
        """
        if text is None or (isinstance(text, float) and text != text):
            return ""
        if not isinstance(text, str):
            text = str(text)

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', ' ', text)
        # Remove special characters (keep basic punctuation)
        text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
        # Collapse whitespace last: both substitutions above insert spaces,
        # so doing this earlier would leave runs of spaces behind.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping word chunks.

        Returns:
            The chunks in document order. The list is empty for empty text
            or when every chunk is at or below the minimum length.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` were changed after
                construction so that the window no longer advances.
        """
        step = self.chunk_size - self.overlap
        if step <= 0:
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

        words = text.split()
        chunks = []

        for start in range(0, len(words), step):
            chunk = ' '.join(words[start:start + self.chunk_size])
            if len(chunk) > self.min_chunk_chars:  # Minimum chunk size
                chunks.append(chunk)
            # This window already reached the last word; any further window
            # would only repeat words that this chunk contains.
            if start + self.chunk_size >= len(words):
                break

        return chunks

    def process_dataset(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and chunk the ``context`` of every row.

        Args:
            df: Frame with ``question``, ``context`` and ``answer`` columns.

        Returns:
            One row per chunk with columns ``original_id`` (the source row
            index), ``chunk_id`` (``"<row index>_<chunk number>"``),
            ``question``, ``chunk_text`` and ``answer``. The columns are
            present even when no chunk was produced.
        """
        processed_data = []

        for idx, row in tqdm(df.iterrows(), total=len(df)):
            clean_context = self.clean_text(row['context'])

            for chunk_idx, chunk in enumerate(self.chunk_text(clean_context)):
                processed_data.append({
                    'original_id': idx,
                    'chunk_id': f"{idx}_{chunk_idx}",
                    'question': row['question'],
                    'chunk_text': chunk,
                    'answer': row['answer']
                })

        return pd.DataFrame(processed_data, columns=self._OUTPUT_COLUMNS)

# Test chunking
chunker = TextChunker(chunk_size=256, overlap=25)

# Process test dataset
test_df = utils.load_from_drive("data/raw/test_dataset.csv")
# NOTE(review): load_from_drive appears to return None for a missing file (the
# SQuAD processing cell checks for it) — fail with a clear message here rather
# than with an AttributeError inside process_dataset.
if test_df is None:
    raise FileNotFoundError(
        "data/raw/test_dataset.csv could not be loaded; run the dataset download cell first"
    )
processed_test = chunker.process_dataset(test_df)
utils.save_to_drive(processed_test, "data/processed/test_chunks.csv")

print(f"✅ Processed test dataset: {len(processed_test)} chunks from {len(test_df)} documents")


100%|██████████| 8/8 [00:00<00:00, 5129.08it/s]

✅ Saved to: /content/drive/MyDrive/RAG_Research/data/processed/test_chunks.csv
✅ Processed test dataset: 8 chunks from 8 documents





In [None]:
# NOTE(review): the setup cell at the top of the notebook already imports
# `drive` and mounts /content/drive; the recorded output of this cell reports
# the drive as already mounted, so this cell looks redundant.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# CELL 4: Process All Datasets

In [None]:
# Chunk the raw SQuAD sample when it is available; any failure is reported
# and the cell carries on.
try:
    squad_df = utils.load_from_drive("data/raw/squad.csv")
    if squad_df is None:
        print("⚠️ SQuAD dataset not found, skipping...")
    else:
        print("Processing SQuAD dataset...")
        # Only the first 200 rows are chunked, to limit memory use.
        processed_squad = chunker.process_dataset(squad_df.head(200))
        utils.save_to_drive(processed_squad, "data/processed/squad_chunks.csv")
        print(f"✅ Processed SQuAD: {len(processed_squad)} chunks")
except Exception as err:
    print(f"❌ Error processing SQuAD: {err}")

# Clear memory
utils.clear_gpu_memory()

# Closing summary, one entry per output line.
print(
    "\n🎉 Data preparation complete!",
    "📁 Available datasets:",
    "   - test_chunks.csv (small test set)",
    "   - squad_chunks.csv (if processed)",
    sep="\n",
)

Processing SQuAD dataset...


100%|██████████| 200/200 [00:00<00:00, 4498.11it/s]

✅ Saved to: /content/drive/MyDrive/RAG_Research/data/processed/squad_chunks.csv
✅ Processed SQuAD: 215 chunks

🎉 Data preparation complete!
📁 Available datasets:
   - test_chunks.csv (small test set)
   - squad_chunks.csv (if processed)



