# GSM8K Chain-of-Thought Generation (Colab Optimized)

This notebook generates chain-of-thought reasoning for GSM8K math problems using language models.
**Optimized for Google Colab with GPU acceleration.**

⚠️ **Important**: Make sure to enable GPU runtime in Colab:
- Runtime → Change runtime type → Hardware accelerator → **GPU (T4)**

## 0. Colab Environment Setup (Run First!)

In [None]:
# Install required packages in Colab
!pip install -q transformers datasets accelerate bitsandbytes

# Import libraries
import os
import json
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
import gc
import time
import psutil

print("üì¶ Packages installed successfully!")

## 1. Verify Colab GPU Setup

In [None]:
# Configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"  # Hugging Face model id passed to from_pretrained
OUTPUT_DIR = "./cot_output"                        # directory that receives one JSON file per problem
MAX_PROBLEMS = 200                                 # upper bound on the number of problems processed

# Critical: Verify GPU is available
print("üîç Checking Colab environment...")
print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    DEVICE = "cuda"
    # mem_get_info() returns (free_bytes, total_bytes) as reported by the CUDA
    # driver. memory_reserved() would only report what PyTorch's caching
    # allocator currently holds, which is not the amount of free memory.
    free_bytes = torch.cuda.mem_get_info(0)[0]
    print(f"‚úÖ GPU Available: {torch.cuda.get_device_name()}")
    print(f"   CUDA version: {torch.version.cuda}")
    print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    print(f"   Memory free: {free_bytes / 1024**3:.1f} GB")
else:
    DEVICE = "cpu"
    print("‚ùå No GPU detected! This will be VERY slow (2+ minutes per sample)")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí Hardware accelerator ‚Üí GPU")

# Check system memory
ram_gb = psutil.virtual_memory().total / 1024**3
print(f"üíæ System RAM: {ram_gb:.1f} GB")

# Create output directory (no error if it already exists, so the cell can be re-run)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# HuggingFace token (optional for public models); None when the variable is unset
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    print("‚úÖ HF_TOKEN found")
else:
    print("‚ÑπÔ∏è  No HF_TOKEN (OK for public models)")

## 2. Load and Explore GSM8K Dataset

In [None]:
# Fetch the "main" configuration of GSM8K and keep a handle on its train split
print("üìä Loading GSM8K dataset...")
dataset = load_dataset("gsm8k", "main")
train_data = dataset["train"]

# Report how many problems exist and how many this run will touch
print(
    f"Total problems in GSM8K train: {len(train_data)}",
    f"Will process: {min(MAX_PROBLEMS, len(train_data))} problems",
    sep="\n",
)

# Display the first record so the question/answer format is visible
sample = train_data[0]
print(
    "\n=== Sample Problem ===",
    f"Question: {sample['question']}",
    f"Answer: {sample['answer']}",
    sep="\n",
)

In [None]:
## 3. Load Model (Colab Optimized)

# Imported here because only this cell needs it; it comes from the same
# package as the AutoTokenizer / AutoModelForCausalLM loaders used below.
from transformers import BitsAndBytesConfig

# Clear any existing cache. Collect unreachable Python objects first so the
# CUDA blocks they were holding can actually be released by empty_cache().
gc.collect()
if DEVICE == "cuda":
    torch.cuda.empty_cache()

print("üîß Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

    # The generation code tokenizes with padding=True, which needs a pad
    # token; fall back to the EOS token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print("‚úÖ Set pad_token to eos_token")

    print(f"‚úÖ Tokenizer loaded: {tokenizer.__class__.__name__}")

except Exception as e:
    print(f"‚ùå Error loading tokenizer: {e}")
    raise

print("üöÄ Loading model with Colab optimizations...")
start_time = time.time()

try:
    if DEVICE == "cuda":
        # Colab GPU optimizations
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,      # half precision for the non-quantized modules
            device_map="auto",              # Let transformers handle device placement
            # 8-bit weights via bitsandbytes. Passed through quantization_config
            # because the bare load_in_8bit=... keyword is deprecated.
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            trust_remote_code=True,
            token=HF_TOKEN,
        )
    else:
        # CPU fallback: full precision, no quantization
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
            token=HF_TOKEN,
        ).to(DEVICE)

    model.eval()
    load_time = time.time() - start_time

    print(f"‚úÖ Model loaded in {load_time:.1f}s")
    print(f"Model device: {next(model.parameters()).device}")

    # Memory usage
    if DEVICE == "cuda":
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"GPU memory allocated: {allocated:.2f} GB")
        print(f"GPU memory reserved: {reserved:.2f} GB")

        # Expected performance
        print("\n‚ö° Expected performance on GPU: 15-30 seconds per sample")
    else:
        print("\nüêå CPU mode: 2+ minutes per sample (very slow!)")

except Exception as e:
    # Print guidance, then re-raise so the cell still fails visibly
    print(f"‚ùå Error loading model: {e}")
    print("\nTroubleshooting:")
    print("1. Make sure GPU runtime is enabled")
    print("2. Try restarting runtime if out of memory")
    print("3. Consider using a smaller model")
    raise

In [None]:
def generate_cot(problem, max_retries=3):
    """Generate chain-of-thought reasoning for a math problem.

    Wraps ``problem`` in a fixed instruction prompt, samples a completion from
    the module-level ``model`` using the module-level ``tokenizer``, and
    returns only the newly generated text (the echoed prompt is removed).

    NOTE: a second ``generate_cot`` defined further down in this file rebinds
    the name, so this version is not the one later code ends up calling.

    Args:
        problem: Problem statement inserted into the prompt.
        max_retries: Number of generation attempts before giving up.

    Returns:
        The decoded completion, or ``None`` if ``max_retries`` is less than 1
        (the retry loop then never runs).

    Raises:
        Exception: Whatever the last failed attempt raised.
    """

    prompt = f"""You are a math reasoning assistant. 
Solve the following problem using step-by-step chain-of-thought reasoning and give the final answer at the end.

Problem:
{problem}

Answer with:
<reasoning>
...step-by-step reasoning...
</reasoning>
<final>
...final numeric answer...
</final>
"""

    for attempt in range(max_retries):
        try:
            # Clear cache before each generation
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            gc.collect()

            # Tokenize input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024
            ).to(DEVICE)
            prompt_len = inputs["input_ids"].shape[1]

            # Generate
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=True,
                    temperature=0.4,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode only the tokens that follow the prompt. For a causal LM the
            # output sequence starts with the input ids, so decoding all of it
            # would return the prompt (including its template <reasoning>/<final>
            # tags) in front of the model's answer.
            text = tokenizer.decode(
                outputs[0][prompt_len:], skip_special_tokens=True
            ).strip()

            return text

        except Exception as e:
            print(f"‚ùå Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("Retrying in 2 seconds...")
                time.sleep(2)
            else:
                print("All attempts failed for problem")
                # Bare raise keeps the original traceback intact
                raise

    return None

print("‚úÖ CoT generation function defined")

def generate_cot(problem, max_retries=3):
    """Generate chain-of-thought reasoning with Colab optimizations.

    Wraps ``problem`` in a fixed instruction prompt, samples a completion from
    the module-level ``model`` using the module-level ``tokenizer``, and
    returns only the newly generated text (the echoed prompt is removed).

    Args:
        problem: Problem statement inserted into the prompt.
        max_retries: Number of generation attempts before giving up.

    Returns:
        The decoded completion, or ``None`` if ``max_retries`` is less than 1
        (the retry loop then never runs).

    Raises:
        Exception: Whatever the last failed attempt raised.
    """

    prompt = f"""You are a math reasoning assistant. 
Solve the following problem using step-by-step chain-of-thought reasoning and give the final answer at the end.

Problem:
{problem}

Answer with:
<reasoning>
...step-by-step reasoning...
</reasoning>
<final>
...final numeric answer...
</final>
"""

    for attempt in range(max_retries):
        try:
            # Colab memory management: collect garbage first so the memory it
            # frees can be handed back by empty_cache()
            gc.collect()
            if DEVICE == "cuda":
                torch.cuda.empty_cache()

            # Tokenization (prompt truncated to at most 1024 tokens)
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=1024
            )

            # Move to device (handle device_map="auto")
            if hasattr(model, 'device'):
                device = model.device
            else:
                device = next(model.parameters()).device

            inputs = {k: v.to(device) for k, v in inputs.items()}
            # Remember the prompt length now; inputs is deleted before returning
            prompt_len = inputs["input_ids"].shape[1]

            # Generation settings for Colab
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=300,         # Reduced for speed
                    do_sample=True,
                    temperature=0.7,            # Sampling randomness (does not affect speed)
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id,
                    use_cache=True,             # Enable KV cache
                    repetition_penalty=1.1      # Prevent repetition
                )

            # Decode only the tokens that follow the prompt. For a causal LM the
            # output sequence starts with the input ids, so decoding all of it
            # would return the prompt (including its template <reasoning>/<final>
            # tags) in front of the model's answer.
            text = tokenizer.decode(
                outputs[0][prompt_len:], skip_special_tokens=True
            ).strip()

            # Clean up tensors
            del inputs, outputs
            if DEVICE == "cuda":
                torch.cuda.empty_cache()

            return text

        except Exception as e:
            print(f"‚ùå Attempt {attempt + 1} failed: {e}")
            # Clean up on error
            gc.collect()
            if DEVICE == "cuda":
                torch.cuda.empty_cache()

            if attempt < max_retries - 1:
                print("Retrying in 2 seconds...")
                time.sleep(2)
            else:
                print("All attempts failed for problem")
                # Bare raise keeps the original traceback intact
                raise

    return None

print("‚úÖ Colab-optimized CoT generation function defined")

In [None]:
# Smoke test: run the generator once on the first training question
print("Testing with sample problem...")
test_problem = train_data[0]['question']

print(f"Problem: {test_problem}", "\nGenerating CoT...", sep="\n")

start_time = time.time()
try:
    result = generate_cot(test_problem)
    end_time = time.time()

    # Report wall-clock time followed by the raw generated text
    print(
        f"‚úÖ Generation successful! Time: {end_time - start_time:.2f}s",
        "\n=== Generated CoT ===",
        result,
        sep="\n",
    )

    # On GPU, also show how much memory is still allocated afterwards
    if DEVICE == "cuda":
        print(f"\nGPU memory after generation: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

except Exception as exc:
    # Report the failure without aborting the notebook
    print(f"‚ùå Test failed: {exc}")
    print("This is likely where your mutex error occurs.")

# Performance test with timing: times one generate_cot() call and
# extrapolates the cost of processing MAX_PROBLEMS problems.
print("üß™ Performance Test - Single Example")
test_problem = train_data[0]['question']

print(f"Problem: {test_problem}")
print(f"Expected answer: {train_data[0]['answer']}")
print("\n‚è±Ô∏è  Generating CoT...")

try:
    # Warm up GPU (first generation is always slower). Tokenizing alone runs
    # on the CPU, so a one-token generate() is issued to actually exercise the
    # GPU before the timed run. Kept inside the try block so a failure here
    # also reaches the troubleshooting hints below.
    if DEVICE == "cuda":
        print("üî• GPU warmup...")
        warmup_inputs = tokenizer("Test", return_tensors="pt").to(model.device)
        with torch.no_grad():
            model.generate(
                **warmup_inputs,
                max_new_tokens=1,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Wait for the queued GPU work so it is not counted in the timing
        torch.cuda.synchronize()
        del warmup_inputs

    start_time = time.time()
    result = generate_cot(test_problem)
    end_time = time.time()
    generation_time = end_time - start_time

    print("‚úÖ Generation successful!")
    print(f"‚è±Ô∏è  Time: {generation_time:.1f} seconds")

    # Performance assessment (thresholds in seconds)
    if generation_time < 30:
        print("üöÄ Excellent! Good GPU performance")
    elif generation_time < 60:
        print("‚ö° Good performance")
    elif generation_time < 120:
        print("üêå Slow - check if GPU is being used")
    else:
        print("üö® Very slow! Likely running on CPU")
        print("   Check GPU runtime settings")

    print("\n=== Generated CoT (first 500 chars) ===")
    # Parenthesized for readability; an ellipsis is added only when truncating
    print((result[:500] + "...") if len(result) > 500 else result)

    # Memory info
    if DEVICE == "cuda":
        allocated = torch.cuda.memory_allocated() / 1024**3
        print(f"\nüíæ GPU memory after test: {allocated:.2f} GB")

        # Estimate time for full dataset: linear extrapolation from this one sample
        total_time_hours = (generation_time * MAX_PROBLEMS) / 3600
        print(f"üìä Estimated time for {MAX_PROBLEMS} problems: {total_time_hours:.1f} hours")

except Exception as e:
    print(f"‚ùå Test failed: {e}")
    print("\nüîß Troubleshooting:")
    print("1. Restart Colab runtime")
    print("2. Verify GPU is enabled")
    print("3. Check if model is too large for available memory")

In [None]:
# Process problems one at a time, writing one JSON file per problem so an
# interrupted run can be resumed by re-running this cell.
results = []   # records generated during this run
errors = []    # {"id", "problem", "error"} for problems that failed
skipped = 0    # problems whose output file already existed

# Determine range to process
num_to_process = min(MAX_PROBLEMS, len(train_data))

print(f"Starting batch processing of {num_to_process} problems...")

for i in tqdm(range(num_to_process), desc="Generating CoTs"):
    item = train_data[i]
    problem_text = item['question']
    gold_answer = item['answer']

    # Check if already processed
    output_file = f"{OUTPUT_DIR}/gsm8k_{i}.json"
    if os.path.exists(output_file):
        print(f"Skipping {i} (already exists)")
        skipped += 1
        continue

    try:
        # Generate CoT
        generated_cot = generate_cot(problem_text)

        # Save individual result
        result = {
            "id": i,
            "problem": problem_text,
            "gold_answer": gold_answer,
            "generated_cot": generated_cot
        }

        # Write to a temporary file and rename it into place. If the runtime is
        # interrupted mid-write, no truncated .json file is left behind for the
        # exists-check above to mistake for a finished result.
        tmp_file = f"{output_file}.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        os.replace(tmp_file, output_file)

        results.append(result)

        # Print progress every 10 problems
        if (i + 1) % 10 == 0:
            print(f"Completed {i + 1}/{num_to_process} problems")
            if DEVICE == "cuda":
                print(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

    except Exception as e:
        # Record the failure and move on so one bad problem does not stop the run
        error_info = {
            "id": i,
            "problem": problem_text,
            "error": str(e)
        }
        errors.append(error_info)
        print(f"‚ùå Error on problem {i}: {e}")
        continue

    # Small delay to prevent overwhelming the system
    time.sleep(0.1)

print(f"\n‚úÖ Processing complete!")
print(f"Successful: {len(results)}")
print(f"Errors: {len(errors)}")
print(f"Skipped (already on disk): {skipped}")

## 7. Analyze Results

In [None]:
# Load all generated results
all_results = []

# Collect all JSON files from output directory. A file that cannot be parsed
# (e.g. truncated by an interrupted run) is reported and skipped instead of
# aborting the whole analysis.
for filename in sorted(os.listdir(OUTPUT_DIR)):
    if not filename.endswith('.json'):
        continue
    try:
        with open(os.path.join(OUTPUT_DIR, filename), 'r', encoding="utf-8") as f:
            all_results.append(json.load(f))
    except json.JSONDecodeError as e:
        print(f"Skipping unreadable file {filename}: {e}")

# os.listdir() returns entries in arbitrary order; sort by problem id so the
# samples shown below and any later export are reproducible.
all_results.sort(key=lambda record: record["id"])

print(f"Found {len(all_results)} generated CoTs")

if all_results:
    # Convert to DataFrame for analysis
    df = pd.DataFrame(all_results)

    print("\n=== Dataset Summary ===")
    print(f"Total problems: {len(df)}")

    # Analyze CoT lengths (in characters)
    df['cot_length'] = df['generated_cot'].apply(len)
    print(f"Average CoT length: {df['cot_length'].mean():.0f} characters")
    print(f"Min CoT length: {df['cot_length'].min()}")
    print(f"Max CoT length: {df['cot_length'].max()}")

    # Show sample results (at most three, truncated for display)
    print("\n=== Sample Results ===")
    for i in range(min(3, len(df))):
        row = df.iloc[i]
        print(f"\nProblem {row['id']}:")
        print(f"Question: {row['problem'][:100]}...")
        print(f"Generated CoT: {row['generated_cot'][:200]}...")
else:
    print("No results found. Check for errors in generation.")

## 8. Export for FCM Training

In [None]:
# Write the collected results as JSON Lines plus a small run summary
if not all_results:
    print("No results to export")
else:
    output_jsonl = "data_processed/gsm8k_cots_notebook.jsonl"
    os.makedirs("data_processed", exist_ok=True)

    # One JSON object per line
    with open(output_jsonl, 'w') as jsonl_handle:
        for result in all_results:
            jsonl_handle.write(f"{json.dumps(result)}\n")

    print(f"‚úÖ Exported {len(all_results)} results to {output_jsonl}")

    # Summary statistics; the df / errors lookups fall back to 0 when those
    # names are not present in the current namespace
    summary = dict(
        total_problems=len(all_results),
        generation_model=MODEL_NAME,
        device_used=DEVICE,
        average_cot_length=df['cot_length'].mean() if 'df' in locals() else 0,
        errors_encountered=len(errors) if 'errors' in locals() else 0,
    )

    with open("data_processed/generation_summary.json", 'w') as summary_handle:
        json.dump(summary, summary_handle, indent=2)

    print(f"‚úÖ Summary saved to data_processed/generation_summary.json")

## 9. Cleanup

In [None]:
# Clean up memory: drop the references to the model and tokenizer, if defined
if 'model' in locals():
    del model
if 'tokenizer' in locals():
    del tokenizer

# Run the garbage collector before emptying the CUDA cache: empty_cache() can
# only release blocks that are no longer held by any live tensor, so objects
# kept alive by reference cycles have to be collected first.
gc.collect()

if DEVICE == "cuda":
    torch.cuda.empty_cache()

print("‚úÖ Memory cleanup complete")
if DEVICE == "cuda":
    print(f"GPU memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

## Next Steps

After running this notebook successfully:

1. **Check Results**: Verify the generated CoTs in `data_processed/gsm8k_cots_notebook.jsonl`
2. **Data Pipeline**: Continue with answer extraction and faithfulness labeling
3. **FCM Training**: Use the generated data for Faithfulness Classification Model training

If you encounter the mutex error:
- Check which cell it occurs in
- Try running cells individually
- Consider using a smaller model or CPU-only mode
- Monitor memory usage throughout the process