# GL RL Model - SageMaker Setup Notebook

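This notebook installs the project's Python dependencies into the notebook kernel with `%pip`, verifies that they import, clones the `gl_rl_model` repository, creates sample training data, and prints an environment summary.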

**Important**: Run this notebook using the `Python 3` kernel.

## 1. Upgrade pip to ensure we get prebuilt wheels

In [None]:
# Upgrade pip first so the installs in the following cells can pick up
# prebuilt wheels (need pip >= 19.0).
# The %pip magic (rather than !pip) targets the environment of the running kernel.
%pip install --upgrade pip

## 2. Install PyTorch (CPU version for ml.t2.medium)

In [None]:
# Install the PyTorch packages from the CPU wheel index (download.pytorch.org/whl/cpu)
# instead of the default package index. The verification cell further down
# reports whether CUDA is available.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

## 3. Install core packages with prebuilt wheels

In [None]:
# Install numpy and pandas before pyarrow, which is installed in the next cell.
# NOTE(review): the original note called both "dependencies for pyarrow"; pandas
# looks like an optional integration rather than a hard requirement - confirm.
%pip install numpy pandas

In [None]:
# Install pyarrow; with the upgraded pip from step 1 this is expected to resolve
# to a prebuilt wheel rather than a source build.
%pip install pyarrow

In [None]:
# Install the Transformers ecosystem. transformers supplies the AutoTokenizer used
# in section 6; transformers, tokenizers, accelerate and datasets are all imported
# by the verification cell in section 4.
%pip install transformers tokenizers huggingface-hub accelerate datasets

In [None]:
# Install the fine-tuning libraries (peft and trl); both are imported by the
# verification cell in section 4.
%pip install peft trl

In [None]:
# Install additional utilities. Only sentencepiece is imported directly by the
# verification cell in section 4; the others are presumably needed by the
# libraries above or by the project code - confirm before trimming this list.
%pip install sentencepiece protobuf tqdm fsspec aiohttp

## 4. Verify all installations

In [None]:
# Verify the environment: report the interpreter, import every package installed
# above, and print each package's version.
import os
import sys

print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(f"\n{'=' * 50}\n")

# The imports stay as plain statements (not importlib calls) so the names remain
# bound in the notebook namespace; later cells use `torch` and `os` directly.
import torch
import transformers
import datasets
import peft
import trl
import pyarrow
import pandas as pd
import numpy as np
import sentencepiece
import tokenizers
import accelerate

print("✅ All packages imported successfully!\n")

# (display label, module) pairs in the order the versions are reported
installed_packages = (
    ("PyTorch", torch),
    ("Transformers", transformers),
    ("Datasets", datasets),
    ("PEFT", peft),
    ("TRL", trl),
    ("PyArrow", pyarrow),
    ("Pandas", pd),
    ("NumPy", np),
    ("Sentencepiece", sentencepiece),
    ("Tokenizers", tokenizers),
    ("Accelerate", accelerate),
)
for package_label, package_module in installed_packages:
    print(f"{package_label}: {package_module.__version__}")

device_label = 'CUDA' if torch.cuda.is_available() else 'CPU'
print(f"\nDevice: {device_label}")

# Check if we're in SageMaker: the presence of the resource metadata file is
# used as the indicator.
is_sagemaker = os.path.exists('/opt/ml/metadata/resource-metadata.json')
print(f"\nRunning in SageMaker: {is_sagemaker}")

## 5. Clone the repository and set up data

In [None]:
import os
import subprocess

REPO_URL = "https://github.com/maddinenisri/gl_rl_model.git"


def run_git(*args):
    """Run ``git <args>`` in the current directory and return its exit code.

    stdout and stderr are captured together and printed once the command has
    finished, so the output appears in the notebook cell. The exit code is
    returned instead of being ignored, letting the caller decide whether a
    failure is fatal.

    Raises:
        FileNotFoundError: if the ``git`` executable cannot be found.
    """
    completed = subprocess.run(
        ["git", *args],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    output = completed.stdout.strip()
    if output:
        print(output)
    return completed.returncode


sagemaker_dir = "/home/ec2-user/SageMaker"
repo_dir = "gl_rl_model"

# Navigate to SageMaker directory
if os.path.exists(sagemaker_dir):
    os.chdir(sagemaker_dir)
    print(f"Changed to directory: {os.getcwd()}")
elif os.path.basename(os.getcwd()) == repo_dir and os.path.exists(".git"):
    # Re-run guard for non-SageMaker machines: a previous run of this cell left
    # the working directory inside the checkout. Step back out so the existence
    # check below sees the checkout itself instead of cloning a nested copy or
    # descending into a same-named directory inside the repository.
    os.chdir(os.pardir)

# Clone or update the repository
if os.path.exists(repo_dir):
    print("Repository already exists, pulling latest changes...")
    os.chdir(repo_dir)
    # Updating is best-effort: the existing checkout is still usable if the
    # pull fails (no network, local changes, ...), so only warn.
    if run_git("pull", "origin", "main") != 0:
        print("⚠️ git pull failed; continuing with the existing checkout.")
else:
    print("Cloning repository...")
    # Nothing below can work without the checkout, so fail here with a clear
    # message instead of a FileNotFoundError from os.chdir.
    if run_git("clone", REPO_URL) != 0:
        raise RuntimeError(f"git clone of {REPO_URL} failed; see the output above.")
    os.chdir(repo_dir)

# The next cell uses paths relative to the repository root.
print(f"\nCurrent directory: {os.getcwd()}")

In [None]:
# Create training data directory and sample data
import json
import os  # imported here too so the cell does not depend on an earlier cell's import


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped, so a trailing
    newline or a blank separator line in an existing file does not raise
    ``json.JSONDecodeError``. The file is read as UTF-8.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


os.makedirs("data/training", exist_ok=True)

# Sample training data: natural-language query, target SQL, and a schema summary
sample_data = [
    {"query": "Show me all customers", "sql": "SELECT * FROM customers;", "context": "customers(id, name, email, created_at)"},
    {"query": "Get total sales by month", "sql": "SELECT DATE_FORMAT(date, '%Y-%m') as month, SUM(amount) as total FROM sales GROUP BY month;", "context": "sales(id, date, amount, product_id)"},
    {"query": "Find top 5 products by revenue", "sql": "SELECT p.name, SUM(s.amount) as revenue FROM products p JOIN sales s ON p.id = s.product_id GROUP BY p.id ORDER BY revenue DESC LIMIT 5;", "context": "products(id, name, price), sales(id, product_id, amount)"},
    {"query": "List users who registered today", "sql": "SELECT * FROM users WHERE DATE(created_at) = CURDATE();", "context": "users(id, name, email, created_at)"},
    {"query": "Calculate average order value", "sql": "SELECT AVG(total_amount) as avg_order_value FROM orders;", "context": "orders(id, customer_id, total_amount, order_date)"}
]

# Write sample data only when no data file exists yet, so an existing file
# (for example one that came with the repository) is never overwritten.
data_file = "data/training/query_pairs.jsonl"
if not os.path.exists(data_file):
    with open(data_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in sample_data)
    print(f"✅ Created sample training data: {data_file}")
else:
    print(f"✅ Training data already exists: {data_file}")

# Load and display the data
loaded_data = load_jsonl(data_file)

print(f"\nLoaded {len(loaded_data)} training examples:")
for i, example in enumerate(loaded_data[:2], 1):
    print(f"\nExample {i}:")
    print(f"  Query: {example['query']}")
    print(f"  SQL: {example['sql']}")

## 6. Test tokenizer and model loading

In [None]:
from transformers import AutoTokenizer

model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
print(f"Testing tokenizer for {model_name}...\n")

# Smoke test: download the tokenizer and round-trip a short SQL string.
# Failures are reported but do not abort the notebook, since the usual cause
# is a missing network connection.
try:
    # trust_remote_code is deliberately not enabled: it would allow Python code
    # shipped in the Hub repository to run on this instance.
    # NOTE(review): this tokenizer is expected to load with the tokenizer classes
    # built into the transformers release installed above - confirm on first run.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Test tokenization: encode to token ids, then decode back to text
    test_text = "SELECT * FROM users WHERE age > 25;"
    tokens = tokenizer.encode(test_text)
    decoded = tokenizer.decode(tokens)

    print("✅ Tokenizer loaded successfully!")
    print(f"\nTest text: '{test_text}'")
    print(f"Token count: {len(tokens)}")
    print(f"Decoded: '{decoded}'")
    print(f"\nTokenizer vocabulary size: {tokenizer.vocab_size}")

except Exception as e:
    print(f"⚠️ Error loading tokenizer: {e}")
    print("This may be due to network issues. You can retry later.")

## 7. Environment Summary

In [None]:
import json
import os  # re-imported so this cell does not depend on an earlier cell
import platform

import torch  # re-imported for the same reason; used for the GPU report below

try:
    # boto3 is not used in this cell and is not installed by this notebook.
    # The import is kept so the name stays available, but it must not break
    # the "running locally" path when the package is missing.
    import boto3
except ImportError:
    boto3 = None

METADATA_PATH = '/opt/ml/metadata/resource-metadata.json'


def region_from_metadata(metadata):
    """Return the AWS region described by a SageMaker resource metadata dict.

    Uses the ``Region`` key when it is present and non-empty. Otherwise falls
    back to the region field of ``ResourceArn``
    (``arn:partition:service:region:account-id:resource``).
    NOTE(review): notebook-instance metadata files appear to carry only
    ``ResourceArn``/``ResourceName`` - confirm on the target instance.

    Args:
        metadata: Mapping decoded from the metadata JSON file.

    Returns:
        The region name, or ``'Unknown'`` when it cannot be determined.
    """
    region = metadata.get('Region')
    if region:
        return region
    arn_fields = str(metadata.get('ResourceArn', '')).split(':')
    if len(arn_fields) >= 6 and arn_fields[0] == 'arn' and arn_fields[3]:
        return arn_fields[3]
    return 'Unknown'


print("="*60)
print("ENVIRONMENT SUMMARY")
print("="*60)

# System info
print("\n📊 System Information:")
print(f"  Platform: {platform.platform()}")
print(f"  Python: {platform.python_version()}")
print(f"  Processor: {platform.processor() or 'N/A'}")

# SageMaker info
if os.path.exists(METADATA_PATH):
    try:
        with open(METADATA_PATH, 'r') as f:
            metadata = json.load(f)
        # Resolve both values before printing so a malformed file cannot leave
        # a half-printed section behind.
        instance_type = metadata.get('InstanceType', 'Unknown')
        region = region_from_metadata(metadata)
        print("\n🚀 SageMaker Instance:")
        print(f"  Instance Type: {instance_type}")
        print(f"  Region: {region}")
    except Exception as exc:
        # Best-effort report: an unreadable or malformed metadata file must not
        # abort the summary. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        print("\n🚀 SageMaker: Running (details unavailable)")
        print(f"  Reason: {type(exc).__name__}: {exc}")
else:
    print("\n💻 Running locally (not in SageMaker)")

# GPU info
print("\n🎮 GPU Status:")
if torch.cuda.is_available():
    print("  CUDA Available: Yes")
    print(f"  GPU Count: {torch.cuda.device_count()}")
    print(f"  GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("  CUDA Available: No (CPU only)")
    print("  Note: This is expected on ml.t2.medium instances")
    print("  For GPU training, use SageMaker Training Jobs with ml.g5.xlarge")

print("\n✅ Setup Status: COMPLETE")
print("="*60)

print("\n📚 Next Steps:")
print("1. Open GL_RL_Model_Quick_Start.ipynb for the full training pipeline")
print("2. Use this notebook instance for development and testing")
print("3. For production training, launch GPU jobs with SageMaker Training")
print("\n💡 Tip: To save costs, remember to stop this notebook instance when not in use!")

## ✅ Setup Complete!

All dependencies have been installed using SageMaker best practices.

### Important Notes:

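1. **Persistence**: Only files under `/home/ec2-user/SageMaker` (the cloned repository and its data) persist when the notebook instance is stopped and restarted. Packages installed with `%pip` go into the kernel's environment, outside that directory, so you may need to re-run sections 1–3 after a restart (or install them from a lifecycle configuration script).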
2. **Kernel**: Always use the `Python 3` kernel for consistency
3. **GPU Training**: Use SageMaker Training Jobs with `ml.g5.xlarge` spot instances
4. **Cost Optimization**: Stop the notebook instance when not in use

### To reinstall packages (if needed):

```python
%pip install <package_name>
```

### For conda packages (alternative):

```python
%conda install -c conda-forge <package_name>
```