# CPU Inference on SageMaker

Run inference with the trained GL RL Model on CPU instances.

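**Instance**: ml.t2.medium (CPU, 4 GiB RAM) - $0.05/hour. Note: the 1.5B-parameter model needs roughly 6 GB for its weights in float32, so switch to a larger instance (e.g. ml.m5.xlarge) if loading runs out of memory.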
**Use Cases**: Testing, batch processing, API endpoints

## Setup

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
import time

# Pick the accelerator when one is visible to PyTorch, otherwise stay on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

## Option 1: Load Pre-trained Model (No Fine-tuning)

In [None]:
# Hugging Face identifier of the base checkpoint used throughout the notebook
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

print(f"Loading model: {model_name}")
print("This may take a few minutes on CPU...")

# Tokenizer configured to pad on the left
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')

# Weights are loaded in float32 and placed on the CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    device_map="cpu",
)

print("✅ Model loaded successfully!")

# Total number of scalar parameters, reported in billions
parameter_count = sum(weight.numel() for weight in model.parameters())
print(f"Model size: {parameter_count / 1e9:.2f}B parameters")

## Option 2: Load Fine-tuned Model from S3

In [None]:
def load_finetuned_model(s3_model_path=None, local_model_path=None):
    """Load a LoRA fine-tuned model from S3 or from a local directory.

    When ``s3_model_path`` is given, the archive is downloaded with the AWS
    CLI to ``model.tar.gz`` and unpacked into ``./finetuned_model``; that
    directory then replaces ``local_model_path``. The weights found at the
    resulting path are applied on top of a freshly loaded base model
    (the module-level ``model_name``) and merged into it.

    Args:
        s3_model_path: S3 URI of a ``.tar.gz`` archive, or None to skip the
            download step.
        local_model_path: Directory passed to ``PeftModel.from_pretrained``.
            Overridden when ``s3_model_path`` is provided.

    Returns:
        The base model with the LoRA weights merged in, or None when neither
        path is provided.

    Raises:
        subprocess.CalledProcessError: If the ``aws`` or ``tar`` command
            exits with a non-zero status.
        FileNotFoundError: If the ``aws`` or ``tar`` executable is missing.
    """
    # Local imports keep this cell runnable without touching the setup cell.
    import os
    import subprocess

    if s3_model_path:
        print(f"Downloading model from S3: {s3_model_path}")
        extract_dir = "./finetuned_model"
        # `tar -C` refuses to run if the target directory does not exist yet.
        os.makedirs(extract_dir, exist_ok=True)
        # Argument lists (no shell) keep unusual characters in the path
        # intact, and check=True stops here on failure instead of letting a
        # missing download surface later as a confusing model-loading error.
        subprocess.run(
            ["aws", "s3", "cp", s3_model_path, "model.tar.gz"], check=True
        )
        subprocess.run(
            ["tar", "-xzf", "model.tar.gz", "-C", extract_dir], check=True
        )
        local_model_path = extract_dir

    if not local_model_path:
        return None

    print(f"Loading fine-tuned model from: {local_model_path}")

    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device_map="cpu"
    )

    # Apply the LoRA weights, then fold them into the base weights so the
    # returned object is a plain model without the adapter wrapper.
    model = PeftModel.from_pretrained(base_model, local_model_path)
    model = model.merge_and_unload()

    print("✅ Fine-tuned model loaded!")
    return model

# Example: Load from S3 (replace with your model path)
# finetuned_model = load_finetuned_model(s3_model_path="s3://your-bucket/output/model.tar.gz")

## SQL Generation Function

In [None]:
def generate_sql(query, context="", max_length=200, temperature=0.7):
    """Generate SQL from a natural language query.

    Builds a ChatML-style prompt from ``query`` and ``context``, samples a
    completion from the module-level ``model`` / ``tokenizer``, and returns
    only the text produced after the prompt.

    Args:
        query: Natural language question to translate into SQL.
        context: Schema description inserted into the system message.
        max_length: Forwarded to ``model.generate`` as ``max_length``. This
            limit counts prompt tokens plus generated tokens, so a long
            schema leaves less room for the answer.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        A ``(sql, generation_time)`` tuple: the stripped completion text and
        the wall-clock seconds spent inside ``model.generate``.
    """

    # Format prompt
    prompt = f"""<|im_start|>system
You are a SQL expert. Generate SQL queries based on natural language questions.
Schema: {context}<|im_end|>
<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant"""

    # Put the inputs on the device the model actually lives on. The model is
    # loaded with device_map="cpu", so following the global `device` (which
    # prefers CUDA when available) could place inputs on a different device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate
    start_time = time.time()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    generation_time = time.time() - start_time

    # The returned sequence starts with the prompt tokens. Slicing them off
    # is more reliable than splitting the decoded text on the word
    # "assistant", which breaks when the generated SQL contains that word.
    completion_ids = outputs[0][prompt_length:]
    sql = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    return sql, generation_time

print("✅ SQL generation function ready")

## Test Single Query

In [None]:
# Sample questions, each paired with the schema it should be answered against
test_queries = [
    {"query": "Show me all customers", "context": "customers(id, name, email, created_at)"},
    {"query": "Get total sales by product category", "context": "products(id, name, category, price), sales(id, product_id, quantity, date)"},
    {"query": "Find employees who joined last month", "context": "employees(id, name, department, hire_date, salary)"},
]

# Separator lines reused for every report section
heavy_rule = "=" * 60
light_rule = "-" * 60

# Run each sample through the generator and print a small report
for i, test in enumerate(test_queries, 1):
    print("\n" + heavy_rule)
    print(f"Query {i}: {test['query']}")
    print(f"Schema: {test['context']}")
    print(light_rule)

    sql, gen_time = generate_sql(test['query'], test['context'])

    print(f"Generated SQL: {sql}")
    print(f"Generation time: {gen_time:.2f} seconds")

## Batch Processing

In [None]:
def batch_process_queries(queries_file, output_file):
    """Generate SQL for every query in a JSON Lines file and save the results.

    Each non-blank line of ``queries_file`` is parsed as a JSON object whose
    optional ``query`` and ``context`` keys (both defaulting to ``''``) are
    passed to ``generate_sql``. One JSON object per input query is written to
    ``output_file`` with the keys ``query``, ``context``, ``generated_sql``
    and ``generation_time``. A summary is printed at the end.

    Args:
        queries_file: Path to the input ``.jsonl`` file.
        output_file: Path of the ``.jsonl`` file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If either file cannot be opened.
    """

    # Load queries; blank lines (e.g. a trailing newline at end of file) are
    # skipped because json.loads would reject them.
    with open(queries_file, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f if line.strip()]

    print(f"Processing {len(queries)} queries...")

    results = []
    total_time = 0
    for i, item in enumerate(queries, 1):
        query = item.get('query', '')
        context = item.get('context', '')

        # Generate SQL
        sql, gen_time = generate_sql(query, context)
        total_time += gen_time

        results.append({
            'query': query,
            'context': context,
            'generated_sql': sql,
            'generation_time': gen_time
        })

        # Periodic progress report for long batches
        if i % 10 == 0:
            print(f"Processed {i}/{len(queries)} queries...")

    # Save results, one JSON object per line
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(result) + '\n' for result in results)

    # An empty input file must not crash the summary with a division by zero.
    average_time = total_time / len(queries) if queries else 0.0

    print(f"\n✅ Batch processing complete!")
    print(f"Total queries: {len(queries)}")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Average time per query: {average_time:.2f} seconds")
    print(f"Results saved to: {output_file}")

# Example usage
# batch_process_queries('data/training/query_pairs.jsonl', 'inference_results.jsonl')

## Deploy as SageMaker Endpoint (Optional)

In [None]:
# Source of the inference script. It defines the four handler functions
# (model_fn, input_fn, predict_fn, output_fn) and is written to inference.py
# below.
inference_code = '''
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

def model_fn(model_dir):
    """Load the tokenizer and model used for inference.

    NOTE: model_dir is currently unused; the base model is always loaded by
    name from the Hugging Face Hub.
    """
    model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float32
    )
    
    return {"model": model, "tokenizer": tokenizer}

def input_fn(request_body, content_type):
    """Parse a JSON request body; reject any other content type."""
    if content_type == "application/json":
        input_data = json.loads(request_body)
        return input_data
    else:
        raise ValueError(f"Unsupported content type: {content_type}")

def predict_fn(input_data, model_dict):
    """Generate SQL for the "query" / "context" fields of the request."""
    model = model_dict["model"]
    tokenizer = model_dict["tokenizer"]
    
    query = input_data.get("query", "")
    context = input_data.get("context", "")
    
    prompt = f"""Generate SQL for: {query}
Schema: {context}
SQL:"""
    
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[1]
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=200)
    
    # The output sequence begins with the prompt tokens; decode only the
    # completion so the response holds the SQL rather than prompt + SQL.
    sql = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    
    return {"sql": sql}

def output_fn(prediction, content_type):
    """Serialize the prediction as JSON; reject any other content type."""
    if content_type == "application/json":
        return json.dumps(prediction)
    else:
        raise ValueError(f"Unsupported content type: {content_type}")
'''

# Save inference script (explicit encoding so the result does not depend on
# the platform's default locale)
with open('inference.py', 'w', encoding='utf-8') as f:
    f.write(inference_code)

print("✅ Created inference.py for SageMaker endpoint deployment")
print("To deploy as endpoint, use SageMaker Model and Endpoint configuration")

## Performance Optimization Tips

### For CPU Inference:

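1. **Use quantization** to reduce model size and speed up inference (note: `bitsandbytes` 8-bit loading typically requires a CUDA GPU; on CPU-only instances consider PyTorch dynamic quantization via `torch.quantization.quantize_dynamic` instead):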
```python
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
```

2. **Enable torch.compile** (PyTorch 2.0+):
```python
model = torch.compile(model)
```

3. **Batch processing** for multiple queries

4. **Cache model** in memory between requests

5. **Use smaller models** when possible (0.5B or 1B parameter models)

### Inference Speed on ml.t2.medium:
- Single query: ~2-5 seconds
- Batch of 10: ~15-30 seconds
- Throughput: ~20-30 queries/minute

### For Production:
Consider using:
- **ml.m5.xlarge**: Better CPU performance
- **ml.c5.xlarge**: Compute-optimized
- **SageMaker Endpoints**: Auto-scaling
- **AWS Lambda**: Serverless inference