In [6]:
# This script should be run in a SageMaker Studio Notebook cell.
# FINAL VERSION: This script programmatically creates a robust training script
# using the industry-standard TRL library's SFTTrainer, which solves data collation issues.
# It also securely fetches the Hugging Face token and launches the training job.

import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import os
import json
import time

# --- Step 1: Define the Training Script Content using TRL's SFTTrainer ---
# This is a more robust approach that handles data processing complexities automatically.

script_content = """
# Final Corrected Script for Fine-Tuning using TRL SFTTrainer
import argparse, os, torch
from datasets import load_dataset
from peft import LoraConfig
# *** THE FIX IS HERE: Added TrainingArguments to the import statement ***
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer # Using the Supervised Fine-tuning Trainer from TRL

def formatting_prompts_func(example):
    # This function now returns a list of formatted strings.
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Instruction: {example['instruction'][i]}\\n### Output: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

def main(args):
    hf_token = os.getenv("HF_TOKEN")

    # Load dataset
    raw_dataset = load_dataset("json", data_files=os.path.join(args.dataset_path, "instructions.jsonl"), split="train")

    # Model and tokenizer loading
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
    model = AutoModelForCausalLM.from_pretrained(args.model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=True, token=hf_token)
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, trust_remote_code=True, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # LoRA configuration
    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")

    # SFTTrainer handles all the complexities of data collation, padding, and labels.
    trainer = SFTTrainer(
        model=model,
        train_dataset=raw_dataset,
        peft_config=lora_config,
        formatting_func=formatting_prompts_func, # Pass our formatting function
        max_seq_length=512,
        tokenizer=tokenizer,
        args=TrainingArguments(
            output_dir=args.output_dir,
            per_device_train_batch_size=args.per_device_train_batch_size,
            gradient_accumulation_steps=4,
            learning_rate=args.learning_rate,
            num_train_epochs=args.epochs,
            logging_steps=10,
            save_strategy="epoch",
            fp16=True,
        ),
    )

    print("Starting fine-tuning with SFTTrainer...")
    trainer.train()
    print("🎉 Fine-tuning complete!")
    
    print(f"Saving final LoRA adapters to {args.output_dir}")
    trainer.save_model(args.output_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--dataset_path", type=str)
    parser.add_argument("--epochs", type=int)
    parser.add_argument("--per_device_train_batch_size", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--output_dir", type=str, default="/opt/ml/model")
    args, _ = parser.parse_known_args()
    main(args)
"""

# --- Step 2: Define Dependencies for the new script ---
requirements_content = """
peft
bitsandbytes
accelerate
trl
"""

print("--- Step 3: Creating/Verifying Training Script and Dependencies ---")
os.makedirs('src', exist_ok=True)
with open('src/train_lora.py', 'w', encoding='utf-8') as f:
    f.write(script_content)
with open('src/requirements.txt', 'w', encoding='utf-8') as f:
    f.write(requirements_content)
print("✅ Fresh 'train_lora.py' (using TRL) and 'requirements.txt' created successfully.")


print("\n--- Step 4: Setting up Session and AWS Clients ---")
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
role_name = role.split('/')[-1]
secrets_client = boto3.client('secretsmanager')
print(f"✅ Session established. Using role: {role_name}")

print("\n--- Step 5: Securely Fetching Hugging Face Token ---")
secret_name = "huggingface-access-token"
hf_token = ""
try:
    response = secrets_client.get_secret_value(SecretId=secret_name)
    hf_token = json.loads(response['SecretString'])['hf_token']
    print("✅ Successfully fetched Hugging Face token from Secrets Manager.")
except Exception as e:
    print(f"❌ Could not fetch token. This is likely an IAM permission issue.")
    print(f"   MANUAL ACTION REQUIRED: Please go to the IAM console, find the role '{role_name}',")
    print(f"   and attach an inline policy that allows 'secretsmanager:GetSecretValue' on the secret '{secret_name}'.")
    print(f"   Error: {e}")


print("\n--- Step 6: Defining Buckets and Hyperparameters ---")
input_bucket = 'edenred-invoice-data-ab-20250817'
output_bucket = 'edenred-llm-artifacts-ab-20250817'
training_data_uri = f's3://{input_bucket}/instructions/'

hyperparameters = {
    'model_id': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    'dataset_path': '/opt/ml/input/data/training',
    'epochs': 1,
    'per_device_train_batch_size': 1,
    'learning_rate': 2e-4,
}
print(f"✅ Hyperparameters defined. Using model: {hyperparameters['model_id']}")

print("\n--- Step 7: Configuring the SageMaker Training Job ---")
environment = {
    'HF_TOKEN': hf_token
}

huggingface_estimator = HuggingFace(
    entry_point='train_lora.py',
    source_dir='./src',
    instance_type='ml.g5.2xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.36',
    pytorch_version='2.1',
    py_version='py310',
    hyperparameters=hyperparameters,
    environment=environment,
    output_path=f's3://{output_bucket}/',
)
print("✅ SageMaker Estimator created.")

print("\n--- Step 8: Launching the Training Job ---")
if hf_token:
    huggingface_estimator.fit({'training': training_data_uri})
    print("\n--- 🎉 Training Job Submitted Successfully! ---")
else:
    print("\n--- 🛑 Training job not started due to missing token. Please fix IAM permissions and re-run. ---")


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


--- Step 3: Creating/Verifying Training Script and Dependencies ---
✅ Fresh 'train_lora.py' (using TRL) and 'requirements.txt' created successfully.

--- Step 4: Setting up Session and AWS Clients ---
✅ Session established. Using role: AmazonSageMaker-ExecutionRole-20250817T092438

--- Step 5: Securely Fetching Hugging Face Token ---
✅ Successfully fetched Hugging Face token from Secrets Manager.

--- Step 6: Defining Buckets and Hyperparameters ---
✅ Hyperparameters defined. Using model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

--- Step 7: Configuring the SageMaker Training Job ---
✅ SageMaker Estimator created.

--- Step 8: Launching the Training Job ---


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-08-17-20-24-27-463


2025-08-17 20:24:28 Starting - Starting the training job
2025-08-17 20:24:28 Pending - Training job waiting for capacity......
2025-08-17 20:25:06 Pending - Preparing the instances for training...
2025-08-17 20:25:50 Downloading - Downloading the training image..................
2025-08-17 20:28:57 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2025-08-17 20:29:17,166 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-08-17 20:29:17,184 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-17 20:29:17,194 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2025-08-17 20:29:17,195 sagemaker_pytorch_container.t

In [30]:
# Run this in your SageMaker notebook to fix the deployment

import os
import json
from sagemaker.huggingface import HuggingFaceModel
import sagemaker
import time

# Step 1: Create the code directory and inference script
os.makedirs('code', exist_ok=True)

# Step 2: Create a simple inference.py script
inference_code = '''
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def model_fn(model_dir):
    """Load the model and tokenizer"""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            torch_dtype=torch.float32,  # Use float32 for CPU compatibility
            device_map="auto" if torch.cuda.is_available() else None
        )
        return {"model": model, "tokenizer": tokenizer}
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

def input_fn(request_body, request_content_type):
    """Parse input data"""
    if request_content_type == "application/json":
        input_data = json.loads(request_body)
        return input_data
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model_dict):
    """Generate prediction"""
    try:
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        
        # Extract inputs and parameters
        inputs = input_data.get("inputs", "")
        parameters = input_data.get("parameters", {})
        
        # Default parameters
        max_new_tokens = parameters.get("max_new_tokens", 100)
        temperature = parameters.get("temperature", 0.7)
        do_sample = parameters.get("do_sample", True)
        
        # Tokenize input
        input_ids = tokenizer.encode(inputs, return_tensors="pt")
        
        # Generate response
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode output
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        
        return {
            "generated_text": generated_text,
            "input_length": len(input_ids[0]),
            "output_length": len(output_ids[0])
        }
    
    except Exception as e:
        print(f"Error in prediction: {e}")
        return {"error": str(e), "generated_text": "Sorry, I encountered an error generating a response."}

def output_fn(prediction, accept):
    """Format the output"""
    if accept == "application/json":
        return json.dumps(prediction), accept
    else:
        raise ValueError(f"Unsupported accept type: {accept}")
'''

# Write the inference script
with open('code/inference.py', 'w') as f:
    f.write(inference_code)

print("✅ Created code/inference.py")

# Step 3: Create requirements.txt for the inference environment
requirements = '''
transformers==4.37.0
torch>=2.0.0
accelerate
'''

with open('code/requirements.txt', 'w') as f:
    f.write(requirements)

print("✅ Created code/requirements.txt")

# Step 4: Deploy the model with CPU instance (avoids GPU driver issues)
print("🚀 Creating HuggingFace model...")

huggingface_model = HuggingFaceModel(
    model_data="s3://edenred-llm-artifacts-ab-20250817/huggingface-pytorch-training-2025-08-17-20-24-27-463/output/model.tar.gz",
    role="arn:aws:iam::453553127570:role/service-role/AmazonSageMaker-ExecutionRole-20250817T092438",
    transformers_version="4.37.0",
    pytorch_version="2.1.0",
    py_version="py310",
    entry_point="inference.py",
    source_dir="code"  # Now this directory exists!
)

print("🚀 Deploying with CPU instance...")

# Generate unique endpoint name
endpoint_name = f'huggingface-cpu-{int(time.time())}'

try:
    llm_predictor = huggingface_model.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.xlarge',  # CPU instance - no GPU driver issues
        endpoint_name=endpoint_name,
        wait=False  # Don't wait for deployment to complete
    )
    
    print("✅ Deployment initiated!")
    print(f"🎯 Endpoint Name: {endpoint_name}")
    print("⏳ Deployment will take 5-10 minutes...")
    
    # Print the endpoint name to copy to Lambda
    print("\n" + "="*70)
    print("🔗 COPY THIS TO YOUR LAMBDA ENVIRONMENT VARIABLE:")
    print(f"SAGEMAKER_ENDPOINT_NAME = {endpoint_name}")
    print("="*70)
    
    # Test the endpoint (this will wait for deployment)
    print("\n🧪 Testing endpoint (waiting for deployment to complete)...")
    
    test_data = {
        "inputs": "Hello, how are you?",
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0.7,
            "do_sample": True
        }
    }
    
    try:
        response = llm_predictor.predict(test_data)
        print("✅ Test successful!")
        print(f"📄 Response: {response}")
    except Exception as e:
        print(f"⚠️ Test failed (endpoint might still be deploying): {e}")
        print("💡 Try testing again in a few minutes")

except Exception as e:
    print(f"❌ Deployment failed: {e}")
    print("💡 Try running the deployment again")

✅ Created code/inference.py
✅ Created code/requirements.txt
🚀 Creating HuggingFace model...
🚀 Deploying with CPU instance...


INFO:sagemaker:Repacking model artifact (s3://edenred-llm-artifacts-ab-20250817/huggingface-pytorch-training-2025-08-17-20-24-27-463/output/model.tar.gz), script artifact (code), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-453553127570/huggingface-pytorch-inference-2025-08-18-03-31-38-967/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: huggingface-pytorch-inference-2025-08-18-03-31-42-185
INFO:sagemaker:Creating endpoint-config with name huggingface-cpu-1755487898
INFO:sagemaker:Creating endpoint with name huggingface-cpu-1755487898


✅ Deployment initiated!
🎯 Endpoint Name: huggingface-cpu-1755487898
⏳ Deployment will take 5-10 minutes...

🔗 COPY THIS TO YOUR LAMBDA ENVIRONMENT VARIABLE:
SAGEMAKER_ENDPOINT_NAME = huggingface-cpu-1755487898

🧪 Testing endpoint (waiting for deployment to complete)...
⚠️ Test failed (endpoint might still be deploying): An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint huggingface-cpu-1755487898 of account 453553127570 not found.
💡 Try testing again in a few minutes
