In [None]:
# ===========================================
# COMPLETE SAGEMAKER WORKFLOW FOR IMDB SENTIMENT
# ===========================================

import glob
import json
import os
import pickle
import re
import subprocess
from pathlib import Path

import boto3
import joblib
import numpy as np
import pandas as pd
import sagemaker
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.xgboost import XGBoost, XGBoostPredictor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

# Initialize SageMaker
session = sagemaker.Session()
role = get_execution_role()
region = session.boto_region_name
bucket = session.default_bucket()
prefix = 'imdb-sentiment-analysis'

# ===========================================
# 1. DATA PREPARATION
# ===========================================

def download_imdb_data():
    """Download IMDb dataset if not already present"""
    data_dir = Path('../data/aclImdb')
    if not data_dir.exists():
        print("Downloading IMDb dataset...")
        # In production, we'd use a more reliable source
        os.system('wget -P ../data/ https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
        os.system('tar -xvzf ../data/aclImdb_v1.tar.gz -C ../data/')
    return str(data_dir)


# ===========================================
# 2. PROCESSING PIPELINE
# ===========================================

def create_processing_step():
    """Create SageMaker Processing Job"""
    # Use optimized sklearn container
    sklearn_processor = SKLearnProcessor(
        framework_version='1.2-1',
        role=role,
        instance_type='ml.m5.large',
        instance_count=1,
        base_job_name='imdb-processing'
    )
    
    return ProcessingStep(
        name='IMDBDataProcessing',
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                source=download_imdb_data(),
                destination='/opt/ml/processing/input/data'
            )
        ],
        outputs=[
            ProcessingOutput(
                output_name='train',
                source='/opt/ml/processing/output/train',
                destination=f's3://{bucket}/{prefix}/processed/train'
            ),
            ProcessingOutput(
                output_name='validation',
                source='/opt/ml/processing/output/validation',
                destination=f's3://{bucket}/{prefix}/processed/validation'
            ),
            ProcessingOutput(
                output_name='test',
                source='/opt/ml/processing/output/test',
                destination=f's3://{bucket}/{prefix}/processed/test'
            )
        ],
        code='processing_script.py'  # Contains our processing logic
    )


In [None]:
# ===========================================
# 3. MODEL TRAINING
# ===========================================

def create_training_step(train_s3, val_s3):
    """Create SageMaker Training Job"""
    # Get XGBoost container
    container = sagemaker.image_uris.retrieve('xgboost', region, '1.7-1')
    
    # Hyperparameters with validation
    hyperparameters = {
        'max_depth': '5',
        'eta': '0.2',
        'gamma': '4',
        'min_child_weight': '6',
        'subsample': '0.8',
        'objective': 'binary:logistic',
        'early_stopping_rounds': '10',
        'num_round': '100'
    }
    
    # Configure estimator
    xgb_estimator = XGBoost(
        entry_point='train.py',
        hyperparameters=hyperparameters,
        image_uri=container,
        role=role,
        instance_count=1,
        instance_type='ml.m5.2xlarge',
        framework_version='1.7-1',
        output_path=f's3://{bucket}/{prefix}/models',
        use_spot_instances=True,
        max_wait=7200,
        max_run=3600
    )
    
    return TrainingStep(
        name='IMDBModelTraining',
        estimator=xgb_estimator,
        inputs={
            'train': TrainingInput(s3_data=train_s3, content_type='text/csv'),
            'validation': TrainingInput(s3_data=val_s3, content_type='text/csv')
        }
    )


In [None]:
# ===========================================
# 4. DEPLOYMENT
# ===========================================

def deploy_model(model_data, endpoint_name):
    """Deploy trained model to endpoint"""
    model = sagemaker.model.Model(
        image_uri=sagemaker.image_uris.retrieve('xgboost', region, '1.7-1'),
        model_data=model_data,
        role=role,
        predictor_cls=XGBoostPredictor
    )
    
    # Production deployment configuration
    predictor = model.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.large',
        endpoint_name=endpoint_name,
        data_capture_config=sagemaker.model_monitor.DataCaptureConfig(
            enable_capture=True,
            sampling_percentage=100,
            destination_s3_uri=f's3://{bucket}/{prefix}/monitoring'
        )
    )
    return predictor

In [None]:
# ===========================================
# 5. INFERENCE
# ===========================================

def run_batch_transform(test_s3, model_data):
    """Run batch transform on test data"""
    transformer = sagemaker.transformer.Transformer(
        model_data=model_data,
        instance_count=1,
        instance_type='ml.m5.xlarge',
        output_path=f's3://{bucket}/{prefix}/predictions',
        strategy='SingleRecord',
        assemble_with='Line',
        accept='text/csv'
    )
    
    transformer.transform(
        data=test_s3,
        content_type='text/csv',
        split_type='Line'
    )
    transformer.wait()

In [None]:
# ===========================================
# 6. PIPELINE ORCHESTRATION
# ===========================================

def main():
    """Execute end-to-end workflow"""
    # Create processing step
    processing_step = create_processing_step()
    
    # Create training step
    train_s3 = f's3://{bucket}/{prefix}/processed/train'
    val_s3 = f's3://{bucket}/{prefix}/processed/validation'
    training_step = create_training_step(train_s3, val_s3)
    
    # Define pipeline
    pipeline = Pipeline(
        name='IMDBSentimentPipeline',
        steps=[processing_step, training_step],
        parameters=[],
        sagemaker_session=session
    )
    
    # Execute pipeline
    pipeline.upsert(role_arn=role)
    execution = pipeline.start()
    execution.wait()
    
    # Get model data
    model_data = execution.steps[1].properties.ModelArtifacts.S3ModelArtifacts
    
    # Deploy model
    endpoint_name = 'imdb-sentiment-endpoint'
    deploy_model(model_data, endpoint_name)
    
    # Run batch transform
    test_s3 = f's3://{bucket}/{prefix}/processed/test'
    run_batch_transform(test_s3, model_data)
    
    print("✅ End-to-end workflow completed!")

# Run the workflow only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()