In [1]:
#import data
import glob
import os
import time
from datetime import datetime
from pathlib import Path

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.xgboost import XGBoost, XGBoostPredictor


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
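# One-time setup on a fresh environment: uncomment the commands in this cell to
# create the output folder and to download and extract the IMDb dataset.
# %mkdir data/preprocessed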
 

# !wget -O data/aclImdb_v1.tar.gz https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxf data/aclImdb_v1.tar.gz -C data




In [5]:
def read_imdb_data(data_dir='data/aclImdb', train_limit=None, test_limit=None):
    """Read the raw IMDb review files and build matching sentiment labels.

    The reviews are expected under ``<data_dir>/<split>/<sentiment>/*.txt``
    where ``split`` is ``train`` or ``test`` and ``sentiment`` is ``pos`` or
    ``neg``.

    Args:
        data_dir: Root folder of the extracted dataset.
        train_limit: Maximum number of reviews to read for each sentiment of
            the ``train`` split. ``None`` (the default) reads every file.
        test_limit: Same as ``train_limit`` but for the ``test`` split.

    Returns:
        A ``(data, labels)`` tuple of nested dicts indexed as
        ``[split][sentiment]``. ``data`` holds lists of review texts and
        ``labels`` holds lists of the same length containing ``1`` for
        positive reviews and ``0`` for negative ones.
    """
    limits = {'train': train_limit, 'test': test_limit}
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}
        limit = limits[data_type]

        for sentiment in ['pos', 'neg']:
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            print(f"PATH, {path}")

            # glob returns files in arbitrary order; sorting makes a limited
            # read pick the same files on every run.
            files = sorted(glob.glob(path))
            if limit is not None:
                files = files[:max(0, limit)]

            reviews = []
            for f in files:
                # Decode explicitly as UTF-8 instead of relying on the
                # platform's default encoding.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            # Here we represent a positive review by '1' and a negative review by '0'
            labels[data_type][sentiment] = [1 if sentiment == 'pos' else 0] * len(reviews)

    return data, labels

# Load the reviews from the default dataset folder, then report how many were
# read for each split / sentiment combination.
data, labels = read_imdb_data()
review_counts = [
    len(data[split_name][sentiment_name])
    for split_name in ('train', 'test')
    for sentiment_name in ('pos', 'neg')
]
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*review_counts))

PATH, data/aclImdb/train/pos/*.txt
PATH, data/aclImdb/train/neg/*.txt
PATH, data/aclImdb/test/pos/*.txt
PATH, data/aclImdb/test/neg/*.txt
IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [6]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Prepare training and test sets from IMDb movie reviews.

    Args:
        data: Nested dict of review texts indexed as ``[split][sentiment]``
            with splits ``train``/``test`` and sentiments ``pos``/``neg``.
        labels: Nested dict with the same layout holding the labels.
        random_state: Forwarded to ``shuffle`` for both splits. Pass an int
            to get the same ordering on every run; ``None`` (the default)
            keeps the previous unseeded behaviour.

    Returns:
        ``(data_train, data_test, labels_train, labels_test)`` where each
        split's reviews and labels have been shuffled together.
    """
    # Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and corresponding labels within training and test sets;
    # passing both sequences in one call keeps each review aligned with its label.
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    # Return unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

# Merge the per-sentiment lists into shuffled train/test sets and report their sizes.
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
n_train, n_test = len(train_X), len(test_X)
print("IMDb reviews (combined): train = {}, test = {}".format(n_train, n_test))

IMDb reviews (combined): train = 25000, test = 25000


In [10]:
import pandas as pd

data_dir = 'data/preprocessed'

# to_csv does not create missing parent folders, and the `%mkdir` setup command
# earlier in the notebook is commented out, so make sure the folder exists.
os.makedirs(data_dir, exist_ok=True)

# Save training data (one row per review, label in the 'sentiment' column)
pd.DataFrame({
    'review': train_X,
    'sentiment': train_y
}).to_csv(os.path.join(data_dir, 'train.csv'), index=False)

# Save testing data in the same two-column layout
pd.DataFrame({
    'review': test_X,
    'sentiment': test_y
}).to_csv(os.path.join(data_dir, 'test.csv'), index=False)

In [6]:
#Upload to s3

# Low-level S3 client used by the upload calls below.
s3 = boto3.client('s3')

# Target bucket and the key prefix ("folder") the CSV files are stored under.
bucket, s3_prefix = 'e2e-imdb-sagemaker-sentiment', 'imdb-data'

# Uploads are kept commented out; uncomment the ones that need to be re-run.
# s3.upload_file(os.path.join(data_dir, 'train.csv'), bucket, f'{s3_prefix}/train.csv')
# s3.upload_file(os.path.join(data_dir, 'test.csv'), bucket, f'{s3_prefix}/test.csv')
# s3.upload_file('scripts.zip', bucket, 'code/scripts.zip')
# s3.upload_file('scripts.zip', bucket,  'scripts/')
# s3.upload_file('requirements.txt', bucket, 'code/training/requirements.txt')



INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [2]:
def create_processing_step(role, config):
    """Build the pipeline step that runs the preprocessing script.

    The step runs ``scripts/processing_job.py`` on an ``SKLearnProcessor``
    (framework ``1.2-1``, a single ``ml.m5.large`` instance). Its only input
    is ``s3://<bucket>/imdb-data/``, made available to the job at
    ``/opt/ml/processing/input/data``, and it declares four named outputs:
    ``train``, ``validation``, ``test`` and ``vectorizer``.

    Args:
        role: Role handed to the processor.
        config: Mapping providing ``"processing_step_name"`` and ``"bucket"``.

    Returns:
        The configured ``ProcessingStep``.
    """
    processor = SKLearnProcessor(
        base_job_name='imdb-processing',
        instance_count=1,
        instance_type='ml.m5.large',
        role=role,
        framework_version='1.2-1'
    )

    step_name = config["processing_step_name"]

    raw_reviews = ProcessingInput(
        destination='/opt/ml/processing/input/data',
        source=f"s3://{config['bucket']}/imdb-data/"
    )

    # (output name, container folder) pairs, in the order they are declared on the step.
    output_channels = [
        ('train', '/opt/ml/processing/output/train'),
        ('validation', '/opt/ml/processing/output/validation'),
        ('test', '/opt/ml/processing/output/test'),
        ('vectorizer', '/opt/ml/processing/output/vectorizer'),
    ]
    step_outputs = [
        ProcessingOutput(output_name=channel, source=folder)
        for channel, folder in output_channels
    ]

    return ProcessingStep(
        name=step_name,
        processor=processor,
        inputs=[raw_reviews],
        outputs=step_outputs,
        code='scripts/processing_job.py'
    )


In [3]:
def create_training_step(train_s3, val_s3, config):
    """Build the pipeline step that trains the XGBoost model.

    The estimator runs ``scripts/train.py`` on the XGBoost ``1.7-1`` image
    for ``config["region"]`` using one ``ml.m5.2xlarge`` instance with spot
    instances enabled, and writes its output under
    ``s3://<bucket>/<prefix>/models``.

    Args:
        train_s3: S3 location used for the ``train`` channel.
        val_s3: S3 location used for the ``validation`` channel.
        config: Mapping providing ``"region"``, ``"role"``, ``"bucket"``,
            ``"prefix"`` and ``"training_step_name"``.

    Returns:
        The configured ``TrainingStep``.
    """
    image = sagemaker.image_uris.retrieve('xgboost', config["region"], '1.7-1')

    # Values are given as strings, exactly as they are handed to the estimator.
    booster_params = {
        'max_depth': '5',
        'eta': '0.2',
        'gamma': '4',
        'min_child_weight': '6',
        'subsample': '0.8',
        'objective': 'binary:logistic',
        'early_stopping_rounds': '10',
        'num_round': '100'
    }

    estimator_settings = {
        'entry_point': 'train.py',
        'source_dir': 'scripts',
        'hyperparameters': booster_params,
        'image_uri': image,
        'role': config["role"],
        'instance_count': 1,
        'instance_type': 'ml.m5.2xlarge',
        'framework_version': '1.7-1',
        'output_path': f's3://{config["bucket"]}/{config["prefix"]}/models',
        'use_spot_instances': True,
        'max_wait': 7200,
        'max_run': 3600,
        'dependencies': ['requirements.txt'],
    }
    estimator = XGBoost(**estimator_settings)

    step_name = config['training_step_name']

    # Two CSV data channels plus a plain-text channel pointing at requirements.txt.
    channels = {
        'train': TrainingInput(s3_data=train_s3, content_type='text/csv'),
        'validation': TrainingInput(s3_data=val_s3, content_type='text/csv'),
        'dependencies': TrainingInput(
            content_type='text/plain',
            s3_data=f's3://{config["bucket"]}/code/training/requirements.txt'
        ),
    }

    return TrainingStep(name=step_name, estimator=estimator, inputs=channels)


In [4]:
from utils.test import test_endpoint
def main():
    """Run the end-to-end workflow: pipeline, deployment and a smoke test.

    Steps, in order:
      1. Build the processing and training steps and run them as the
         ``IMDBSentimentPipeline02`` pipeline, waiting for completion.
      2. Look up the training and processing job names from the execution
         and resolve the model artifact and vectorizer S3 locations.
      3. Remove any existing ``imdb-sentiment-endpoint`` endpoint and
         endpoint configuration.
      4. Deploy the trained model to a fresh endpoint and send one sample
         review through ``test_endpoint``.

    Raises:
        ValueError: If a job name cannot be resolved from the execution
            steps, or the processing job has no ``vectorizer`` output.
    """
    # Initialize SageMaker
    session = sagemaker.Session()
    role = sagemaker.get_execution_role()
    region = session.boto_region_name
    bucket = "e2e-imdb-sagemaker-sentiment"
    prefix = 'imdb-sentiment-analysis'
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    training_step_name = f'IMDBModelTraining03_{timestamp}'
    processing_step_name = f'IMDBDataProcessing_{timestamp}'

    config = {
        "session": session,
        "role": role,
        "region": region,
        "bucket": bucket,
        "prefix": prefix,
        "training_step_name": training_step_name,
        "processing_step_name": processing_step_name
    }

    processing_step = create_processing_step(role, config)

    # The training inputs reference the processing step's outputs, which
    # chains the two steps together inside the pipeline.
    train_s3 = processing_step.properties.ProcessingOutputConfig.Outputs['train'].S3Output.S3Uri
    val_s3 = processing_step.properties.ProcessingOutputConfig.Outputs['validation'].S3Output.S3Uri
    training_step = create_training_step(train_s3, val_s3, config)

    pipeline = Pipeline(
        name='IMDBSentimentPipeline02',
        steps=[processing_step, training_step],
        sagemaker_session=session
    )

    pipeline.upsert(role_arn=role)
    execution = pipeline.start()
    execution.wait()

    print("-----")

    # Get job names from execution
    training_job_name = None
    processing_job_name = None
    for step in execution.list_steps():
        if step['StepName'] == training_step_name:
            training_job_name = _job_name_from_step(step, 'TrainingJob')
        elif step['StepName'] == processing_step_name:
            processing_job_name = _job_name_from_step(step, 'ProcessingJob')

    if not training_job_name:
        raise ValueError(f"Training job name not found for step: {training_step_name}")
    if not processing_job_name:
        raise ValueError(f"Processing job name not found for step: {processing_step_name}")

    # Get model and vectorizer locations (client pinned to the session's region)
    sm_client = boto3.client('sagemaker', region_name=region)

    # Model data
    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    model_data = training_job['ModelArtifacts']['S3ModelArtifacts']
    print(f"✅ Model data location: {model_data}")

    # Vectorizer
    processing_job = sm_client.describe_processing_job(ProcessingJobName=processing_job_name)
    vectorizer_s3 = None
    for output in processing_job['ProcessingOutputConfig']['Outputs']:
        if output['OutputName'] == 'vectorizer':
            # Strip a trailing slash so the joined URI never contains '//'.
            vectorizer_s3 = output['S3Output']['S3Uri'].rstrip('/') + '/vectorizer.joblib'
            break

    if not vectorizer_s3:
        raise ValueError("Vectorizer output not found")
    print(f"✅ Vectorizer location: {vectorizer_s3}")

    # Endpoint cleanup
    endpoint_name = 'imdb-sentiment-endpoint'
    _remove_existing_endpoint(sm_client, endpoint_name)

    # Deploy model
    model = sagemaker.model.Model(
        image_uri=sagemaker.image_uris.retrieve('xgboost', region, '1.7-1'),
        model_data=model_data,
        role=role,
        predictor_cls=XGBoostPredictor  # predictor class imported from sagemaker.xgboost
    )

    print("🚀 Deploying model...")
    model.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.large',
        endpoint_name=endpoint_name,
        wait=True  # Wait until deployment completes
    )
    print(f"✅ Endpoint created: {endpoint_name}")

    # Test endpoint
    sample_review = "This movie was absolutely fantastic! The acting was superb."
    print("\n🧪 Testing endpoint...")
    result = test_endpoint(
        endpoint_name=endpoint_name,
        review_text=sample_review,
        vectorizer_path=vectorizer_s3
    )

    if 'error' in result:
        print(f"❌ Test failed: {result['error']}")
    else:
        print(f"🎯 Prediction: {result['prediction']:.4f} → {result['sentiment']} sentiment")
        print(f"Review: '{result['review']}'")

    print("✅ End-to-end workflow completed!")


def _job_name_from_step(step, job_kind):
    """Return the job name from ``step['Metadata'][job_kind]['Arn']``, or None if absent."""
    arn = step.get('Metadata', {}).get(job_kind, {}).get('Arn')
    # The job name is the last '/'-separated segment of the ARN.
    return arn.split('/')[-1] if arn else None


def _is_missing_resource_error(err):
    """Return True when a client error says the target resource does not exist."""
    details = err.response.get('Error', {})
    code = details.get('Code', '')
    message = details.get('Message', '')
    # A missing endpoint / endpoint configuration is reported as a
    # ValidationException whose message starts with "Could not find ...".
    return code == 'ResourceNotFound' or (
        code == 'ValidationException' and 'could not find' in message.lower()
    )


def _remove_existing_endpoint(sm_client, endpoint_name):
    """Delete the endpoint and its same-named configuration if they exist.

    Errors other than "resource does not exist" are re-raised.
    """
    # Delete endpoint if exists
    try:
        sm_client.delete_endpoint(EndpointName=endpoint_name)
    except sm_client.exceptions.ClientError as err:
        if not _is_missing_resource_error(err):
            raise
        print(f"ℹ️ Endpoint {endpoint_name} not found")
    else:
        # Deletion is asynchronous: poll until the endpoint is really gone
        # instead of sleeping for a fixed amount of time.
        sm_client.get_waiter('endpoint_deleted').wait(EndpointName=endpoint_name)
        print(f"♻️ Deleted endpoint: {endpoint_name}")

    # Delete endpoint configuration
    try:
        sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
    except sm_client.exceptions.ClientError as err:
        if not _is_missing_resource_error(err):
            raise
        print(f"ℹ️ Endpoint configuration {endpoint_name} not found")
    else:
        print(f"♻️ Deleted endpoint configuration: {endpoint_name}")
        # Short grace period before a configuration with the same name is recreated.
        time.sleep(10)

# Run the whole workflow (pipeline, deployment, endpoint test) when this code is
# executed directly; the guard keeps an import of this code from triggering it.
if __name__ == "__main__":
    main()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


-----
✅ Model data location: s3://e2e-imdb-sagemaker-sentiment/imdb-sentiment-analysis/models/pipelines-tlk7sjh05yz9-IMDBModelTraining03--svBjOzLPVe/output/model.tar.gz
✅ Vectorizer location: s3://sagemaker-us-east-1-869935066996/IMDBSentimentPipeline02/tlk7sjh05yz9/IMDBDataProcessing_20250805_125856/output/vectorizer/vectorizer.joblib


In [15]:
# Initialize session and resources
from utils.test import test_endpoint
# Fresh SageMaker session, plus the region, execution role and low-level client.
session = sagemaker.Session()
region = session.boto_region_name
role = get_execution_role()
sm_client = boto3.client('sagemaker', region_name=region)

# Artifact locations copied from the output of the pipeline run above.
model_data = "s3://e2e-imdb-sagemaker-sentiment/imdb-sentiment-analysis/models/pipelines-tlk7sjh05yz9-IMDBModelTraining03--svBjOzLPVe/output/model.tar.gz"
vectorizer_s3 = "s3://sagemaker-us-east-1-869935066996/IMDBSentimentPipeline02/tlk7sjh05yz9/IMDBDataProcessing_20250805_125856/output/vectorizer/vectorizer.joblib"

endpoint_name = 'imdb-sentiment-endpoint'

# Deployment is disabled in this cell, so it only exercises an endpoint that
# already exists. Uncomment the block below to deploy the model again.
# model = sagemaker.model.Model(
#     image_uri=sagemaker.image_uris.retrieve('xgboost', region, '1.7-1'),
#     model_data=model_data,
#     role=role,
#     predictor_cls=XGBoostPredictor
# )
#
# print("🚀 Deploying model...")
#
# predictor = model.deploy(
#     initial_instance_count=1,
#     instance_type='ml.m5.large',
#     endpoint_name=endpoint_name,
#     wait=True  # Wait until deployment completes
# )
print(f"✅ Endpoint created: {endpoint_name}")

# Send a single hand-written review through the endpoint.
sample_review = "Technically brilliant but emotionally empty - all style, no substance."
print("\n🧪 Testing endpoint...")

endpoint_request = dict(
    endpoint_name=endpoint_name,
    review_text=sample_review,
    vectorizer_path=vectorizer_s3,
)
result = test_endpoint(**endpoint_request)

if 'error' not in result:
    print(f"🎯 Prediction: {result['prediction']:.4f} → {result['sentiment']} sentiment")
    print(f"Review: '{result['review']}'")
else:
    print(f"❌ Test failed: {result['error']}")

print("✅ End-to-end workflow completed!")

✅ Endpoint created: imdb-sentiment-endpoint2

🧪 Testing endpoint...
🧹 Preprocessing text...
📥 Loading vectorizer...
✅ Loaded vectorizer from s3://sagemaker-us-east-1-869935066996/IMDBSentimentPipeline02/tlk7sjh05yz9/IMDBDataProcessing_20250805_125856/output/vectorizer/vectorizer.joblib
🔢 Transforming text to features...
📤 Sending request to endpoint...
🎯 Prediction: 0.7339 → POSITIVE sentiment
Review: 'Technically brilliant but emotionally empty - all style, no substance.'
✅ End-to-end workflow completed!


In [33]:
# Shell escape: add execute permission to scripts/install_and_run.sh.
!chmod +x scripts/install_and_run.sh

In [41]:
%%bash
# Overwrite requirements.txt in the working directory with the single entry "sagemaker-training".
# NOTE(review): create_training_step passes 'requirements.txt' via dependencies=[...],
# so this cell presumably has to run before the pipeline cell on a fresh checkout — confirm.
echo "sagemaker-training" > requirements.txt