In [43]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

In [None]:
### Submit the pipeline to SageMaker and start execution

In [44]:
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
sklearn_processor_version="0.23-1"
model_package_group_name="ChurnModelPackageGroup"
pipeline_name= "ChurnModelSMPipeline"
clarify_image = sagemaker.image_uris.retrieve(framework='sklearn',version=sklearn_processor_version,region=region)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [45]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"]=pd.to_datetime(df["firstorder"],errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"],errors='coerce')
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df['lastorder']-df['firstorder']).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff']=(df['created']-df['firstorder']).dt.days
    ## Drop Columns
    df.drop(['custid','created','firstorder','lastorder'],axis=1,inplace=True)
    ## Apply one hot encoding on favday and city columns
    df = pd.get_dummies(df,prefix=['favday','city'],columns=['favday','city'])
    return df

In [46]:
default_bucket_data = 'sagemaker-studio-891376975801-3j1752jty8h'
baseline_data = preprocess_data(f"s3://{default_bucket_data}/data/storedata_total.csv")
baseline_data.pop("retained")
baseline_sample = baseline_data.sample(frac=0.0002)

In [47]:
pd.DataFrame(baseline_sample).to_csv(f"s3://{default_bucket_data}/data/baseline.csv",header=False,index=False)

In [48]:
batch_data = preprocess_data(f"s3://{default_bucket_data}/data/storedata_total.csv")
batch_data.pop("retained")
batch_sample = batch_data.sample(frac=0.2)

In [49]:
pd.DataFrame(batch_sample).to_csv(f"s3://{default_bucket_data}/data/batch.csv",header=False,index=False)

In [23]:
s3_client = boto3.resource('s3')
s3_client.Bucket(default_bucket)
# s3_client.Bucket(default_bucket).upload_file(f"s3://{default_bucket_data}/data/storedata_total.csv",f"s3://{default_bucket}/data/storedata_total.csv")
# s3_client.Bucket(default_bucket).upload_file(f"s3://{default_bucket_data}/data/batch.csv",f"s3://{default_bucket}/data/batch.csv")
# s3_client.Bucket(default_bucket).upload_file(f"s3://{default_bucket_data}/data/baseline.csv",f"s3://{default_bucket}/data/baseline.csv")

s3.Bucket(name='sagemaker-studio-891376975801-pwz3ttqzi5n')

In [26]:
s3_client.Bucket(default_bucket).upload_file("churn-modelling/pipelines/customerchurn/preprocess.py","input/code/preprocess.py")

In [27]:
s3_client.Bucket(default_bucket).upload_file("churn-modelling/pipelines/customerchurn/evaluate.py","input/code/evaluate.py")
s3_client.Bucket(default_bucket).upload_file("churn-modelling/pipelines/customerchurn/generate_config.py","input/code/generate_config.py")

In [29]:
!pip install pipelines

Collecting pipelines
  Downloading pipelines-0.0.14.tar.gz (161 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.9/161.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting futures==3.0.5 (from pipelines)
  Downloading futures-3.0.5.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Jinja2==2.8 (from pipelines)
  Downloading Jinja2-2.8-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting PyYAML==3.11 (from pipelines)
  Downloading PyYAML-3.11.zip (371 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m371.8/371.8 kB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting requests==2.9.1 (from pipelines)
  Downloading requests-2.9.1-py2.py3-none-any.whl.metadata (36 kB)
Collecting sh==1.11 (from pipelines)
  Downloading sh-1.11.tar.gz (36 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting 

In [50]:
from churn_modelling.pipelines.customerchurn.pipeline import get_pipeline

pipeline = get_pipeline(
    region = region,
    role=role,
    default_bucket=default_bucket_data,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [51]:
pipeline.definition()



'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-891376975801-3j1752jty8h/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-891376975801-3j1752jty8h/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri

In [52]:
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-2:891376975801:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': '5ee5e2d7-2cb4-414b-abd3-1c9799123e8c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5ee5e2d7-2cb4-414b-abd3-1c9799123e8c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sun, 09 Jun 2024 15:18:49 GMT'},
  'RetryAttempts': 0}}

In [53]:
execution = pipeline.start()

In [54]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:891376975801:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:891376975801:pipeline/ChurnModelSMPipeline/execution/y9t61wkdsxru',
 'PipelineExecutionDisplayName': 'execution-1717946339600',
 'PipelineExecutionStatus': 'Failed',
 'PipelineExperimentConfig': {'ExperimentName': 'churnmodelsmpipeline',
  'TrialName': 'y9t61wkdsxru'},
 'FailureReason': 'Step failure: One or multiple steps failed.',
 'CreationTime': datetime.datetime(2024, 6, 9, 15, 18, 59, 510000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 6, 9, 15, 19, 1, 646000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-2:891376975801:user-profile/d-8lx0bg5nslfb/default-20240608t155149',
  'UserProfileName': 'default-20240608t155149',
  'DomainId': 'd-8lx0bg5nslfb',
  'IamIdentity': {'Arn': 'arn:aws:sts::891376975801:assumed-role/AmazonSageMaker-ExecutionRole-20240608T155149/SageMaker',
   'PrincipalI