In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# S3 bucket used by this notebook: the upload cells write the data and code
# files into it, and it is passed to get_pipeline as `default_bucket`.
default_bucket = "customer-churn-sm-pipeline-na-1"

In [3]:
# Fixed identifiers handed to get_pipeline further down.
pipeline_name = "ChurnModelSMPipeline"
model_package_group_name = "ChurnModelPackageGroup"
sklearn_processor_version = "0.23-1"

# AWS context: region of the default boto3 session, the SageMaker execution
# role, and a SageMaker session object.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# NOTE(review): despite the variable name, this looks up the image URI for the
# scikit-learn framework at `sklearn_processor_version`; it is later passed to
# get_pipeline as `custom_image_uri`.
clarify_image = sagemaker.image_uris.retrieve(
    framework='sklearn',
    version=sklearn_processor_version,
    region=region,
)

In [4]:
def preprocess_data(file_path):
    """Load the raw store CSV and turn it into an all-numeric feature table.

    Processing steps:
      1. Parse ``firstorder``, ``lastorder`` and ``created`` as datetimes.
         Values that cannot be parsed become ``NaT`` instead of raising, so a
         malformed date in any of the three columns is handled the same way.
      2. Drop every row that has a missing value in any column (this includes
         the ``NaT`` values produced in step 1).
      3. Add ``first_last_days_diff`` (days from first to last order) and
         ``created_first_days_diff`` (days from first order to record
         creation; negative when the record was created before the order).
      4. Drop ``custid`` and the three date columns.
      5. One-hot encode ``favday`` and ``city`` into float columns named
         ``favday_<value>`` / ``city_<value>``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path, URL, buffer).

    Returns:
        A new ``pandas.DataFrame`` with the remaining original columns, the two
        day-difference columns and the one-hot columns.
    """
    df = pd.read_csv(file_path)

    # Parse all date columns *before* dropna so unparseable values in any of
    # them are removed together with the other incomplete rows.
    date_columns = ["firstorder", "lastorder", "created"]
    for column in date_columns:
        df[column] = pd.to_datetime(df[column], errors="coerce")

    # Drop rows with null values (including dates that failed to parse).
    df = df.dropna()

    # Derived features; keyword order fixes the resulting column order.
    df = df.assign(
        first_last_days_diff=(df["lastorder"] - df["firstorder"]).dt.days,
        created_first_days_diff=(df["created"] - df["firstorder"]).dt.days,
    )

    # The identifier and the raw dates are not used as features.
    df = df.drop(columns=["custid"] + date_columns)

    # Apply one hot encoding on favday and city columns.
    return pd.get_dummies(
        df, prefix=["favday", "city"], columns=["favday", "city"], dtype=float
    )

In [5]:
# Build the feature table from the full store data and preview it.
baseline_data = preprocess_data("data/storedata_total.csv")
print(baseline_data.head())
print(baseline_data.columns)
# Remove the `retained` column in place (after the preview above, which still
# shows it) so it is not part of the sampled baseline rows.
baseline_data.pop("retained")
# Draw a very small fraction of rows as the baseline sample. The seed is fixed
# so that re-running the notebook produces the same baseline file.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

   retained  esent   eopenrate  eclickrate  avgorder   ordfreq  paperless  \
0         0     29  100.000000    3.448276     14.52  0.000000          0   
1         1     95   92.631579   10.526316     83.69  0.181641          1   
2         0      0    0.000000    0.000000     33.58  0.059908          0   
3         0      0    0.000000    0.000000     54.96  0.000000          0   
4         1     30   90.000000   13.333333    111.91  0.008850          0   

   refill  doorstep  first_last_days_diff  ...  favday_Monday  \
0       0         0                     0  ...            1.0   
1       1         1                  1024  ...            0.0   
2       0         0                   217  ...            0.0   
3       0         0                     0  ...            0.0   
4       0         0                   791  ...            1.0   

   favday_Saturday  favday_Sunday  favday_Thursday  favday_Tuesday  \
0              0.0            0.0              0.0             0.0   
1     

In [6]:
# Persist the sampled baseline rows as bare CSV: no header row, no index column.
baseline_sample.to_csv("data/baseline.csv", index=False, header=False)

In [7]:
# Re-run the preprocessing on the full store data for the batch input.
batch_data = preprocess_data("data/storedata_total.csv")
# Remove the `retained` column in place so it is not part of the batch rows.
batch_data.pop("retained")
# Keep 20% of the rows. The seed is fixed so that re-running the notebook
# produces the same batch file.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [8]:
# Persist the batch rows as comma-separated values without header or index.
batch_sample.to_csv("data/batch.csv", sep=',', index=False, header=False)

In [9]:
# NOTE: boto3.resource(...) returns a service resource rather than a low-level
# client; the name `s3_client` is kept because a later cell uses it.
s3_client = boto3.resource('s3')

# Upload the raw data, the batch sample and the baseline sample to the bucket.
# Each pair is (local file, S3 object key).
data_bucket = s3_client.Bucket(default_bucket)
for local_path, object_key in [
    ("data/storedata_total.csv", "data/storedata_total.csv"),
    ("data/batch.csv", "data/batch/batch.csv"),
    ("data/baseline.csv", "input/baseline/baseline.csv"),
]:
    data_bucket.upload_file(local_path, object_key)

In [10]:
# Upload the pipeline's helper scripts under input/code/ in the bucket.
# Each pair is (local file, S3 object key).
code_bucket = s3_client.Bucket(default_bucket)
for script_path, script_key in [
    ("pipelines/customerchurn/preprocess.py", "input/code/preprocess.py"),
    ("pipelines/customerchurn/evaluate.py", "input/code/evaluate.py"),
    ("pipelines/customerchurn/generate_config.py", "input/code/generate_config.py"),
]:
    code_bucket.upload_file(script_path, script_key)

In [11]:
from pipelines.customerchurn.pipeline import get_pipeline

# Gather the values configured in the earlier cells and pass them, by keyword,
# to the project's pipeline factory.
pipeline_settings = dict(
    region=region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version,
)
pipeline = get_pipeline(**pipeline_settings)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [12]:
# Display the generated pipeline definition (shown as a JSON string in the
# cell output) for inspection.
pipeline.definition()



'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://customer-churn-sm-pipeline-na-1/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://customer-churn-sm-pipeline-na-1/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri": "683313688378.dkr

In [13]:
# Upsert the pipeline with the execution role; the displayed response
# contains the resulting PipelineArn.
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:471112960248:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': 'e3bcd1ed-117e-4674-9705-6e90a8d7261d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e3bcd1ed-117e-4674-9705-6e90a8d7261d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Wed, 31 Jul 2024 18:36:03 GMT'},
  'RetryAttempts': 0}}

In [14]:
# Start a pipeline execution and block until it reaches a terminal state, so
# that the describe()/list_steps() cells that follow report the final result
# when the notebook is run top to bottom.
execution = pipeline.start()
# Poll every 30 s for up to one hour; the execution recorded in this notebook's
# output took about 29 minutes from CreationTime to LastModifiedTime.
# The waiter raises if the execution fails or the time budget is exhausted.
execution.wait(delay=30, max_attempts=120)

In [17]:
# Display the execution's metadata, including PipelineExecutionStatus and the
# creation / last-modified timestamps.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:471112960248:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:471112960248:pipeline/ChurnModelSMPipeline/execution/i6q3b4ezebzv',
 'PipelineExecutionDisplayName': 'execution-1722450963951',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'churnmodelsmpipeline',
  'TrialName': 'i6q3b4ezebzv'},
 'CreationTime': datetime.datetime(2024, 7, 31, 18, 36, 3, 893000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 7, 31, 19, 4, 54, 147000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:471112960248:user-profile/d-o8svfhlitnen/default-1722447979041',
  'UserProfileName': 'default-1722447979041',
  'DomainId': 'd-o8svfhlitnen',
  'IamIdentity': {'Arn': 'arn:aws:sts::471112960248:assumed-role/AmazonSageMaker-ExecutionRole-20240430T143597/SageMaker',
   'PrincipalId': 'AROAW3MEFJD4IBRMZ6YCF:SageMaker'}},
 'LastModifiedBy': {'Use

In [18]:
# Display each step of the execution with its status, start/end time and the
# ARN of the job it ran.
execution.list_steps()

[{'StepName': 'ClarifyProcessingStep',
  'StartTime': datetime.datetime(2024, 7, 31, 18, 47, 17, 433000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 7, 31, 19, 4, 53, 543000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:471112960248:processing-job/pipelines-i6q3b4ezebzv-ClarifyProcessingSte-LE6dHlBc87'}},
  'AttemptCount': 1},
 {'StepName': 'ChurnModelConfigFile',
  'StartTime': datetime.datetime(2024, 7, 31, 18, 44, 43, 890000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 7, 31, 18, 47, 16, 760000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:471112960248:processing-job/pipelines-i6q3b4ezebzv-ChurnModelConfigFile-4Hj89yF3ae'}},
  'AttemptCount': 1},
 {'StepName': 'ChurnTransform',
  'StartTime': datetime.datetime(2024, 7, 31, 18, 44, 43, 890000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 7, 31, 18, 50, 13, 8