sklearn_preprocessor.fit({'train': train_path}, logs=False) 

1- call fit method of Scikit Pipeline

transformer.transform(train_path, content_type="text/csv",  logs=False) 

1 - pass s3 csv file

2- call input_fn() that reads the csv file

3 - call predict_fn() that uses Scikit transform() method to transform the csv data

4- call output_fn that format the output and send it to s3 as .csv.out

In [2]:
!pip install polars

[0m

In [163]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, Binarizer

import os

import csv
import json
import joblib
import polars as pl
import numpy as np
from io import StringIO

feature_columns_names = ['credit_score',
                         'country',
                         'gender',
                         'age',
                         'tenure',
                         'balance',
                         'products_number',
                         'credit_card',
                         'active_member',
                         'estimated_salary'
                        ]

train_path = "s3://sagemaker-us-east-1-484401254725/scikit-churn-prediction/datasets/train.csv"

input_data = pl.read_csv(train_path)

Simulate Pipeline and save model 

In [164]:
# Define numeric features and transformer
numeric_features = (
        input_data.select([pl.col(pl.Int64), pl.col(pl.Float64)])
        .select(pl.all().exclude("churn"))
        .columns
)
    
numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", RobustScaler()),
        ]
)
    
    # Define categorical features and transformer
categorical_features = (
        input_data.select(pl.col(pl.Utf8))
        .select(pl.all().exclude("churn"))
        .columns
)

categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
        ]
)

    # Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
)
    
df_pandas = input_data.select(pl.all().exclude("churn")).to_pandas()
preprocessor.fit(df_pandas)

joblib.dump(preprocessor, os.path.join('.', "model.joblib"))
        

['./model.joblib']

In [165]:
def input_fn(input_data, content_type="text/csv"):
    """Parse input data payload
    """    
    if content_type == 'text/csv':
        df = pl.read_csv(input_data)
        
        # is input_data  what type?
        if len(df.columns) == len(feature_columns_names) + 1:
            # This is a labelled example, includes the ring label
            df.columns = feature_columns_names + ["churn"]
        elif len(df.columns) == len(feature_columns_names):
            # This is an unlabelled example.
            df.columns = feature_columns_names
        return df.to_pandas()
    else:
        raise ValueError("{} not supported by script!".format(content_type))

Simulate labelled data

In [166]:
input_data = input_fn(train_path, content_type="text/csv")
print("Input shape: ", input_data.shape)
input_data.head()

Input shape:  (7200, 11)


Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,538,Spain,Male,68,9,0.0,2,1,0,110440.5,1
1,809,France,Female,39,5,0.0,1,1,0,77705.75,0
2,620,France,Male,32,7,0.0,2,1,1,34665.79,0
3,679,Spain,Male,33,4,96110.22,1,1,0,1173.23,0
4,575,Spain,Male,30,2,0.0,2,1,1,82222.86,0


In [170]:
def predict_fn(input_data, model):
        """ Preprocess input data and make predictions.
        Modify the predict_fn to use .transform() instead of .predict()"""
        
        features = model.transform(input_data)
        
        if "churn" in input_data:
            return np.insert(features, 0, input_data["churn"], axis=1)
        else:
            return features

In [171]:
def model_fn(model_dir='.'):
        """ Deserialize fitted model from model_dir."""
        preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
        return preprocessor

Simulate batch data transformation 

In [172]:
input_data = pl.read_csv("s3://sagemaker-us-east-1-484401254725/scikit-churn-prediction/datasets/validation.csv").select(pl.all().exclude("churn")).to_pandas()
print("Input shape: ", input_data.shape)
pl.DataFrame(predict_fn(input_data, model_fn(model_dir='.'))).head()

Input shape:  (800, 10)


column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.102421,-0.416667,-0.2,0.0,0.0,0.0,-0.169827,-0.633851,1.0,0.0,0.0,1.0,0.0
0.841713,-0.75,-0.2,0.0,-1.0,0.0,-0.048301,-0.800779,0.0,1.0,0.0,1.0,0.0
1.303538,-0.75,0.6,0.0,0.0,-1.0,0.296641,-0.209319,0.0,0.0,1.0,0.0,1.0
1.080074,-0.083333,-0.8,1.0,0.0,0.0,-0.758196,0.606703,1.0,0.0,0.0,0.0,1.0
-0.312849,-0.25,-0.2,1.0,0.0,-1.0,-0.758196,0.728628,0.0,0.0,1.0,0.0,1.0


In [17]:
path = 's3://sagemaker-us-east-1-484401254725/scikit-churn-prediction/datasets/test.csv'
path2 = 's3://sagemaker-us-east-1-484401254725/scikit-churn-prediction/sklear-pipeline/transform/test.csv.out'

In [176]:
columns_name = [ 'churn',
                 'credit_score',
                 'country.A',
                 'country.B',
                 'country.C',
                 'gender.A',
                 'gender.B',
                 'age',
                 'tenure',
                 'balance',
                 'products_number',
                 'credit_card',
                 'active_member',
                 'estimated_salary' ]

In [199]:
path = 's3://sagemaker-us-east-1-484401254725/scikit-churn-prediction/sklear-pipeline/transform/test.csv.out'
df = pl.read_csv(path, has_header=False)
df.columns = columns_name
df = df.drop("churn")
df.head(15)

credit_score,country.A,country.B,country.C,gender.A,gender.B,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.424581,-0.416667,-0.4,1.0,-1.0,-1.0,-0.001242,-0.594018,0.0,1.0,0.0,0.0,1.0
-0.223464,0.5,-0.8,1.0,0.0,0.0,-0.758196,0.477013,1.0,0.0,0.0,0.0,1.0
-0.387337,0.583333,-0.2,1.0,0.0,-1.0,-0.758196,-0.42226,0.0,0.0,1.0,1.0,0.0
-1.094972,1.833333,0.6,1.0,0.0,0.0,0.174422,0.725854,0.0,1.0,0.0,0.0,1.0
-0.692737,-0.833333,0.4,0.0,0.0,0.0,0.220163,0.152302,0.0,0.0,1.0,1.0,0.0
1.020484,0.0,0.6,1.0,0.0,0.0,-0.758196,0.508134,0.0,0.0,1.0,0.0,1.0
-1.594041,-0.416667,-0.4,0.0,0.0,-1.0,0.329004,-0.2469,0.0,0.0,1.0,1.0,0.0
-0.417132,-1.25,0.2,0.0,0.0,-1.0,0.036481,-0.299708,0.0,1.0,0.0,1.0,0.0
0.18622,0.25,-0.2,0.0,0.0,-1.0,0.132485,-0.851759,0.0,0.0,1.0,1.0,0.0
-1.407821,0.416667,-0.4,0.0,0.0,0.0,-0.087574,0.658525,0.0,1.0,0.0,1.0,0.0


### Read json in Models input_fn

In [None]:
import json

json_data = '{"instances": [{"features": [0.0, -0.22346368715083798, 0.5, -0.8, 1.0, 0.0, 0.0, -0.758196039886628, 0.4770132292001497, 1.0, 0.0, 0.0, 0.0, 1.0]}]}'
data = json.loads(json_data)
df = pl.DataFrame(data['instances'])
df = pl.DataFrame(df['features'].to_list(), orient="row")

print(df)


shape: (14, 1)
┌───────────┐
│ column_0  │
│ ---       │
│ f64       │
╞═══════════╡
│ 0.0       │
│ -0.223464 │
│ 0.5       │
│ -0.8      │
│ …         │
│ 0.0       │
│ 0.0       │
│ 0.0       │
│ 1.0       │
└───────────┘


In [198]:
sampInput = [[0.09178, 0.0, 4.05, 0.0, 0.51, 6.416, 84.1, 2.6463, 5.0, 296.0, 16.6, 395.5, 9.04]]
print(type(sampInput))

<class 'list'>


In [211]:
df.select(pl.col(pl.Float64)).drop("age").columns
df

credit_score,country.A,country.B,country.C,gender.A,gender.B,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.424581,-0.416667,-0.4,1.0,-1.0,-1.0,-0.001242,-0.594018,0.0,1.0,0.0,0.0,1.0
-0.223464,0.5,-0.8,1.0,0.0,0.0,-0.758196,0.477013,1.0,0.0,0.0,0.0,1.0
-0.387337,0.583333,-0.2,1.0,0.0,-1.0,-0.758196,-0.42226,0.0,0.0,1.0,1.0,0.0
-1.094972,1.833333,0.6,1.0,0.0,0.0,0.174422,0.725854,0.0,1.0,0.0,0.0,1.0
-0.692737,-0.833333,0.4,0.0,0.0,0.0,0.220163,0.152302,0.0,0.0,1.0,1.0,0.0
1.020484,0.0,0.6,1.0,0.0,0.0,-0.758196,0.508134,0.0,0.0,1.0,0.0,1.0
-1.594041,-0.416667,-0.4,0.0,0.0,-1.0,0.329004,-0.2469,0.0,0.0,1.0,1.0,0.0
-0.417132,-1.25,0.2,0.0,0.0,-1.0,0.036481,-0.299708,0.0,1.0,0.0,1.0,0.0
0.18622,0.25,-0.2,0.0,0.0,-1.0,0.132485,-0.851759,0.0,0.0,1.0,1.0,0.0
-1.407821,0.416667,-0.4,0.0,0.0,0.0,-0.087574,0.658525,0.0,1.0,0.0,1.0,0.0
