* [sklearn example](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb)
* [process churn data](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/xgboost_customer_churn/xgboost_customer_churn.ipynb)

In [6]:
# Print the current working directory: the relative '../data/...' path used
# when loading the CSV below is resolved against it.
!pwd

/root/sagemaker-course/notebooks


In [21]:
import pandas as pd

# Notebook-wide pandas display limits: never truncate cell text, and cap the
# number of columns/rows rendered for any displayed frame.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', 20),
    ('display.max_rows', 70),
):
    pd.set_option(option_name, option_value)

In [116]:
# Download the original data (one-time setup; uncomment the commands below to run them)
#!wget http://dataminingconsultant.com/DKD2e_data_sets.zip
#!apt-get install unzip
#!unzip -o DKD2e_data_sets.zip
#!mv Data\ sets/churn.txt ../data/churn/raw_churn.csv
#!rm -rf Data\ sets
#!rm DKD2e_data_sets.zip

In [114]:
# Load the raw churn dataset from the repository's data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/churn/raw_churn.csv')

# Preview the first five records, transposed so that each column of the
# frame is shown as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
State,KS,OH,NJ,OH,OK
Account Length,128,107,137,84,75
Area Code,415,415,415,408,415
Phone,382-4657,371-7191,358-1921,375-9999,330-6626
Int'l Plan,no,no,no,yes,yes
VMail Plan,yes,yes,no,no,no
VMail Message,25,26,0,0,0
Day Mins,265.1,161.6,243.4,299.4,166.7
Day Calls,110,123,114,71,113
Day Charge,45.07,27.47,41.38,50.9,28.34


In [111]:
%%writefile preprocessing.py

import argparse
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder


if __name__=='__main__':
    # Processing-job entry point. Steps:
    #   1. read raw_churn.csv from the processing input directory,
    #   2. encode the 'Churn?' target as integers,
    #   3. split into train/test sets,
    #   4. impute + one-hot encode the selected feature columns,
    #   5. write features and labels as header-less CSV files.
    #
    # Command-line arguments:
    #   --train-test-split-ratio  float, fraction of rows placed in the test
    #                             set (default 0.3).
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', type=float, default=0.3)
    # parse_known_args() ignores any arguments this script does not declare
    # instead of exiting with an error.
    args, _ = parser.parse_known_args()

    print('Received arguments {}'.format(args))

    input_data_path = os.path.join('/opt/ml/processing/input', 'raw_churn.csv')

    print('Reading input data from {}'.format(input_data_path))
    # read_csv already returns a DataFrame, so no further wrapping is needed.
    df = pd.read_csv(input_data_path)

    # Encode target
    # NOTE(review): assumes 'Churn?' holds exactly two distinct label values,
    # so the binarizer output is a single 0/1 column -- confirm against data.
    lb = LabelBinarizer()
    label = lb.fit_transform(df['Churn?'])
    df['Churn?'] = label.flatten()

    # minlength=2 keeps the two-way unpacking valid even when one of the two
    # classes has no rows (its count is then reported as 0).
    negative_examples, positive_examples = np.bincount(df['Churn?'], minlength=2)
    print('Data after cleaning: {}, {} positive examples, {} negative examples'.format(df.shape, positive_examples, negative_examples))

    split_ratio = args.train_test_split_ratio
    print('Splitting data into train and test sets with ratio {}'.format(split_ratio))
    # random_state is fixed so the split is reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop('Churn?', axis=1), df['Churn?'],
        test_size=split_ratio, random_state=0)

    # Only the columns listed here reach the output; every other column of
    # the raw file is dropped by the column transformer (its default
    # remainder policy).
    numerical_cols = ['Account Length', 'VMail Message', 'Day Mins', 'Day Calls', 'Eve Mins',
                      'Eve Calls', 'Night Mins', 'Night Calls', 'Intl Mins', 'Intl Calls',
                      'CustServ Calls']
    categorical_cols = ["State", "Int'l Plan", "VMail Plan"]

    # Numerical features: fill missing values with the column median.
    num_proc = make_pipeline(SimpleImputer(strategy='median'))
    # Categorical features: fill missing values with the literal 'missing',
    # then one-hot encode; categories unseen during fit encode as all zeros.
    # NOTE(review): `sparse=False` is the pre-1.2 scikit-learn spelling (later
    # renamed `sparse_output`) -- confirm the version in the processing image.
    cat_proc = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore', sparse=False))
    preprocessor = make_column_transformer((num_proc, numerical_cols),
                                           (cat_proc, categorical_cols))

    print('Running preprocessing and feature engineering transformations')
    # Fit on the training split only, then reuse the fitted transformer on the
    # test split so no test-set statistics leak into the preprocessing.
    train_features = preprocessor.fit_transform(X_train)
    test_features = preprocessor.transform(X_test)

    print('Train data shape after preprocessing: {}'.format(train_features.shape))
    print('Test data shape after preprocessing: {}'.format(test_features.shape))

    train_features_output_path = os.path.join('/opt/ml/processing/train', 'train_features.csv')
    train_labels_output_path = os.path.join('/opt/ml/processing/train', 'train_labels.csv')

    test_features_output_path = os.path.join('/opt/ml/processing/test', 'test_features.csv')
    test_labels_output_path = os.path.join('/opt/ml/processing/test', 'test_labels.csv')

    # All outputs are written without a header row or index column.
    print('Saving training features to {}'.format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

    print('Saving test features to {}'.format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

    print('Saving training labels to {}'.format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)

    print('Saving test labels to {}'.format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)

Writing preprocessing.py
