* [sklearn example](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb)
* [process churn data](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/xgboost_customer_churn/xgboost_customer_churn.ipynb)

In [6]:
# Print the notebook's working directory; the relative data paths used in later cells resolve against it.
!pwd

/root/sagemaker-course/notebooks


In [21]:
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

# Notebook-wide pandas display limits: no cap on cell text width, and rendered
# frames limited to 20 columns by 70 rows.
_display_options = {
    'display.max_colwidth': None,
    'display.max_columns': 20,
    'display.max_rows': 70,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)


In [1]:
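# Download the original data (uncomment the commands below to run them)
#!wget http://dataminingconsultant.com/DKD2e_data_sets.zip
#!apt-get install unzip
# NOTE: this extracts under ./data/, while the cells below read 'Data sets/churn.txt'
# relative to the notebook directory -- adjust one of the two paths if you re-download.
#!unzip -o DKD2e_data_sets.zip -d ./data/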

In [2]:
# Peek at the first five lines of the raw file: the header row plus four records.
!head -5 Data\ sets/churn.txt

State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
KS,128,415,382-4657,no,yes,25,265.100000,110,45.070000,197.400000,99,16.780000,244.700000,91,11.010000,10.000000,3,2.700000,1,False.
OH,107,415,371-7191,no,yes,26,161.600000,123,27.470000,195.500000,103,16.620000,254.400000,103,11.450000,13.700000,3,3.700000,1,False.
NJ,137,415,358-1921,no,no,0,243.400000,114,41.380000,121.200000,110,10.300000,162.600000,104,7.320000,12.200000,5,3.290000,0,False.
OH,84,408,375-9999,yes,no,0,299.400000,71,50.900000,61.900000,88,5.260000,196.900000,89,8.860000,6.600000,7,1.780000,2,False.


In [15]:
# Load the raw churn dataset; the path is relative to the notebook directory.
churn_path = 'Data sets/churn.txt'
df = pd.read_csv(churn_path)

# Transposed preview: one row per column name, one column per sample record.
df.head().T

Unnamed: 0,0,1,2,3,4
State,KS,OH,NJ,OH,OK
Account Length,128,107,137,84,75
Area Code,415,415,415,408,415
Phone,382-4657,371-7191,358-1921,375-9999,330-6626
Int'l Plan,no,no,no,yes,yes
VMail Plan,yes,yes,no,no,no
VMail Message,25,26,0,0,0
Day Mins,265.1,161.6,243.4,299.4,166.7
Day Calls,110,123,114,71,113
Day Charge,45.07,27.47,41.38,50.9,28.34


In [23]:
# IPython help lookup: show the signature and docstring of make_column_transformer.
make_column_transformer?

[0;31mSignature:[0m [0mmake_column_transformer[0m[0;34m([0m[0;34m*[0m[0mtransformers[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Construct a ColumnTransformer from the given transformers.

This is a shorthand for the ColumnTransformer constructor; it does not
require, and does not permit, naming the transformers. Instead, they will
be given names automatically based on their types. It also does not allow
weighting with ``transformer_weights``.

Read more in the :ref:`User Guide <make_column_transformer>`.

Parameters
----------
*transformers : tuples
    Tuples of the form (transformer, column(s)) specifying the
    transformer objects to be applied to subsets of the data.

    transformer : estimator or {'passthrough', 'drop'}
        Estimator must support :term:`fit` and :term:`transform`.
        Special-cased strings 'drop' and 'passthrough' are accepted as
        well, to indicate to drop the columns or to pass

In [36]:
# One-hot encode the three categorical columns as a dense (non-sparse) array.
onehot_columns = ["State", "Int'l Plan", "VMail Plan"]
preprocess = make_column_transformer(
    (OneHotEncoder(sparse=False), onehot_columns),
)
train_features = preprocess.fit_transform(df)

In [37]:
# List the generated output column names; each is prefixed with the transformer's
# auto-assigned name ('onehotencoder__').
# NOTE(review): newer scikit-learn releases replace get_feature_names with
# get_feature_names_out -- confirm against the installed version before upgrading.
preprocess.get_feature_names()

['onehotencoder__x0_AK',
 'onehotencoder__x0_AL',
 'onehotencoder__x0_AR',
 'onehotencoder__x0_AZ',
 'onehotencoder__x0_CA',
 'onehotencoder__x0_CO',
 'onehotencoder__x0_CT',
 'onehotencoder__x0_DC',
 'onehotencoder__x0_DE',
 'onehotencoder__x0_FL',
 'onehotencoder__x0_GA',
 'onehotencoder__x0_HI',
 'onehotencoder__x0_IA',
 'onehotencoder__x0_ID',
 'onehotencoder__x0_IL',
 'onehotencoder__x0_IN',
 'onehotencoder__x0_KS',
 'onehotencoder__x0_KY',
 'onehotencoder__x0_LA',
 'onehotencoder__x0_MA',
 'onehotencoder__x0_MD',
 'onehotencoder__x0_ME',
 'onehotencoder__x0_MI',
 'onehotencoder__x0_MN',
 'onehotencoder__x0_MO',
 'onehotencoder__x0_MS',
 'onehotencoder__x0_MT',
 'onehotencoder__x0_NC',
 'onehotencoder__x0_ND',
 'onehotencoder__x0_NE',
 'onehotencoder__x0_NH',
 'onehotencoder__x0_NJ',
 'onehotencoder__x0_NM',
 'onehotencoder__x0_NV',
 'onehotencoder__x0_NY',
 'onehotencoder__x0_OH',
 'onehotencoder__x0_OK',
 'onehotencoder__x0_OR',
 'onehotencoder__x0_PA',
 'onehotencoder__x0_RI',


In [105]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.impute import SimpleImputer


# Reload the raw data so this cell always starts from an unmodified frame.
df = pd.read_csv('Data sets/churn.txt')

# Binarize the 'Churn?' target and store it back over the original string values.
lb = LabelBinarizer()
label = lb.fit_transform(df['Churn?'])
df['Churn?'] = label.ravel()

# Column groups handed to the column transformer below.
target = ['Churn?']
categorical_cols = ["State", "Int'l Plan", "VMail Plan"]
numerical_cols = [
    'Account Length', 'VMail Message',
    'Day Mins', 'Day Calls',
    'Eve Mins', 'Eve Calls',
    'Night Mins', 'Night Calls',
    'Intl Mins', 'Intl Calls',
    'CustServ Calls',
]

# Numeric columns: fill missing values with the column median.
num_proc = make_pipeline(SimpleImputer(strategy='median'))

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode into a dense array, ignoring categories unseen during fit.
cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore', sparse=False),
)

preprocessor = make_column_transformer(
    (num_proc, numerical_cols),
    (cat_proc, categorical_cols),
)
data = preprocessor.fit_transform(df)

# Retrieve the encoded categorical column names from the fitted encoder.
# 'pipeline-2' is the name the column transformer auto-assigned to cat_proc.
fitted_encoder = preprocessor.named_transformers_['pipeline-2'].named_steps['onehotencoder']
encoded_categorical_cols = fitted_encoder.get_feature_names(categorical_cols).tolist()

# Rebuild a labelled frame: numeric columns first, then the one-hot columns,
# matching the order the transformers were passed in.
processed_df = pd.DataFrame(data, columns=numerical_cols + encoded_categorical_cols)
processed_df['Churn'] = df['Churn?']

processed_df.head().T

Unnamed: 0,0,1,2,3,4
Account Length,128.0,107.0,137.0,84.0,75.0
VMail Message,25.0,26.0,0.0,0.0,0.0
Day Mins,265.1,161.6,243.4,299.4,166.7
Day Calls,110.0,123.0,114.0,71.0,113.0
Eve Mins,197.4,195.5,121.2,61.9,148.3
Eve Calls,99.0,103.0,110.0,88.0,122.0
Night Mins,244.7,254.4,162.6,196.9,186.9
Night Calls,91.0,103.0,104.0,89.0,121.0
Intl Mins,10.0,13.7,12.2,6.6,10.1
Intl Calls,3.0,3.0,5.0,7.0,3.0


In [16]:
# Load the training dataset CSV that carries a header row and preview it transposed.
training_csv = '../data/churn/training-dataset-with-header.csv'
proc_df = pd.read_csv(training_csv)
proc_df.head().T

Unnamed: 0,0,1,2,3,4
Churn,0.0,0.0,1.0,0.0,0.0
Account Length,106.0,28.0,148.0,132.0,92.0
VMail Message,0.0,0.0,0.0,0.0,29.0
Day Mins,274.4,187.8,279.3,191.9,155.4
Day Calls,120.0,94.0,104.0,107.0,110.0
Eve Mins,198.6,248.6,201.6,206.9,188.5
Eve Calls,82.0,86.0,87.0,127.0,104.0
Night Mins,160.8,208.8,280.8,272.0,254.9
Night Calls,62.0,124.0,99.0,88.0,118.0
Intl Mins,6.0,10.6,7.9,12.6,8.0


In [None]:
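# To save this cell as a script, uncomment the %%writefile line below and make it
# the first line of the cell (cell magics are only recognized on a cell's first line).
#%%writefile preprocessing.py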


import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


# Columns kept from the raw census file; 'income' is the label column.
columns = [
    'age',
    'education',
    'major industry code',
    'class of worker',
    'num persons worked for employer',
    'capital gains',
    'capital losses',
    'dividends from stocks',
    'income',
]

# Raw 'income' strings, listed in the order they are mapped to 0 and 1.
class_labels = [
    ' - 50000.',
    ' 50000+.',
]

def print_shape(df):
    """Print the frame's shape and how many rows fall in each income class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'income' column of non-negative integer class ids,
        where 0 is the negative class and 1 the positive class.

    Raises
    ------
    ValueError
        If 'income' contains class ids above 1, since the per-class counts
        then no longer unpack into exactly two values.
    """
    # minlength=2 keeps the two-way unpack valid when a class is absent
    # (e.g. every row is negative); without it bincount returns a length-1
    # array and the unpack raises.
    negative_examples, positive_examples = np.bincount(df['income'], minlength=2)
    print('Data shape: {}, {} positive examples, {} negative examples'.format(df.shape, positive_examples, negative_examples))

if __name__=='__main__':
    # Script entry point: read the census CSV from the processing input
    # directory, clean it, split it, transform the features, and write the
    # feature/label CSVs to the train and test output directories.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', type=float, default=0.3)
    # parse_known_args returns unrecognized arguments separately; they are discarded here.
    args, _ = parser.parse_known_args()

    print('Received arguments {}'.format(args))

    input_data_path = os.path.join('/opt/ml/processing/input', 'census-income.csv')

    print('Reading input data from {}'.format(input_data_path))
    raw = pd.read_csv(input_data_path)
    # Keep only the selected columns, drop incomplete and duplicate rows, then
    # map the two raw income strings to 0 and 1.
    df = (
        pd.DataFrame(data=raw, columns=columns)
        .dropna()
        .drop_duplicates()
        .replace(class_labels, [0, 1])
    )

    # bincount index 0 holds the count of label 0 (negative), index 1 of label 1 (positive).
    class_counts = np.bincount(df['income'])
    negative_examples, positive_examples = class_counts
    print('Data after cleaning: {}, {} positive examples, {} negative examples'.format(df.shape, positive_examples, negative_examples))

    split_ratio = args.train_test_split_ratio
    print('Splitting data into train and test sets with ratio {}'.format(split_ratio))
    features = df.drop('income', axis=1)
    labels = df['income']
    # random_state=0 makes the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=split_ratio, random_state=0
    )

    # NOTE(review): these tuples are ordered (columns, transformer), while newer
    # scikit-learn releases document (transformer, columns) for
    # make_column_transformer -- confirm which order the scikit-learn version
    # in the processing container expects.
    binned_cols = ['age', 'num persons worked for employer']
    scaled_cols = ['capital gains', 'capital losses', 'dividends from stocks']
    onehot_cols = ['education', 'major industry code', 'class of worker']
    preprocess = make_column_transformer(
        (binned_cols, KBinsDiscretizer(encode='onehot-dense', n_bins=10)),
        (scaled_cols, StandardScaler()),
        (onehot_cols, OneHotEncoder(sparse=False))
    )
    print('Running preprocessing and feature engineering transformations')
    # Fit on the training split only; the test split reuses the fitted transformers.
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)

    print('Train data shape after preprocessing: {}'.format(train_features.shape))
    print('Test data shape after preprocessing: {}'.format(test_features.shape))

    train_dir = '/opt/ml/processing/train'
    test_dir = '/opt/ml/processing/test'
    train_features_output_path = os.path.join(train_dir, 'train_features.csv')
    train_labels_output_path = os.path.join(train_dir, 'train_labels.csv')
    test_features_output_path = os.path.join(test_dir, 'test_features.csv')
    test_labels_output_path = os.path.join(test_dir, 'test_labels.csv')

    # Written in this order: train features, test features, train labels, test labels.
    # All files are saved without header or index.
    outputs = [
        ('Saving training features to {}', pd.DataFrame(train_features), train_features_output_path),
        ('Saving test features to {}', pd.DataFrame(test_features), test_features_output_path),
        ('Saving training labels to {}', y_train, train_labels_output_path),
        ('Saving test labels to {}', y_test, test_labels_output_path),
    ]
    for message, frame, destination in outputs:
        print(message.format(destination))
        frame.to_csv(destination, header=False, index=False)