# Feature Engineering TRAIN/TEST Datasets

## 1. TRAIN/TEST Datasets

In [1]:
%%time

### 0. Import Required Packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

### Import Custom Transformers
# Include 'competitive_area' as a feature
# (project-local module; referenced throughout the notebook via the FE alias)
import CLUSTER_FeatureEngineering as FE

# Use the Updated Attribute/Imputation Dictionaries!!!
# Both scripts are executed through the IPython %run magic.
# NOTE(review): presumably they define the attribute / imputation dictionaries
# consumed by the FE transformers -- their contents are not visible here; confirm.
%run 'data_new/attribute_dictionary.py'
%run 'data_new/imputation_dictionary.py'

# Remove DataConversionWarning
# NOTE(review): simplefilter('ignore') is called without a category, so it
# silences every warning raised after this point, not only DataConversionWarning.
import warnings
warnings.simplefilter('ignore')

CPU times: user 1.39 s, sys: 1.35 s, total: 2.74 s
Wall time: 18.5 s


### TRAIN

In [2]:
%%time

df_train_Fios    = pd.read_pickle('data_new/CLUSTER_df_train_Fios_PP.pkl')
df_train_FiosONT = pd.read_pickle('data_new/CLUSTER_df_train_FiosONT_PP.pkl')
df_train_PreFios = pd.read_pickle('data_new/CLUSTER_df_train_PreFios_PP.pkl')
df_train_Non     = pd.read_pickle('data_new/CLUSTER_df_train_Non_PP.pkl')
df_train_Uverse  = pd.read_pickle('data_new/CLUSTER_df_train_Uverse_PP.pkl')

df_train         = pd.concat([df_train_Fios, df_train_FiosONT, df_train_PreFios, 
                              df_train_Non, df_train_Uverse], axis=0)
df_train.sort_index(inplace=True)
del df_train_Fios, df_train_FiosONT, df_train_PreFios, df_train_Non, df_train_Uverse

print(df_train.shape)
print(df_train.index)

(1662348, 1023)
Index(['780100000203', '780100001104', '780100001202', '780100001501',
       '780100001901', '780100002401', '780100002602', '780100003005',
       '780100003703', '780100004301',
       ...
       '788365916901', '788365917501', '788365922901', '788365923301',
       '788365923501', '788365924201', '788365924501', '788365932001',
       '788365935101', '788365935401'],
      dtype='object', name='chc_id', length=1662348)
CPU times: user 2min 31s, sys: 1min 29s, total: 4min 1s
Wall time: 5min 59s


### TEST

In [3]:
%%time

df_test_Fios    = pd.read_pickle('data_new/CLUSTER_df_test_Fios_PP.pkl')
df_test_FiosONT = pd.read_pickle('data_new/CLUSTER_df_test_FiosONT_PP.pkl')
df_test_PreFios = pd.read_pickle('data_new/CLUSTER_df_test_PreFios_PP.pkl')
df_test_Non     = pd.read_pickle('data_new/CLUSTER_df_test_Non_PP.pkl')
df_test_Uverse  = pd.read_pickle('data_new/CLUSTER_df_test_Uverse_PP.pkl')

df_test         = pd.concat([df_test_Fios, df_test_FiosONT, df_test_PreFios, 
                              df_test_Non, df_test_Uverse], axis=0)
df_test.sort_index(inplace=True)
del df_test_Fios, df_test_FiosONT, df_test_PreFios, df_test_Non, df_test_Uverse

print(df_test.shape)
print(df_test.index)

(1656204, 1023)
Index(['780100000203', '780100001104', '780100001202', '780100001501',
       '780100001901', '780100002401', '780100002602', '780100003005',
       '780100003703', '780100004301',
       ...
       '788365979801', '788365980001', '788365980101', '788365981201',
       '788365981601', '788365982401', '788365983001', '788365983201',
       '788365985701', '788365985801'],
      dtype='object', name='chc_id', length=1656204)
CPU times: user 2min 34s, sys: 1min 12s, total: 3min 46s
Wall time: 5min 32s


### TRAIN/TEST

In [4]:
# TRAIN: separate the predictors from the 'status' target.
# drop() already returns a new DataFrame and df_train is deleted at the end of
# this cell, so the former trailing .copy() only duplicated the large feature
# frame in memory; it has been removed.
train_X = df_train.drop('status', axis=1)
train_y = df_train['status']

# TEST: same split.
test_X  = df_test.drop('status', axis=1)
test_y  = df_test['status']

# Sample Size
print('*'*50 + '\nTRAIN vs TEST Datasets\n' + '*'*50)
# print('Competitive Area: ', df_train.competitive_area.unique())
print('The Shape of TRAIN Data: ' + str(df_train.shape))
print('The Shape of TEST Data:  ' + str(df_test.shape))

## Churn Rate by Sample Type
# Relative frequency of status value 1 in each sample, rounded to 4 decimals.
print('\n' + '*'*50 + '\nOverall Churn Rate\n' + '*'*50)
print('TRAIN: ', df_train.status.value_counts(normalize=True)[1].round(4))
print('TEST:  ', df_test.status.value_counts(normalize=True)[1].round(4), '\n')

# The combined frames are no longer needed once X / y have been extracted.
del df_train, df_test

**************************************************
TRAIN vs TEST Datasets
**************************************************
The Shape of TRAIN Data: (1662348, 1023)
The Shape of TEST Data:  (1656204, 1023)

**************************************************
Overall Churn Rate
**************************************************
TRAIN:  0.0154
TEST:   0.0153 



In [5]:
%%time

# (1) Make a Pipeline and Instantiate
Pipe_NF = FE.FeatureMaker()


# (2) fit()
Pipe_NF.fit(train_X, train_y)


# (3) transform()
train_X_NF = Pipe_NF.transform(train_X)
test_X_NF  = Pipe_NF.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Transformation\n' + '*'*50)
print('TRAIN: Before Transformation:' + str(train_X.shape))
print('TRAIN: After Transformation: ' + str(train_X_NF.shape))
print('TEST:  After Transformation: ' + str(test_X_NF.shape))
print('\n' + '*'*50 + '\nNewly Created Features\n' + '*'*50 + '\n', Pipe_NF.features_new_)


# Create Datasets that Consist of Pre-processed and New Features.
train_X = train_X_NF.merge(train_X, how='inner', left_index=True, right_index=True)
test_X  = test_X_NF.merge(test_X, how='inner', left_index=True, right_index=True)

print(train_X.shape)
print(test_X.shape)

del train_X_NF, test_X_NF


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(1662348, 1022)
TRAIN: After Transformation: (1662348, 16)
TEST:  After Transformation: (1656204, 16)

**************************************************
Newly Created Features
**************************************************
 ['grp_tenure_3m', 'grp_tenure_1m', 'grp_tenure_6m', 'grp_payment_method', 'grp_payment_25dollar', 'grp_payment_10dollar', 'grp_payment_change_5dollar', 'grp_payment_change_10dollar', 'grp_payment_change_2pct', 'grp_payment_change_5pct', 'ratio_payment_income', 'grp_payment_income', 'grp_call_csc', 'grp_call_bill', 'grp_call_csr', 'grp_call_tsr']
(1662348, 1038)
(1656204, 1038)
CPU times: user 1min 55s, sys: 13 s, total: 2min 8s
Wall time: 2min 3s


## 2. Feature Engineering

### Create/Use a Meta Custom Transformer for Feature Engineering

In [6]:
%%time

# (1) Make a Pipeline in Parallel/Sequence and Instantiate 
# List of Features Used as Parameters
fe_1st           = ['grp_tenure_3m', 'grp_payment_method', \
                    'grp_payment_25dollar', 'grp_payment_change_10dollar', 'grp_payment_change_5pct', \
                    'grp_payment_income', 'grp_call_csc', 'grp_call_bill', \
                    'grp_call_csr', 'grp_call_tsr']
fe_2nd           = fe_1st + ['income_demos', 'ethnic', 'age_demos', 'archetype']
fe_group         = ['census', 'cleansed_city', 'cleansed_zipcode']

# Custom Transformers in Sequence for CATEGORICAL Features
CAT_Pipe          = Pipeline([
                    ('Interaction', FE.FeatureInteractionTransformer(features_1st=fe_1st, features_2nd=fe_2nd)),
                    ('RareCategory', FE.RareCategoryEncoder(category_min_pct=0.01, category_max_count=20)),
                    ('ORDINAL', FE.UniversalCategoryEncoder(encoding_method='ordinal', prefix='CAT'))
                    ])

NUM_Pipe          = Pipeline([
                    ('Selector', FE.FeatureSelector_NUM()),
                    ('Original', FE.PassTransformer(prefix='NUM'))
                    ])

FE_Pipe          =  FE.FeatureUnion_DF([('Pipe_CAT', CAT_Pipe), ('Pipe_NUM', NUM_Pipe)])

# (2) fit()
FE_Pipe.fit(train_X, train_y)


# (3) transform()
train_X_FE = FE_Pipe.transform(train_X)
test_X_FE  = FE_Pipe.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Feature Engineering (FE)\n' + '*'*50)
print('TRAIN: Before FE:' + str(train_X.shape))
print('TRAIN: After FE: ' + str(train_X_FE.shape))
print('TEST:  After FE: ' + str(test_X_FE.shape))

# **************************************************
# Before vs After Feature Engineering (FE)
# **************************************************
# TRAIN: Before FE:(1662348, 1038)
# TRAIN: After FE: (1662348, 1097)
# TEST:  After FE: (1656204, 1097)
# CPU times: user 1h 17min 40s, sys: 43min 21s, total: 2h 1min 2s
# Wall time: 2h 53s

'ordinal' encoding requires target y.

**************************************************
Before vs After Feature Engineering (FE)
**************************************************
TRAIN: Before FE:(1662348, 1038)
TRAIN: After FE: (1662348, 1098)
TEST:  After FE: (1656204, 1098)
CPU times: user 1h 18min 7s, sys: 48min 57s, total: 2h 7min 5s
Wall time: 2h 6min 58s


### Correlation Summary: TRAIN vs TEST

In [7]:
p_list = [.01, .05, .1, .2, .3, .4, .6, .7, .8, .9, .95, .99]


def _target_corr_summary(features, target, percentiles):
    """Describe the distribution of per-column correlations with the target.

    Each column of `features` is correlated with `target` via Series.corr;
    the resulting values are summarised with describe() at `percentiles`.
    """
    per_column_corr = features.apply(lambda col: col.corr(target))
    return per_column_corr.to_frame().describe(percentiles=percentiles)


corr_train_all = _target_corr_summary(train_X_FE, train_y, p_list)
corr_test_all  = _target_corr_summary(test_X_FE, test_y, p_list)

# Side-by-side summary table: one column per sample.
corr_all         = pd.concat([corr_train_all, corr_test_all], axis=1)
corr_all.columns = ['TRAIN_All', 'TEST_All']
print('\n' + '*'*70 + '\nCorrelation Summary: TRAIN vs TEST\n' + '*'*70)
corr_all


**********************************************************************
Correlation Summary: TRAIN vs TEST
**********************************************************************


Unnamed: 0,TRAIN_All,TEST_All
count,1058.0,1055.0
mean,0.006401,0.006813
std,0.018414,0.016706
min,-0.046872,-0.048314
1%,-0.038063,-0.032714
5%,-0.02715,-0.01766
10%,-0.012608,-0.007807
20%,-0.001767,-0.001634
30%,6e-05,-7.2e-05
40%,0.000903,0.001046


### Saving Transformed Features (Categorical and Numerical)

In [8]:
# Re-attach the target in front of the engineered features (inner join on the index).
df_train_FE = pd.merge(train_y.to_frame(), train_X_FE,
                       how='inner', left_index=True, right_index=True)
df_test_FE  = pd.merge(test_y.to_frame(), test_X_FE,
                       how='inner', left_index=True, right_index=True)

# Persist both engineered samples as pickles.
df_train_FE.to_pickle('data_new/CLUSTER_df_train_FE.pkl')
df_test_FE.to_pickle('data_new/CLUSTER_df_test_FE.pkl')

In [37]:
%%time
fe   = 'CAT_competitive_area'
tmp1 = df_train_FE.groupby(fe)['status'].mean()
tmp2 = df_test_FE.groupby(fe)['status'].mean()
print(pd.concat([tmp1, tmp2], axis=1))

map1 = FE_Pipe.transformer_fitted[0].named_steps['ORDINAL'].category_mapper_['competitive_area']
map2 = {i:j for j, i in map1.items()}
print(map1)
print(map2)

                        status    status
CAT_competitive_area                    
0                     0.008417  0.007479
1                     0.009252  0.008077
2                     0.016300  0.015230
3                     0.017105  0.017094
4                     0.031425  0.034022
{'Non-Competitive Area': 0, 'U-verse Competitive Area': 1, 'Pre-Fios Competitive Area': 2, 'Fios Competitive Area': 3, 'Fios ONT Competitive Area': 4}
{0: 'Non-Competitive Area', 1: 'U-verse Competitive Area', 2: 'Pre-Fios Competitive Area', 3: 'Fios Competitive Area', 4: 'Fios ONT Competitive Area'}
CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 54.9 ms


In [35]:
# https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict

# Mapping dictionaries held by the fitted ORDINAL step of the first (categorical) branch.
ordinal_encoder = FE_Pipe.transformer_fitted[0].named_steps['ORDINAL']
CLUSTER_category_mapper = ordinal_encoder.category_mapper_
# print(CLUSTER_category_mapper)

# Serialise the mapper with the highest pickle protocol available.
with open('data_new/CLUSTER_category_mapper.pkl', 'wb') as mapper_file:
    pickle.dump(CLUSTER_category_mapper, mapper_file, protocol=pickle.HIGHEST_PROTOCOL)

# Optional round-trip check (disabled): reload the pickle and compare.
# with open('data_new/CLUSTER_category_mapper.pkl', 'rb') as handle:
#     b = pickle.load(handle)

# print(CLUSTER_category_mapper == b)
# print(b)

True
{'GRP_CALL_BILL-age_demos': {'0 Call___65-74': 0, '0 Call___55-64': 1, '1 Calls___65-74': 2, '0 Call___45-54': 3, '1 Calls___55-64': 4, '2 Calls___55-64': 5, '1 Calls___45-54': 6, '1 Calls___75+': 7, '0 Call___75+': 8, 'all_other': 9, '0 Call___Unknown': 10, '0 Call___35-44': 11, '0 Call___18-24': 12, '1 Calls___35-44': 13, '1 Calls___Unknown': 14, '1 Calls___25-34': 15, '0 Call___25-34': 16}, 'GRP_CALL_BILL-archetype': {'0 Call___Indulgent Empty Nesters': 0, '1 Calls___Indulgent Empty Nesters': 1, '2 Calls___Indulgent Empty Nesters': 2, '0 Call___Busy Families Achievers': 3, '1 Calls___Busy Families Achievers': 4, '0 Call___Empty Nesters': 5, '1 Calls___Empty Nesters': 6, '0 Call___Movers and Shakers': 7, '2 Calls___Empty Nesters': 8, 'all_other': 9, '0 Call___Busy Families Dreamers': 10, '0 Call___Tech Sensible': 11, '0 Call___Industrious Entertain-Mes': 12, '0 Call___Tech Savvy Singles': 13, '1 Calls___Industrious Entertain-Mes': 14, '1 Calls___Tech Savvy Singles': 15}, 'GRP_CA

In [10]:
# Sorted column names containing the 'NUM' / 'CAT' tag applied during feature engineering.
features_NUM = sorted(col for col in df_train_FE.columns if 'NUM' in col)
features_CAT = sorted(col for col in df_train_FE.columns if 'CAT' in col)

# Number of features of each kind.
print(len(features_NUM))
# print(features_NUM)

print(len(features_CAT))
# print(features_CAT)

# Peek at the first columns, then summarise the per-column missing-value counts.
print(df_train_FE.columns[:10])
df_train_FE.isnull().sum().describe()

621
477
Index(['status', 'CAT_GRP_CALL_BILL-age_demos', 'CAT_GRP_CALL_BILL-archetype',
       'CAT_GRP_CALL_BILL-ethnic', 'CAT_GRP_CALL_BILL-grp_call_csc',
       'CAT_GRP_CALL_BILL-grp_call_csr', 'CAT_GRP_CALL_BILL-grp_call_tsr',
       'CAT_GRP_CALL_BILL-grp_payment_25dollar',
       'CAT_GRP_CALL_BILL-grp_payment_change_10dollar',
       'CAT_GRP_CALL_BILL-grp_payment_change_5pct'],
      dtype='object')


count    1099.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
dtype: float64