# Create TRAIN/TEST Datasets for Model Building

## 0. Load Required Packages

In [1]:
### Default Packages
import pandas as pd
import pandas_gbq as gbq
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline
import pickle
# import os
import string
# import matplotlib.cm as cm
# from sklearn.externals import joblib


### Pipeline Packages
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer


### Custom Transformers
import CLUSTER_FeatureEngineering as FE                        # Include 'competitive_area' as a feature
from Class_GeneralUtilitiesNonTF import GeneralUtilitiesNonTF  # For BQ-Storage-Python Connection


### Dictionaries for Data Type and Missing Value Imputation
from attribute_dictionary import attribute_dict
from imputation_dictionary import attribute_imputer_dict


### Remove DataConversionWarning
import warnings
warnings.simplefilter('ignore')

## 1. Import TRAIN/TEST Datasets from BigQuery

### Importing TRAIN/TEST Datasets

#### Key Information

In [2]:
# import datetime
# import psycopg2
# print('*'*25 + "\nProgram Start Time:")
# print(datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') + '\n'+'*'*25)

# Do Not Change the Below Information Unless Use Others!
bucket     = 'alticeusa-am'
project_id = 'alticeusa-am'
dataset    = 'poc'
auth_file  = 'alticeusa-am-b639e404289b.json'

#Instantiate the util obj that will be used to interact with BQ
util_obj   = GeneralUtilitiesNonTF(project_id  = project_id, 
                                   dataset     = dataset,
                                   bucket_name = bucket,
                                   json_path   = auth_file)

In [3]:
%%time
# TRAIN Dataset
sql_data = ''' SELECT * FROM poc.TK_SEG1_TRAIN_RAW; ''' 
df_train = util_obj.read_gbq(sql_query = sql_data)
print('\n', '*'*65, '\n', f"TRAIN data is SUCCESSFULLY imported! Number of records = {str(len(df_train))}." \
      + '\n', '*'*65, '\n')
               
# TEST Dataset
sql_data = ''' SELECT * FROM poc.TK_SEG1_TEST_RAW; ''' 
df_test  = util_obj.read_gbq(sql_query = sql_data)
print('\n', '*'*65, '\n', f"TEST data is SUCCESSFULLY imported! Number of records = {str(len(df_test))}." \
      + '\n', '*'*65, '\n')

# CPU times: user 35.7 s, sys: 3.8 s, total: 39.5 s
# Wall time: 4min

Temp table to be created: temp_gcs_05_24_19_14_49_43
**********
Query executed: 
 SELECT * FROM poc.TK_SEG1_TRAIN_RAW; 
Temp table temp_gcs_05_24_19_14_49_43 created
**********
Info query: SELECT table_name,column_name,data_type,is_nullable FROM poc.INFORMATION_SCHEMA.COLUMNS WHERE table_name='temp_gcs_05_24_19_14_49_43'
Extraction to the tmp_folder_05_24_19_14_49_43 complete
Prefix: tmp_folder_05_24_19_14_49_43/tmp_file_05_24_19_14_49_43
Pulling file: tmp_folder_05_24_19_14_49_43/tmp_file_05_24_19_14_49_43-000000000000.csv.gzip
Pulling file: tmp_folder_05_24_19_14_49_43/tmp_file_05_24_19_14_49_43-000000000001.csv.gzip
Deleted the temp folder tmp_folder_05_24_19_14_49_43

 ***************************************************************** 
 TRAIN data is SUCCESSFULLY imported! Number of records = 99608.
 ***************************************************************** 

Temp table to be created: temp_gcs_05_24_19_14_51_58
**********
Query executed: 
 SELECT * FROM poc.TK_SEG1_TEST_RAW;

In [4]:
# Use 'chc_id' as index, and sort by index.
df_train.set_index('chc_id', inplace=True)
df_test.set_index('chc_id', inplace=True)

df_train = df_train.sort_index()
df_test  = df_test.sort_index()

# TRAIN
train_X  = df_train.drop('status', axis=1).copy()
train_y  = df_train['status']

# TEST
test_X   = df_test.drop('status', axis=1).copy()
test_y   = df_test['status']

# Sample Size
print('*'*50 + '\nTRAIN vs TEST Datasets\n' + '*'*50)
print('Competitive Area: ', df_train.competitive_area.unique())
print('The Shape of TRAIN Data: ' + str(df_train.shape))
print('The Shape of TEST Data:  ' + str(df_test.shape))

## Churn Rate by Sample Type
print('\n' + '*'*50 + '\nOverall Churn Rate\n' + '*'*50)
print('TRAIN: ', df_train.status.value_counts(normalize=True)[1].round(3))
print('TEST:  ', df_test.status.value_counts(normalize=True)[1].round(3), '\n')

# print(train_X.index)
# print(train_y.index)
# print(test_X.index)
# print(test_y.index)

**************************************************
TRAIN vs TEST Datasets
**************************************************
Competitive Area:  ['Fios Competitive Area' 'Fios ONT Competitive Area'
 'Pre-Fios Competitive Area']
The Shape of TRAIN Data: (99608, 1041)
The Shape of TEST Data:  (94993, 1041)

**************************************************
Overall Churn Rate
**************************************************
TRAIN:  0.038
TEST:   0.036 



## 2. Pre-Process TRAIN/TEST Datasets

### Pre-Processing Data

In [5]:
%%time

# (1) Make a Pipeline and Instantiate
Pipe_PP = Pipeline([
                    ('DataType', FE.Use_DefaultDataType(default_dtypes=attribute_dict)),
                    ('Missing', FE.Remove_MissingFeatures(missing_threshold=0.99)), 
                    ('Constant1', FE.Remove_ConstantFeatures(unique_threshold=1, missing_threshold=0.00)), 
                    ('Correlated1', FE.Remove_CorrelatedFeatures(correlation_threshold=0.99)), 
                    ('Duplicate', FE.Remove_DuplicateFeatures()),
                    ('Imputer', FE.Use_DefaultImputer(default_imputers=attribute_imputer_dict, default_dtypes=attribute_dict)),
                    ('Constant2', FE.Remove_ConstantFeatures(unique_threshold=1, missing_threshold=0.00)), 
                    ('Correlated2', FE.Remove_CorrelatedFeatures(correlation_threshold=0.90))
                  ])

# 'Constant2' is added to handle (1) unique value = 0 and (2) default imputation value = 0.
# 'Correlated2' is added to further remove correlated features after impuation.


# (2) fit()
Pipe_PP.fit(train_X, train_y)


# (3) transform()
train_X_PP = Pipe_PP.transform(train_X)
test_X_PP  = Pipe_PP.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Transformation\n' + '*'*50)
print('TRAIN: Before Transformation:' + str(train_X.shape))
print('TRAIN: After Transformation: ' + str(train_X_PP.shape))
print('TEST:  After Transformation: ' + str(test_X_PP.shape))

# 25 features with greater than 99.0% missing values
# 26 features with 1 or fewer unique value(s)
# 43 features with abs(correlation ) > 0.99 with other features
# 4 features with duplicate columns
# 17 features with 1 or fewer unique value(s)
# 124 features with abs(correlation ) > 0.9 with other features

# **************************************************
# Before vs After Transformation
# **************************************************
# TRAIN: Before Transformation:(99608, 1040)
# TRAIN: After Transformation: (99608, 787)
# TEST:  After Transformation: (94993, 787)
        
# CPU times: user 26min 18s, sys: 9min 10s, total: 35min 29s
# Wall time: 35min 29s

**************************************************
Pre-Processing: Use_DefaultDataType
**************************************************
- It will convert data types into default ones.

**************************************************
Pre-Processing: Remove_MissingFeatures
**************************************************
- It will remove features with a high missing pct.

**************************************************
Pre-Processing: Remove_ConstantFeatures
**************************************************
- It will remove features with 1 unique value(s).

**************************************************
Pre-Processing: Remove_CorrelatedFeatures
**************************************************
- It will work on Numerical Features Only, doing nothing on Categorical Features.
- It may take 10+ minutes. Be patient!

**************************************************
Pre-Processing: Remove_DuplicateFeatures
**************************************************
- It may take 10+ 

### Creating New Features

In [6]:
%%time

# (1) Make a Pipeline and Instantiate
Pipe_NF = Pipeline([
                    ('Imputer', FE.Use_DefaultImputer(default_imputers=attribute_imputer_dict, default_dtypes=attribute_dict)),
                    ('NewFeatures', FE.FeatureMaker())
                  ])

# 'Imputer' is added to handle missing values
# 'NewFeature' is added to create new features


# (2) fit()
Pipe_NF.fit(train_X, train_y)


# (3) transform()
train_X_NF = Pipe_NF.transform(train_X)
test_X_NF  = Pipe_NF.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Transformation\n' + '*'*50)
print('TRAIN: Before Transformation:' + str(train_X.shape))
print('TRAIN: After Transformation: ' + str(train_X_NF.shape))
print('TEST:  After Transformation: ' + str(test_X_NF.shape))
print('\n' + '*'*50 + '\nNewly Created Features\n' + '*'*50 + '\n', 
      Pipe_NF.named_steps['NewFeatures'].features_new_)

# CPU times: user 9min, sys: 6min 19s, total: 15min 20s
# Wall time: 15min 16s

**************************************************
Pre-Processing: Use_DefaultImputere
**************************************************
- It will append default imputation values to missings.


**************************************************
Before vs After Transformation
**************************************************
TRAIN: Before Transformation:(99608, 1040)
TRAIN: After Transformation: (99608, 16)
TEST:  After Transformation: (94993, 16)

**************************************************
Newly Created Features
**************************************************
 ['grp_tenure_3m', 'grp_tenure_1m', 'grp_tenure_6m', 'grp_payment_method', 'grp_payment_25dollar', 'grp_payment_10dollar', 'grp_payment_change_5dollar', 'grp_payment_change_10dollar', 'grp_payment_change_2pct', 'grp_payment_change_5pct', 'ratio_payment_income', 'grp_payment_income', 'grp_call_csc', 'grp_call_bill', 'grp_call_csr', 'grp_call_tsr']
CPU times: user 8min 55s, sys: 6min 48s, total: 15min 43s
Wall time: 15

### Combining Processed and New Features

In [7]:
# Create Datasets that Consist of Pre-processed and New Features.
df_train_NF_PP = train_y.to_frame().\
                 merge(train_X_NF, how='inner', left_index=True, right_index=True).\
                 merge(train_X_PP, how='inner', left_index=True, right_index=True)

df_test_NF_PP  = test_y.to_frame().\
                 merge(test_X_NF, how='inner', left_index=True, right_index=True).\
                 merge(test_X_PP, how='inner', left_index=True, right_index=True)

# # Save Data for Feature Engineering
# # Pre-processed data with new features
df_train_NF_PP.to_pickle('data_new/SEG1_train_PP.pkl')
df_test_NF_PP.to_pickle('data_new/SEG1_test_PP.pkl')

print(df_train_NF_PP.shape)
print(df_test_NF_PP.shape)

(99608, 804)
(94993, 804)


## 3. Feature Engineering

### FE for NUMERICAL Features

In [8]:
# Use Pre-Processed Data as new TRAIN/TEST Datasets
train_X = train_X_NF.merge(train_X_PP, how='inner', left_index=True, right_index=True)
test_X  = test_X_NF.merge(test_X_PP, how='inner', left_index=True, right_index=True)
print(train_X.shape)
print(train_X.shape)

(99608, 803)
(99608, 803)


In [9]:
%%time

# (1) Make a Pipeline in Parallel/Sequence and Instantiate 
# Custom Transformers in Parallel for NUMERICAL Features
Pipe_FU          =  FE.FeatureUnion_DF([
                    ('Original', FE.PassTransformer(prefix='Original')),
                    ('Standard', FE.StandardScaler_DF(prefix='Standard')),
                    ('Robust', FE.RobustScaler_DF(prefix='Robust', quantile_range=(5.0, 95.0))),
                    ('Quantile', FE.QuantileTransformer_DF(prefix='Quantile', n_quantiles=100, random_state=0)),
                    ('Binary', FE.Binarizer_DF(prefix='Binary', threshold=0)),
                    ('MinMax', FE.MinMaxScaler_DF(prefix='MinMax', feature_range=(0, 1))),
                    ('MaxAbs', FE.MaxAbsScaler_DF(prefix='MaxAbs')),
                    ('Norm', FE.Normalizer_DF(prefix='Norm', norm='l1')),
                    ('KBin', FE.KBinsDiscretizer_DF(prefix='KBin', n_bins=10, encode='ordinal')),
                    ('Log1p', FE.Log1pTransformer(prefix='Log1p')),
                    ('Sqrt', FE.SqrtTransformer(prefix='Sqrt')),
                    ('Reciprocal', FE.ReciprocalTransformer(prefix='Reciprocal'))
                    ])

# Custom Transformers in Sequence for NUMERICAL Features
NUM_Pipe          = Pipeline([
                    ('Selector', FE.FeatureSelector_NUM()),
                    ('FU_Pipe', Pipe_FU)
                    ])


# (2) fit()
NUM_Pipe.fit(train_X, train_y)


# (3) transform()
train_X_FE = NUM_Pipe.transform(train_X)
test_X_FE  = NUM_Pipe.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Feature Engineering (FE)\n' + '*'*50)
print('TRAIN: Before FE:' + str(train_X.shape))
print('TRAIN: After FE: ' + str(train_X_FE.shape))
print('TEST:  After FE: ' + str(test_X_FE.shape))


**************************************************
Before vs After Feature Engineering (FE)
**************************************************
TRAIN: Before FE:(99608, 803)
TRAIN: After FE: (99608, 4980)
TEST:  After FE: (94993, 4980)
CPU times: user 26.2 s, sys: 15.3 s, total: 41.5 s
Wall time: 41.5 s


In [10]:
p_list          = [.01, .05, .1, .2, .3, .4, .6, .7, .8, .9, .95, .99]
flag_NUM        = train_X.select_dtypes(exclude=[object, 'category']).columns.tolist()
corr_train      = train_X[flag_NUM].apply(lambda x: x.corr(train_y)).to_frame().describe(percentiles=p_list)
corr_train_all  = train_X_FE.apply(lambda x: x.corr(train_y)).to_frame().describe(percentiles=p_list)

corr_test       = test_X[flag_NUM].apply(lambda x: x.corr(test_y)).to_frame().describe(percentiles=p_list)
corr_test_all   = test_X_FE.apply(lambda x: x.corr(test_y)).to_frame().describe(percentiles=p_list)

corr_all         = pd.concat([corr_train, corr_train_all, corr_test, corr_test_all], axis=1)
corr_all.columns = ['TRAIN_Original', 'TRAIN_All', 'TEST_Original', 'TEST_All']
print('\n' + '*'*50 + '\nCorrelation Summary: TRAIN vs TEST\n' + '*'*50)
corr_all


**************************************************
Correlation Summary: TRAIN vs TEST
**************************************************


Unnamed: 0,TRAIN_Original,TRAIN_All,TEST_Original,TEST_All
count,415.0,4733.0,413.0,4716.0
mean,-0.00056,-0.000586,0.0014,0.00136
std,0.007395,0.007191,0.007841,0.007825
min,-0.032097,-0.034218,-0.029401,-0.035188
1%,-0.021667,-0.0214,-0.019741,-0.016083
5%,-0.011885,-0.011783,-0.007485,-0.008029
10%,-0.008424,-0.008049,-0.006103,-0.006029
20%,-0.004463,-0.004479,-0.003429,-0.003604
30%,-0.003241,-0.003061,-0.001777,-0.002216
40%,-0.001967,-0.001983,-0.000833,-0.000998


In [11]:
df_train_FE_NUM = train_y.to_frame().\
                 merge(train_X_FE, how='inner', left_index=True, right_index=True)

df_test_FE_NUM  = test_y.to_frame().\
                  merge(test_X_FE, how='inner', left_index=True, right_index=True)

df_train_FE_NUM.to_pickle('data_new/SEG1_train_FE_NUM.pkl')
df_test_FE_NUM.to_pickle('data_new/SEG1_test_FE_NUM.pkl')

### FE for CATEGORICAL Features

In [12]:
%%time

# (1) Make a Pipeline in Parallel/Sequence and Instantiate 
# List of Features Used as Parameters
fe_1st           = ['grp_tenure_3m', 'grp_payment_method', \
                    'grp_payment_25dollar', 'grp_payment_change_10dollar', 'grp_payment_change_5pct', \
                    'grp_payment_income', 'grp_call_csc', 'grp_call_bill', \
                    'grp_call_csr', 'grp_call_tsr']
fe_2nd           = fe_1st + ['income_demos', 'ethnic', 'age_demos', 'archetype']
fe_group         = ['census', 'cleansed_city', 'cleansed_zipcode']

# Custom Transformers in Parallel for CATEGORICAL Features
Pipe_FU          =  FE.FeatureUnion_DF([
                    ('OHE', FE.UniversalCategoryEncoder(encoding_method='ohe')),
                    ('PCT', FE.UniversalCategoryEncoder(encoding_method='pct', prefix='PCT')),
                    ('COUNT', FE.UniversalCategoryEncoder(encoding_method='count', prefix='COUNT')),
                    ('ORDINAL', FE.UniversalCategoryEncoder(encoding_method='ordinal', prefix='ORDINAL')),
                    ('Y_MEAN', FE.UniversalCategoryEncoder(encoding_method='y_mean', prefix='Y_MEAN')),
                    ('Y_LOG_RATIO', FE.UniversalCategoryEncoder(encoding_method='y_log_ratio', prefix='Y_LOG_RATIO')),
                    ('Y_RATIO', FE.UniversalCategoryEncoder(encoding_method='y_ratio', prefix='Y_RATIO')),
                    ('Aggregation', FE.FeatureAggregator(features_grouping=fe_group, correlation_threshold=0.01))
                    ])

# Custom Transformers in Sequence for CATEGORICAL Features
CAT_Pipe          = Pipeline([
                    ('Interaction', FE.FeatureInteractionTransformer(features_1st=fe_1st, features_2nd=fe_2nd)),
                    ('RareCategory', FE.RareCategoryEncoder(category_min_pct=0.01, category_max_count=30)),
                    ('FU_Pipe', Pipe_FU)
                    ])

# (2) fit()
CAT_Pipe.fit(train_X, train_y)


# (3) transform()
train_X_FE = CAT_Pipe.transform(train_X)
test_X_FE  = CAT_Pipe.transform(test_X)

# Feature Dimension
print('\n' + '*'*50 + '\nBefore vs After Feature Engineering (FE)\n' + '*'*50)
print('TRAIN: Before FE:' + str(train_X.shape))
print('TRAIN: After FE: ' + str(train_X_FE.shape))
print('TEST:  After FE: ' + str(test_X_FE.shape))

# **************************************************
# Before vs After Feature Engineering (FE)
# **************************************************
# TRAIN: Before FE:(99608, 803)
# TRAIN: After FE: (99608, 11396)
# TEST:  After FE: (94993, 11396)

# **********************************************************************
# Correlation Summary: TRAIN vs TEST
# **********************************************************************
# CPU times: user 38min, sys: 22min 56s, total: 1h 57s
# Wall time: 1h 57s

'ordinal' encoding requires target y.
'y_mean' encoding requires target y.
'y_log_ratio' encoding requires target y.
'y_ratio' encoding requires target y.
'FeatureAggregator' requires target y.

**************************************************
Before vs After Feature Engineering (FE)
**************************************************
TRAIN: Before FE:(99608, 803)
TRAIN: After FE: (99608, 11396)
TEST:  After FE: (94993, 11396)
CPU times: user 39min 23s, sys: 23min 30s, total: 1h 2min 54s
Wall time: 1h 2min 54s


In [13]:
p_list          = [.01, .05, .1, .2, .3, .4, .6, .7, .8, .9, .95, .99]
corr_train_all  = train_X_FE.apply(lambda x: x.corr(train_y)).to_frame().describe(percentiles=p_list)
corr_test_all   = test_X_FE.apply(lambda x: x.corr(test_y)).to_frame().describe(percentiles=p_list)

corr_all         = pd.concat([corr_train_all, corr_test_all], axis=1)
corr_all.columns = ['TRAIN_All', 'TEST_All']
print('\n' + '*'*70 + '\nCorrelation Summary: TRAIN vs TEST\n' + '*'*70)
corr_all


**********************************************************************
Correlation Summary: TRAIN vs TEST
**********************************************************************


Unnamed: 0,TRAIN_All,TEST_All
count,11309.0,11233.0
mean,0.00126,-0.000473
std,0.014213,0.016353
min,-0.028932,-0.051032
1%,-0.020498,-0.029553
5%,-0.017811,-0.024317
10%,-0.017302,-0.023719
20%,-0.012875,-0.015811
30%,-0.009599,-0.007325
40%,-0.002573,-0.003217


In [14]:
df_train_FE_CAT = train_y.to_frame().\
                  merge(train_X_FE, how='inner', left_index=True, right_index=True)

df_test_FE_CAT  = test_y.to_frame().\
                  merge(test_X_FE, how='inner', left_index=True, right_index=True)

# df_train_FE_CAT.to_pickle('data_new/SEG1_train_FE_CAT.pkl')
# df_test_FE_CAT.to_pickle('data_new/SEG1_test_FE_CAT.pkl')
train_X_FE.to_pickle('data_new/SEG1_train_FE_CAT.pkl')
test_X_FE.to_pickle('data_new/SEG1_test_FE_CAT.pkl')

In [21]:
train_X_FE.to_pickle('data_new/SEG1_train_FE_CAT.pkl')
test_X_FE.to_pickle('data_new/SEG1_test_FE_CAT.pkl')

In [20]:
test_X_FE.head()

Unnamed: 0_level_0,GRP_CALL_BILL-age_demos_0 Call___Unknown,GRP_CALL_BILL-age_demos_0 Call___25-34,GRP_CALL_BILL-age_demos_0 Call___45-54,GRP_CALL_BILL-age_demos_all_other,GRP_CALL_BILL-age_demos_0 Call___35-44,GRP_CALL_BILL-age_demos_0 Call___55-64,GRP_CALL_BILL-age_demos_1 Calls___Unknown,GRP_CALL_BILL-age_demos_1 Calls___25-34,GRP_CALL_BILL-age_demos_0 Call___65-74,GRP_CALL_BILL-age_demos_1 Calls___45-54,...,CLEANSED_ZIPCODE__*__videopromo_rng_m3__*__e: 14-15,CLEANSED_ZIPCODE__*__videopromo_rng_m3__*__f: 16-21,CLEANSED_ZIPCODE__*__videopromo_rng_m3__*__g: 22+,CLEANSED_ZIPCODE__*__videopromo_rng_m3__*__x: None,CLEANSED_ZIPCODE__*__videopromo_rng_m4__*__f: 16-21,CLEANSED_ZIPCODE__*__videopromo_rng_m4__*__x: None,CLEANSED_ZIPCODE__*__voluntary_discos__*__max,CLEANSED_ZIPCODE__*__voluntary_discos__*__mean,CLEANSED_ZIPCODE__*__voluntary_discos__*__std,CLEANSED_ZIPCODE__*__voluntary_discos__*__sum
chc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
780100001104,0,1,0,0,0,0,0,0,0,0,...,0.151111,0.2,0.057778,0.148889,0.275556,0.151111,3.0,0.428889,0.74652,193.0
780100002602,0,0,0,0,0,1,0,0,0,0,...,0.151111,0.2,0.057778,0.148889,0.275556,0.151111,3.0,0.428889,0.74652,193.0
780100011404,0,0,0,0,0,0,0,0,0,0,...,0.151111,0.2,0.057778,0.148889,0.275556,0.151111,3.0,0.428889,0.74652,193.0
780100017405,0,0,0,0,0,0,0,0,0,1,...,0.054422,0.22449,0.068027,0.142857,0.238095,0.163265,3.0,0.265306,0.57727,39.0
780100029602,0,0,1,0,0,0,0,0,0,0,...,0.151111,0.2,0.057778,0.148889,0.275556,0.151111,3.0,0.428889,0.74652,193.0


# 4. Remove Highly Correlated Features

In [15]:
# # Use Feature Engineered Data as new TRAIN/TEST Datasets
# train_X = df_train_FE_NUM.merge(train_X_FE, how='inner', left_index=True, right_index=True).drop('status', axis=1)
# test_X  = df_test_FE_NUM.merge(test_X_FE, how='inner', left_index=True, right_index=True).drop('status', axis=1)
# print(train_X.shape)
# print(train_X.shape)

In [16]:
# %%time

# # (1) Instantiate
# # default correlation_threshold=0.90    
# PP_Correlated = FE.Remove_CorrelatedFeatures(correlation_threshold=0.90)


# # (2) fit()
# # default: y=None
# # Note: data = train_X_Missing
# PP_Correlated.fit(train_X, train_y)

# # list
# print('\n' + '*'*50 + '\nFeatures Dropped Due to Multicollinearity\n' + '*'*50 + '\n', 
#       PP_Correlated.features_dropped_)

# # dataframe
# summary_dropped_Correlated     = PP_Correlated.summary_dropped_


# # (3) transform()
# train_X_Correlated = PP_Correlated.transform(train_X)
# test_X_Correlated  = PP_Correlated.transform(test_X)

# # Feature Dimension
# print('\n' + '*'*50 + '\nBefore vs After Transformation\n' + '*'*50)
# print('TRAIN: Before Transformation:' + str(train_X.shape))
# print('TRAIN: After Transformation: ' + str(train_X_Correlated.shape))
# print('TEST:  After Transformation: ' + str(test_X_Correlated.shape))

In [17]:
# summary_dropped_Correlated.head(n=20)