# Feature Selection for Model Classification

## 1. TRAIN/TEST Datasets

### Required Packages

In [1]:
%%time

### 0. Import Required Packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

### Import Customer Transformers
# Include 'competitive_area' as a feature
import CLUSTER_FeatureEngineering as FE

# # Use the Updated Attribute/Imputation Dictionaries!!!
# %run 'data_new/attribute_dictionary.py'
# %run 'data_new/imputation_dictionary.py'

# from sklearn.model_selection import train_test_split
# from sklearn.tree import DecisionTreeClassifier
# # from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score
# from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_selection import chi2, f_classif, mutual_info_classif,\
#                                       SelectKBest, SelectPercentile, SelectFromModel
# #                                     f_regression, mutual_info_regression, RFE


# Suppress ALL warnings (added to silence DataConversionWarning, but 'ignore' hides every warning category)
import warnings
warnings.simplefilter('ignore')

CPU times: user 1.69 s, sys: 1.84 s, total: 3.53 s
Wall time: 20.2 s


### Feature Engineered TRAIN/TEST

In [2]:
%%time
# Load the category mapper saved by the feature-engineering step.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('data_new/CLUSTER_category_mapper.pkl', 'rb') as handle:
    CLUSTER_category_mapper = pickle.load(handle)

# Feature-engineered TRAIN / TEST frames (target column: 'status')
df_train = pd.read_pickle('data_new/CLUSTER_df_train_FE.pkl')
df_test  = pd.read_pickle('data_new/CLUSTER_df_test_FE.pkl')

# Split each frame into the feature matrix (X) and the target vector (y)
train_X, train_y = df_train.drop(columns='status').copy(), df_train['status']
test_X,  test_y  = df_test.drop(columns='status').copy(),  df_test['status']

# Sample Size
banner = '*' * 50
print(f'{banner}\nTRAIN vs TEST Datasets\n{banner}')
print(f'The Shape of TRAIN Data: {df_train.shape}')
print(f'The Shape of TEST Data:  {df_test.shape}')

## Churn Rate by Sample Type: share of rows whose 'status' equals 1
print(f'\n{banner}\nOverall Churn Rate\n{banner}')
print('TRAIN: ', df_train['status'].value_counts(normalize=True)[1].round(4))
print('TEST:  ', df_test['status'].value_counts(normalize=True)[1].round(4), '\n')

# The combined frames are no longer needed once X / y are split out
del df_train, df_test

**************************************************
TRAIN vs TEST Datasets
**************************************************
The Shape of TRAIN Data: (1662348, 1099)
The Shape of TEST Data:  (1656204, 1099)

**************************************************
Overall Churn Rate
**************************************************
TRAIN:  0.0154
TEST:   0.0153 

CPU times: user 29.6 s, sys: 1min 45s, total: 2min 15s
Wall time: 6min 17s


## 2. Feature Selection

### Tree-Based Feature Selection

In [3]:
%%time

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Rank every feature on its own: fit a default decision tree on that single
# TRAIN column, then score its predicted probability of class 1 on TEST by ROC AUC.
# NOTE(review): the ranking is driven by TEST performance, so TEST is not a
# clean hold-out for any model built from this selection - confirm this is intended.
features_ALL = sorted(train_X.columns.tolist())

roc_auc = []
for fe in features_ALL:
    clf    = DecisionTreeClassifier().fit(train_X[[fe]], train_y)
    pred_y = clf.predict_proba(test_X[[fe]])
    roc_auc.append(roc_auc_score(test_y, pred_y[:, 1]))

# One row per feature, best ROC AUC first; ranking_tree is the 1-based position
result_tree = (
    pd.DataFrame({'feature': features_ALL, 'roc_auc': roc_auc})
      .sort_values('roc_auc', ascending=False)
)
result_tree['ranking_tree'] = np.arange(1, len(result_tree) + 1)

# CPU times: user 34min 54s, sys: 1min 3s, total: 35min 57s
# Wall time: 35min 59s

CPU times: user 32min 41s, sys: 52.1 s, total: 33min 33s
Wall time: 33min 35s


### Univariate Feature Selection

In [4]:
%%time

import sklearn.feature_selection as FS

# Candidate feature lists. All three are derived from TRAIN columns so that
# every list is guaranteed to index train_X below (features_CAT was previously
# built from test_X.columns, which only works if both frames share columns).
features_ALL = sorted(train_X.columns.tolist())
features_NUM = sorted(fe for fe in train_X.columns if 'NUM' in fe)
features_CAT = sorted(fe for fe in train_X.columns if 'CAT' in fe)

### ANOVA test for ALL Features
# f_classif returns one F-statistic and one p-value per column, in column order
anova, pval  = FS.f_classif(train_X[features_ALL], train_y)
result_anova = pd.DataFrame({'feature': features_ALL, 'anova': anova, 'pval_anova': pval})
result_anova = result_anova.sort_values('anova', ascending=False)
result_anova['ranking_anova'] = np.arange(1, result_anova.shape[0]+1, 1)


### chi-2 test for CAT Features
# NOTE: the local name `chi2` holds the statistic array; the sklearn function
# is only reachable as FS.chi2 in this notebook.
chi2, pval   = FS.chi2(train_X[features_CAT], train_y)
result_chi2  = pd.DataFrame({'feature': features_CAT, 'chi2': chi2, 'pval_chi2': pval})
result_chi2  = result_chi2.sort_values('chi2', ascending=False)
result_chi2['ranking_chi2'] = np.arange(1, result_chi2.shape[0]+1, 1)


### Correlation for All Features
# Pearson correlation of each feature column with the target, on TRAIN and TEST
corr_train   = train_X[features_ALL].corrwith(train_y)
corr_test    = test_X[features_ALL].corrwith(test_y)
result_corr  = (
    pd.DataFrame({'corr_train': corr_train, 'corr_test': corr_test})
      .reset_index()
      .rename(columns={'index': 'feature'})
)
# Rank by absolute TRAIN correlation; the helper column is dropped again
# without mutating the frame in place.
result_corr  = (
    result_corr
      .assign(corr_train_abs=result_corr['corr_train'].abs())
      .sort_values('corr_train_abs', ascending=False)
      .drop(columns='corr_train_abs')
)
result_corr['ranking_corr'] = np.arange(1, result_corr.shape[0]+1, 1)

CPU times: user 2min 4s, sys: 1min 13s, total: 3min 18s
Wall time: 3min 18s


In [9]:
# Inspect the correlation ranking (rows are ordered by |corr_train|, strongest first)
result_corr

Unnamed: 0,feature,corr_train,corr_test,ranking_corr
195,CAT_grp_tenure_1m,0.059781,0.059305,1
196,CAT_grp_tenure_3m,0.059031,0.060119,2
197,CAT_grp_tenure_6m,0.058728,0.060893,3
53,CAT_GRP_PAYMENT_25DOLLAR-grp_tenure_3m,0.053701,0.054279,4
11,CAT_GRP_CALL_BILL-grp_tenure_3m,0.053670,0.054585,5
44,CAT_GRP_CALL_TSR-grp_tenure_3m,0.053311,0.053101,6
374,CAT_restarttengrp,0.052166,0.053040,7
61,CAT_GRP_PAYMENT_CHANGE_10DOLLAR-grp_tenure_3m,0.051986,0.053055,8
81,CAT_GRP_TENURE_3M-age_demos,0.051741,0.050464,9
82,CAT_GRP_TENURE_3M-archetype,0.051451,0.052368,10


### Univariate Feature Selection: Mutual Information
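***Note: Computing mutual information took about 7 hours of wall time, so the computation cell below is commented out and the cached result is loaded from `data_new/CLUSTER_result_MI.pkl` instead.***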

In [5]:
# %%time

# # Use Features with abs(corr) > 0.01
# # 477 out of 1098 Features
# flag_corr          = (result_corr.corr_train.abs() > 0.01)
# features_flag_corr = result_corr[flag_corr].feature.tolist()
# features_flag_corr.sort()

# # ### Mutual Information for the correlation-filtered features (abs(corr_train) > 0.01)
# MI                 = FS.mutual_info_classif(train_X[features_flag_corr], train_y)
# result_MI          = pd.DataFrame({'feature': features_flag_corr, 'mutual_info': MI})
# result_MI          = result_MI.sort_values('mutual_info', ascending=False)
# result_MI['ranking_MI'] = np.arange(1, result_MI.shape[0]+1, 1)

# result_MI.to_pickle('data_new/CLUSTER_result_MI.pkl')

# # CPU times: user 7h 55s, sys: 2min 30s, total: 7h 3min 26s
# # Wall time: 7h 3min 4s

In [10]:
# Load the cached mutual-information ranking rather than recomputing it
# (the commented-out cell that writes this pickle records a ~7 h wall time).
result_MI = pd.read_pickle('data_new/CLUSTER_result_MI.pkl')
result_MI
# result_MI.to_csv('data_new/CLUSTER_result_MI.csv')

### Saving Feature Selection Results

In [56]:
# Append Feature Description from Churn Data Dictionary
df_dictionary = pd.read_csv('data_new/Master Customer Attribute Dictionary 03 19 2019.csv',
                            usecols=['Attribute', 'Description', 'Type'], encoding='unicode_escape')

# Build the dictionary join key by removing the leading 'NUM_' / 'CAT_' type prefix.
# Series.str.strip() cannot be used for this: its argument is a *set of characters*
# removed from both ends, so e.g. 'CAT_ACCT_TYPE' would be mangled to 'YPE'.
# An anchored regex removes exactly one prefix and nothing else.
result_tree['Attribute'] = result_tree['feature'].str.replace(r'^(?:NUM_|CAT_)', '', regex=True)

# Left-join every ranking onto the tree ranking (one row per feature, tree order kept),
# then attach the description from the dictionary.
result_all    = result_tree.merge(result_corr, how='left', on='feature').\
                            merge(result_MI, how='left', on='feature').\
                            merge(result_anova, how='left', on='feature').\
                            merge(result_chi2, how='left', on='feature').\
                            merge(df_dictionary, how='left', on='Attribute')
# The join key and the dictionary 'Type' column are not part of the saved result
result_all    = result_all.drop(columns=['Attribute', 'Type'])

# Save in pickle
result_all.to_pickle('data_new/CLUSTER_result_ALL.pkl')
result_all.head(n=200).to_pickle('data_new/CLUSTER_result_ALL_Top200.pkl')

# Save in csv
result_all.to_csv('data_new/CLUSTER_result_ALL.csv', index=False)
result_all.head(n=200).to_csv('data_new/CLUSTER_result_ALL_Top200.csv', index=False)

result_all

Unnamed: 0,feature,roc_auc,ranking_tree,corr_train,corr_test,ranking_corr,mutual_info,ranking_MI,anova,pval_anova,ranking_anova,chi2,pval_chi2,ranking_chi2,Description
0,NUM_restartten,0.633885,1,-0.046872,-0.048154,28,0.002546,365.0,3660.141903,0.000000e+00,28,,,,months from restart date/latest install
1,NUM_kom_tenure,0.633712,2,-0.046631,-0.048314,29,0.002477,367.0,3622.593591,0.000000e+00,29,,,,KOM tenure (calculated from KOM field Cust_ten...
2,CAT_grp_tenure_6m,0.631704,3,0.058728,0.060893,3,0.018248,280.0,5753.327759,0.000000e+00,3,35357.991057,0.000000,3.0,
3,CAT_competitive_area,0.630846,4,0.048821,0.056509,21,0.103054,55.0,3971.701591,0.000000e+00,21,4165.913453,0.000000,108.0,
4,CAT_grp_tenure_3m,0.630650,5,0.059031,0.060119,2,0.009491,310.0,5813.047720,0.000000e+00,2,70555.657341,0.000000,2.0,
5,CAT_grp_tenure_1m,0.629519,6,0.059781,0.059305,1,0.004564,343.0,5962.108869,0.000000e+00,1,215811.510305,0.000000,1.0,
6,CAT_GRP_CALL_BILL-grp_tenure_3m,0.622124,7,0.053670,0.054585,5,0.038551,201.0,4802.196635,0.000000e+00,5,26911.277402,0.000000,4.0,
7,CAT_GRP_CALL_TSR-grp_tenure_3m,0.621309,8,0.053311,0.053101,6,0.065959,112.0,4737.894560,0.000000e+00,6,12874.317488,0.000000,18.0,
8,CAT_GRP_PAYMENT_CHANGE_10DOLLAR-grp_tenure_3m,0.619866,9,0.051986,0.053055,8,0.079967,80.0,4504.664588,0.000000e+00,8,14298.272714,0.000000,16.0,
9,CAT_GRP_PAYMENT_25DOLLAR-grp_tenure_3m,0.619772,10,0.053701,0.054279,4,0.112671,41.0,4807.758040,0.000000e+00,4,8993.265276,0.000000,33.0,


In [7]:
%%time
# Mean of 'status' (churn rate) per level of one encoded categorical feature,
# TRAIN vs TEST, shown next to the original category label.
fe          = 'CAT_ethnic'
fe_original = fe.replace('CAT_', '')
map1        = CLUSTER_category_mapper[fe_original]
# Invert the mapper (key -> value becomes value -> key) so it aligns on the encoded level
map2        = pd.Series({i:j for j, i in map1.items()})

# Group the target directly by the single feature column. This gives the same
# per-level means as concatenating y with the full feature matrix first, but
# avoids copying every feature column just to aggregate one of them.
tmp1        = train_y.groupby(train_X[fe]).mean()
tmp2        = test_y.groupby(test_X[fe]).mean()
tmp3        = pd.DataFrame({fe_original: map2, 'TRAIN': tmp1, 'TEST': tmp2})
print(tmp3)

                            ethnic     TRAIN      TEST
CAT_ethnic                                            
0                 WESTERN EUROPEAN  0.011948  0.012104
1                    MEDITERRANEAN  0.012201  0.012603
2                     SCANDINAVIAN  0.012491  0.011770
3             MIDDLE EAST NON-ARAB  0.013673  0.013662
4                 EASTERN EUROPEAN  0.014408  0.013569
5                 AFRICAN AMERICAN  0.015745  0.015576
6           CARIBBEAN NON-HISPANIC  0.016373  0.017840
7                  SOUTHEAST ASIAN  0.018505  0.019351
8                        all_other  0.019874  0.020135
9                       EAST ASIAN  0.021664  0.019941
10                        HISPANIC  0.022803  0.021650
11                     SOUTH ASIAN  0.025634  0.027096
CPU times: user 14.1 s, sys: 25.6 s, total: 39.7 s
Wall time: 39.7 s


In [8]:
%%time
# Mean of 'status' (churn rate) per level of one encoded categorical feature,
# TRAIN vs TEST, shown next to the original category label.
fe          = 'CAT_competitive_area'
fe_original = fe.replace('CAT_', '')
map1        = CLUSTER_category_mapper[fe_original]
# Invert the mapper (key -> value becomes value -> key) so it aligns on the encoded level
map2        = pd.Series({i:j for j, i in map1.items()})

# Group the target directly by the single feature column. This gives the same
# per-level means as concatenating y with the full feature matrix first, but
# avoids copying every feature column just to aggregate one of them.
tmp1        = train_y.groupby(train_X[fe]).mean()
tmp2        = test_y.groupby(test_X[fe]).mean()
tmp3        = pd.DataFrame({fe_original: map2, 'TRAIN': tmp1, 'TEST': tmp2})
print(tmp3)

                               competitive_area     TRAIN      TEST
CAT_competitive_area                                               
0                          Non-Competitive Area  0.008417  0.007479
1                      U-verse Competitive Area  0.009252  0.008077
2                     Pre-Fios Competitive Area  0.016300  0.015230
3                         Fios Competitive Area  0.017105  0.017094
4                     Fios ONT Competitive Area  0.031425  0.034022
CPU times: user 14.3 s, sys: 25.6 s, total: 39.8 s
Wall time: 39.8 s


In [40]:
%%time
# Mean of 'status' (churn rate) per level of one encoded categorical feature,
# TRAIN vs TEST, plus each level's share of TRAIN rows, shown next to the
# original category label.
fe          = 'CAT_archetype'
fe_original = fe.replace('CAT_', '')
map1        = CLUSTER_category_mapper[fe_original]
# Invert the mapper (key -> value becomes value -> key) so it aligns on the encoded level
map2        = pd.Series({i:j for j, i in map1.items()})

# Group the target directly by the single feature column. This gives the same
# per-level means as concatenating y with the full feature matrix first, but
# avoids copying every feature column just to aggregate one of them.
tmp1        = train_y.groupby(train_X[fe]).mean()
tmp2        = test_y.groupby(test_X[fe]).mean()
# Fraction of TRAIN rows falling in each level
tmp3        = train_X[fe].value_counts(normalize=True)
tmp4        = pd.DataFrame({fe_original: map2, 'TRAIN': tmp1, 'TEST': tmp2, 'Seg_Pct': tmp3})
print(tmp4)

                   archetype     TRAIN      TEST   Seg_Pct
0                  all_other  0.010025  0.013678  0.004800
1    Indulgent Empty Nesters  0.010709  0.011297  0.220420
2    Busy Families Achievers  0.013569  0.014962  0.197512
3              Empty Nesters  0.016355  0.015695  0.309885
4         Movers and Shakers  0.016980  0.017437  0.041485
5     Busy Families Dreamers  0.019887  0.017937  0.039596
6  Industrious Entertain-Mes  0.019917  0.018596  0.097829
7         Tech Savvy Singles  0.020465  0.018698  0.066695
8              Tech Sensible  0.020938  0.019476  0.021778
CPU times: user 14.9 s, sys: 28.5 s, total: 43.4 s
Wall time: 43.4 s
