# FEATURE ENGINEERING OPERATIONS - {"CUSTOMER CHURN" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from scipy import stats as sts

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from pickle import dump, load

import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Pandas display options: these only affect how frames are rendered in cell outputs,
# not the underlying data.

pd.set_option('display.min_rows', 5)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 4)

In [3]:
# Seaborn theme applied globally to every plot drawn after this cell.

sb.set_theme(context='notebook', style='whitegrid', palette='pastel', font='times new roman', font_scale=1.25)

## 2. Feature Engineering for Train and Test Datasets

### 2.1 Prepare Target Feature

In [4]:
def prep_target(ytar):
    """Encode the target labels as integers.

    Parameters
    ----------
    ytar : pandas.Series
        Raw target labels.

    Returns
    -------
    pandas.Series
        A series with the same index where the exact label 'yes' becomes 1
        and every other value becomes 0.
    """
    def _label_to_flag(label):
        # Exact, case-sensitive match: anything that is not 'yes' counts as 0.
        if label == 'yes':
            return 1
        return 0

    encoded = ytar.apply(_label_to_flag)
    return encoded

### 2.2 Handle Outliers

In [5]:
def _iqr_outlier_masks(values, lb, ub):
    """Return boolean masks (below_lb, above_ub) for a numeric series."""
    return values < lb, values > ub


def outliers_detect_handle(df):
    """Detect and cap IQR outliers in every float and integer column of ``df``.

    For each selected column the bounds are ``Q1 - 1.5*IQR`` and
    ``Q3 + 1.5*IQR`` (rounded to 4 decimals). Values strictly below the lower
    bound are replaced by it, values strictly above the upper bound are
    replaced by the upper bound. The number of outliers before and after
    capping is printed per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified IN PLACE.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after capping.

    Notes
    -----
    * Bounds are computed from the frame that is passed in, so calling this on
      different frames yields different caps.
    * An empty frame is handled: the quartiles are NaN, nothing is capped and
      0 outliers are reported (the earlier implementation divided by the row
      count for an unused percentage and raised ZeroDivisionError).
    * NOTE(review): the bounds are floats; writing a fractional bound into an
      integer column relies on pandas upcasting that column — confirm this is
      acceptable for the pandas version in use.
    """
    # Float columns first, then integer columns (same processing order as before).
    fea_flo = df.select_dtypes(include='float').columns.values.tolist()
    fea_int = df.select_dtypes(include='int').columns.values.tolist()
    fea_num = fea_flo + fea_int

    for fea in fea_num:

        print(f'Outlier Detection for Feature : {fea} ------------------------------------------------------------------- \n')

        # 1st and 3rd quartile, then the 1.5*IQR fences.
        q1, q3 = df[fea].quantile([0.25, 0.75])
        iqr = q3 - q1
        lb = round(q1 - iqr*1.5, 4)
        ub = round(q3 + iqr*1.5, 4)

        # Detection before capping.
        filtl, filtu = _iqr_outlier_masks(df[fea], lb, ub)
        n_before = int((filtl | filtu).sum())
        print(f' --> Total No. of Outliers before : {n_before} \n')

        # Handling: cap (winsorise) the flagged values at the fences.
        df.loc[filtl, fea] = lb
        df.loc[filtu, fea] = ub

        # Detection after capping, as a sanity check.
        filtl, filtu = _iqr_outlier_masks(df[fea], lb, ub)
        n_after = int((filtl | filtu).sum())
        print(f' --> Total No. of Outliers after : {n_after} \n')

    return df

### 2.3 Feature Transformations

In [6]:
# Yeo-Johnson power transform (without standardisation) on selected columns.
# The integers are column POSITIONS in the frame fed to the pipeline; going by the
# train preview these look like the count-style and international-usage columns —
# TODO confirm against the actual column order.
# With remainder='passthrough' the transformed columns come first in the output and
# the untouched columns are appended after them, so later steps see a new order.
yj_cols = [1, 5, 7, 10, 13, 15, 16, 17, 18]
yj_step = ('yj', PowerTransformer(method='yeo-johnson', standardize=False), yj_cols)

num_ft_pt = ColumnTransformer(transformers=[yj_step], remainder='passthrough')

### 2.4 Scaling

In [7]:
# Standard-scale 15 columns. In pipe1 this step runs after the power-transform step,
# so the positions refer to that step's output, not to the original frame:
# positions 0-8 and 13-18 are scaled, positions 9-12 pass through untouched
# (presumably the four categorical columns — verify against the column order).
ss_cols = [*range(0, 9), *range(13, 19)]
ss_step = ('ss', StandardScaler(), ss_cols)

num_scl_ss = ColumnTransformer(transformers=[ss_step], remainder='passthrough')

### 2.5 Categorical Feature Encoding

In [8]:
# One-hot encode the columns at positions 15-18 of the scaler step's output (the
# passthrough columns of that step). drop='first' removes one level per feature.
# NOTE(review): no handle_unknown is set, so a category unseen during fit would make
# transform fail — confirm the test data cannot contain new categories.
ohe_cols = [15, 16, 17, 18]
ohe_encoder = OneHotEncoder(drop='first', sparse_output=False, dtype='int8')

cat_enc_ohe = ColumnTransformer(
    transformers=[('ohe', ohe_encoder, ohe_cols)],
    remainder='passthrough',
)

In [9]:
# Feature pipeline: power transform -> standard scaling -> one-hot encoding.
# The order matters because each ColumnTransformer addresses columns by position in
# the output of the step before it.
steps1 = [
    ('pt', num_ft_pt),
    ('ss', num_scl_ss),
    ('ohe', cat_enc_ohe),
]

# Fitted on the training features; reused (already fitted) for the test features.
pipe1 = Pipeline(steps=steps1)

### 2.6 Imbalance Dataset Handling

In [10]:
# SMOTE over-sampler for the class imbalance; fixed random_state for reproducibility.
sm = SMOTE(random_state=46)

### 2.7 Feature Selection Technique

In [11]:
# Mutual-information based selector. With k='all' every feature is kept, so this step
# currently only computes scores; lower k to actually drop features.
skb = SelectKBest(mutual_info_classif, k='all')

## 3. Training Data

### 3.1 Train Dataset

In [12]:
# Load the training split (presumably the output of an earlier pre-processing step,
# going by the '_pp' file name). Unpickling can execute arbitrary code, so only load
# pickle files from a trusted source.
tr = pd.read_pickle('cc_train_pp.pkl')


print(f'Shape of the train dataset : {tr.shape}')
tr.head(5)

Shape of the train dataset : (4150, 20)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
829,KS,110,area_code_415,yes,yes,27,267.9,103,45.54,263.3,74,22.38,178.1,106,8.01,8.3,2,2.24,1,yes
1425,PA,79,area_code_415,no,no,0,268.3,114,45.61,185.5,111,15.77,264.6,88,11.91,6.3,7,1.7,1,yes
144,MN,62,area_code_415,no,no,0,147.1,91,25.01,190.4,107,16.18,195.2,115,8.78,12.2,3,3.29,0,no
155,WA,51,area_code_408,no,no,0,232.4,109,39.51,187.4,95,15.93,231.2,107,10.4,9.1,3,2.46,1,no
1660,NY,60,area_code_510,no,yes,31,191.8,75,32.61,267.8,135,22.76,200.5,62,9.02,12.8,3,3.46,2,no


In [13]:
# Split into feature frame and target series ('churn').
Xtr = tr.drop(columns='churn')
ytr = tr['churn']

### 3.2 FE Steps

In [14]:
# Encode the target as 0/1. Not idempotent: re-running this cell on the already
# encoded series maps every value to 0, so re-run the split cell first.
ytr = prep_target(ytr)

In [15]:
# Cap IQR outliers in the numeric training features (the frame is modified in place
# and returned).
Xtr = outliers_detect_handle(Xtr)

Outlier Detection for Feature : total_day_minutes ------------------------------------------------------------------- 

 --> Total No. of Outliers before : 22 

 --> Total No. of Outliers after : 0 

Outlier Detection for Feature : total_day_charge ------------------------------------------------------------------- 

 --> Total No. of Outliers before : 22 

 --> Total No. of Outliers after : 0 

Outlier Detection for Feature : total_eve_minutes ------------------------------------------------------------------- 

 --> Total No. of Outliers before : 34 

 --> Total No. of Outliers after : 0 

Outlier Detection for Feature : total_eve_charge ------------------------------------------------------------------- 

 --> Total No. of Outliers before : 34 

 --> Total No. of Outliers after : 0 

Outlier Detection for Feature : total_night_minutes ------------------------------------------------------------------- 

 --> Total No. of Outliers before : 34 

 --> Total No. of Outliers after : 0 



In [16]:
# dump(Xtr, open('Xtr_prod.pkl','wb'))
# print('"Xtr" Features Saved Successfully')

In [17]:
# Persist the outlier-capped (but not yet transformed/encoded) training features.
Xtr.to_csv('Xtr_prod.csv', index=False)
print('"Xtr" Features Saved Successfully')

"Xtr" Features Saved Successfully


In [18]:
# Fit the feature pipeline on the training features and transform them.
# From here on Xtr is no longer the original DataFrame, so this cell cannot simply
# be re-run without re-running the cells above.
Xtr = pipe1.fit_transform(Xtr,ytr)

In [19]:
# Over-sample the minority class (training data only).
# NOTE(review): SMOTE runs after one-hot encoding here; since it synthesises rows by
# interpolation, the encoded columns of synthetic rows may not be strictly 0/1 —
# confirm this is acceptable (SMOTENC is the categorical-aware variant).
Xtr, ytr = sm.fit_resample(Xtr,ytr)

In [20]:
# Fit the selector on the resampled training data; the support mask shows which
# columns are kept (all of them while k='all').
Xtr = skb.fit_transform(Xtr, ytr)
print(skb.get_support())

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [21]:
# Sanity check: rows after over-sampling, columns after encoding/selection.
Xtr.shape

(7132, 69)

In [22]:
# Peek at the first engineered training row.
Xtr[:1]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  1.        ,  0.26828088,
         1.67844957,  0.15721164, -1.32572117,  0.31261845, -0.74529324,
        -1.13227713, -0.74779116, -0.31021018,  1.6291093 ,  1.62868227,
         1.26279779,  1.26264614, -0.45014995, -0.4

### 3.3 Save the train FE data as CSV and PKL files

In [23]:
# Wrap the ndarray in a DataFrame; columns get default integer labels.
Xtr = pd.DataFrame(Xtr)                # ndarray to df

In [24]:
# Column-wise concat aligns on the index, so this relies on Xtr and ytr sharing the
# same index — presumably both carry a default 0..n-1 index at this point; verify.
tr = pd.concat([Xtr,ytr], axis=1)   # concat feature df and target series

In [25]:
# Sanity check: one more column than Xtr (the target).
tr.shape

(7132, 70)

In [26]:
# Preview the engineered training set.
tr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,churn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.1323,-0.7478,-0.3102,1.6291,1.6287,1.2628,1.2626,-0.4501,-0.4522,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.1513,-1.4743,-0.3102,1.6365,1.6363,-0.2954,-0.2949,1.2846,1.2859,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5171,0.7093,-1.5084,-0.6158,-0.6155,-0.1973,-0.1983,-0.1072,-0.109,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5171,-0.447,-0.3102,0.9694,0.9695,-0.2573,-0.2572,0.6148,0.613,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5171,0.95,0.5687,0.2149,0.2153,1.3529,1.3522,-0.0009,-0.0021,0


In [27]:
# Persist the engineered training set in both formats.
tr.to_csv('cc_train_fe.csv', index=False)
tr.to_pickle('cc_train_fe.pkl')

print('Train dataset saved successfully in CSV and PKL files ...')

Train dataset saved successfully in CSV and PKL files ...


## 4. Test Dataset FE

### 4.1 Test Dataset

In [28]:
# Load the test split. Unpickling can execute arbitrary code, so only load pickle
# files from a trusted source.
te = pd.read_pickle('cc_test_pp.pkl')

print(f'Shape of the test dataset : {te.shape}')
te.head(5)

Shape of the test dataset : (100, 20)


Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1688,IN,86,area_code_510,no,no,0,216.3,96,36.77,266.3,77,22.64,214.0,110,9.63,4.5,3,1.22,0,no
3993,WA,136,area_code_415,no,yes,24,243.3,107,41.36,226.8,111,19.28,138.5,117,6.23,11.0,4,2.97,4,no
1056,TN,98,area_code_415,no,no,0,162.8,65,27.68,185.0,109,15.73,219.5,104,9.88,6.0,3,1.62,2,no
1120,NH,111,area_code_510,no,no,0,197.1,117,33.51,227.8,128,19.36,214.0,101,9.63,9.3,11,2.51,0,no
577,WI,165,area_code_510,no,no,0,154.2,91,26.21,268.6,108,22.83,188.8,99,8.5,10.9,4,2.94,6,no


In [29]:
# Split into feature frame and target series ('churn').
Xte = te.drop(columns='churn')
yte = te['churn']

### 4.2 FE Steps

In [30]:
# Encode the target as 0/1. Not idempotent: re-running on the already encoded series
# maps every value to 0, so re-run the split cell first.
yte = prep_target(yte)

In [31]:
# Inference pipeline: the already-fitted feature pipeline followed by the
# already-fitted selector. It is only used with .transform(), never re-fitted here.
# SMOTE is deliberately not part of it: resampling is applied to training data only.
steps2 = [('pp1',pipe1),
     ('sb',skb)
     ]

pipe2 = Pipeline(steps2)

# Use a context manager so the file handle is flushed and closed deterministically
# (the bare open(...) passed straight to dump() was never closed).
with open('cc_pipe2_te.pkl','wb') as pkl_file:
    dump(pipe2, pkl_file)
print('"pipe2" Features Saved Successfully')

"pipe2" Features Saved Successfully


In [32]:
# Apply the fitted transformations to the test features (transform only, no fitting).
# NOTE(review): unlike the training features, Xte is not passed through
# outliers_detect_handle before this step — confirm that is intended.
Xte = pipe2.transform(Xte)
Xte.shape

(100, 69)

In [33]:
# Peek at the first engineered test row (also reveals the array's dtype).
Xte[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -0.33544066845367265,
        -0.5960152035548312, -0.1992504481887499, -1.1731415239493146,
        0.5124556116533263, -2.110244004652026, -0.5170514360777964,
        -2.1041772097604228, -1.5083770773858174, 0.6701943201283861,
        0.670008756721054, 1.3228823852307443, 1.3239096295125783,
        0.2698340357874689, 0.2697835841663762]], dtype=object)

In [34]:
# Cast the transformed test array to float so it is numeric before saving.
Xte=Xte.astype('float')

### 4.3 Save the test FE data as CSV and PKL files

In [35]:
# Turn the target into a column vector so it can be concatenated to Xte.
# reshape(-1, 1) infers the row count instead of hard-coding the test-set size, and
# np.asarray keeps the cell safe to re-run (it accepts both a Series and an ndarray).
yte = np.asarray(yte).reshape(-1, 1)
yte[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [36]:
# Append the target as the last column. Because Xte is float, the combined array is
# float too, so the target is stored as 0.0/1.0 in the resulting table.
te = np.concatenate([Xte,yte], axis=1)   

In [37]:
te = pd.DataFrame(te)   # array to df
# The target was concatenated last, so rename the final column rather than relying on
# a hard-coded position (which breaks as soon as the number of features changes).
# rename() without inplace keeps the cell re-runnable.
te = te.rename(columns={te.columns[-1]: 'churn'})
te.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,churn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5171,-2.1042,-1.5084,0.6702,0.67,1.3229,1.3239,0.2698,0.2698,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0098,0.2596,1.9097,1.172,1.1718,0.5318,0.5322,-1.2443,-1.2455,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.5171,-1.5804,0.5687,-0.324,-0.3236,-0.3054,-0.3043,0.3801,0.3812,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.2634,-0.3783,-1.5084,0.3134,0.3136,0.5518,0.551,0.2698,0.2698,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0098,0.2177,2.9622,-0.4838,-0.4843,1.3689,1.3687,-0.2356,-0.2338,0.0


In [38]:
# Sanity check: feature columns plus the target column.
te.shape

(100, 70)

In [39]:
# Persist the engineered test set in both formats.
te.to_csv('cc_test_fe.csv', index=False)
te.to_pickle('cc_test_fe.pkl')

print('Test dataset saved successfully in CSV and PKL files ...')

Test dataset saved successfully in CSV and PKL files ...
