In [1]:
import numpy as np
from src.data.io_wrapper import read_raw, write_processed
from src.data.preprocessing import minmax_num_atts, impute_missing_dtree
from IPython.display import display, HTML
from src.common.utility import drop_col

def display_df(df):
    """Show ``df`` in the cell output as an HTML table.

    The frame is serialised with ``df.to_html()`` and the resulting markup is
    handed to IPython's ``HTML``/``display`` for rendering.
    """
    table_markup = df.to_html()
    display(HTML(table_markup))


In [2]:
# Load the bank-churners table through the project's read_raw helper.
df = read_raw('bank_churners.csv')

In [3]:
# List the column labels to see which fields the dataset provides.
df.columns

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

In [4]:
# The two Naive_Bayes_Classifier_* columns are irrelevant for this analysis,
# so both are removed before any further processing.
for nb_col in (
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
):
    df = drop_col(df, nb_col)

Replace the placeholder value 'Unknown' with NaN

In [5]:
# Treat the literal string 'Unknown' as a missing value everywhere in the frame.
df = df.replace(to_replace='Unknown', value=np.nan)

In [6]:
# Count missing values per column now that 'Unknown' has been mapped to NaN.
df.isna().sum()

CLIENTNUM                      0
Attrition_Flag                 0
Customer_Age                   0
Gender                         0
Dependent_count                0
Education_Level             1519
Marital_Status               749
Income_Category             1112
Card_Category                  0
Months_on_book                 0
Total_Relationship_Count       0
Months_Inactive_12_mon         0
Contacts_Count_12_mon          0
Credit_Limit                   0
Total_Revolving_Bal            0
Avg_Open_To_Buy                0
Total_Amt_Chng_Q4_Q1           0
Total_Trans_Amt                0
Total_Trans_Ct                 0
Total_Ct_Chng_Q4_Q1            0
Avg_Utilization_Ratio          0
dtype: int64

Ordinal encoding (see the metadata file):

Card category: Blue < Silver < Gold < Platinum

Income category: 'Less than $40K' < '$40K - $60K' < '$60K - $80K' < '$80K - $120K' < '$120K +'

Education level: 'Uneducated' < 'High School' < 'College' < 'Graduate' < 'Post-Graduate' < 'Doctorate'

In [7]:
# Replace the CLIENTNUM identifier with a 0..n-1 surrogate key called 'id'.
# The Naive_Bayes_Classifier_* columns were already dropped and 'Unknown' was
# already mapped to NaN in the earlier cells, so those steps are not repeated.
df['id'] = np.arange(df.shape[0])
df = drop_col(df, 'CLIENTNUM')

# Move 'id' to the first position; pop-then-insert keeps the cell re-runnable
# because the column is removed before it is inserted again.
df.insert(0, 'id', df.pop('id'))

In [8]:
# Positional indices of the categorical columns within df (where 'id' is
# column 0). They are looked up by name so the list stays correct if columns
# are added, removed or reordered; for the current layout this evaluates to
# [1, 3, 5, 6, 7, 8].
categorical_cols = [
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
cat_indices = [df.columns.get_loc(col) for col in categorical_cols]

In [9]:
# Fill in the three columns that contain NaN (see the missing-value counts).
# NOTE(review): the helper name suggests decision-tree based imputation and
# that cat_indices marks the categorical inputs — confirm in
# src.data.preprocessing.
imputed = impute_missing_dtree(
    df,
    cat_indices,
    ['Education_Level', 'Marital_Status', 'Income_Category'],
)

In [10]:
# Persist the imputed frame under the same file name via the write_processed helper.
write_processed('bank_churners.csv', imputed)


In [11]:
# Marital_Status distribution before imputation; value_counts() leaves out NaN
# by default, so the missing entries are not part of these totals.
df['Marital_Status'].value_counts()

Married     4687
Single      3943
Divorced     748
Name: Marital_Status, dtype: int64

In [12]:
# Marital_Status distribution after imputation, for comparison with the
# pre-imputation counts.
imputed['Marital_Status'].value_counts()

Married     4824
Single      4555
Divorced     748
Name: Marital_Status, dtype: int64