In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import numpy as np

## Missing values

In [8]:
df = pd.read_csv('data/processed/BankChurners_after-inspection.csv')
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [12]:
numerical_columns = df._get_numeric_data().columns.to_list()
categorical_columns = list(set(df.columns) - set(numerical_columns))
numerical_columns.remove('CLIENTNUM')
numerical_columns

['Customer_Age',
 'Dependent_count',
 'Months_on_book',
 'Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Avg_Open_To_Buy',
 'Total_Amt_Chng_Q4_Q1',
 'Total_Trans_Amt',
 'Total_Trans_Ct',
 'Total_Ct_Chng_Q4_Q1',
 'Avg_Utilization_Ratio']

In [13]:
[df.columns.get_loc(c) for c in categorical_columns if c in df]
[df.columns.get_loc(c) for c in numerical_columns if c in df]


[8, 7, 6, 5, 3, 1]

As we saw in the data inspection notebook, there were some 'unknown' values in categorical columns of the dataset.
We first changed the values to np.na and now we can use some missing values imputation methods. We will use decision tree to impute the missing values.


In [None]:
def impute_missing_dtree(_df, cat_indices, col_names):
    df = _df.copy()
    num_indices = get_num_atts(df, cat_indices)
    other_col_names = col_names

    for col_name in col_names:
        train_test = _df.copy()
        other_col_names = subtract_lists(other_col_names, [col_name])
        # other columns with missing values will be imputed with modes (categorical suited)
        for other_col_name in other_col_names:
            train_test[other_col_name] = train_test[other_col_name].fillna(train_test[other_col_name].mode()[0])
        # preprocessing
        id_col = train_test['id']
        train_test = minmax_num_atts(train_test, num_indices)
        marital_status = train_test[col_name]
        train_test = pd.get_dummies(drop_col(train_test, col_name))
        train_test[col_name] = marital_status
        train_test['id'] = id_col

        # select rows that dont contain na in given col to train dtree
        df_train = train_test[train_test[col_name].notna()]
        target = df_train[col_name]
        df_train = drop_col(df_train, col_name)
        df_train = drop_col(df_train, 'id')
        # encode target var
        le = preprocessing.LabelEncoder()
        target_enc = le.fit_transform(target)
        # train tree
        tree = DecisionTreeClassifier(min_samples_leaf=10, ccp_alpha=0.01)
        tree.fit(df_train, target_enc)

        # tree will predict value on rows with missing values
        x_test = train_test[train_test[col_name].isna()]
        test_id_col = x_test['id']
        x_test = drop_col(x_test, 'id')
        x_test = drop_col(x_test, col_name)

        y = tree.predict(x_test)
        predicted = le.inverse_transform(y)

        x_test[col_name] = predicted
        # update based on index
        df.update(x_test[col_name])

    return df

Card category: Blue < Silver < Gold < Platinum

Income category: Less than '$40K' < '$40K - $60K' < '$60K - $80K' < '$80K - $120K' < '$120K +'

Education level: 'Uneducated' < 'High School' < 'College' < 'Graduate' < 'Post-Graduate' < 'Doctorate'