In [1]:
import polars as pl
import time
import random
from typing import List
from pandas.core.frame import DataFrame
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
def reduce_memory_usage(df: DataFrame) -> DataFrame:
    """Reduced memory usage by downcasting datatype of columns.
    Input: DataFrame
    Output: DataFrame"""

    for column in df:
        if df[column].dtype == "float64":
             df[column]=pd.to_numeric(df[column], downcast="float")
        if df[column].dtype == "int64":
            df[column]=pd.to_numeric(df[column], downcast="integer")
    print(df.info())
    return df

In [3]:
# Checking how many missing values
def plot_missing(df: DataFrame, df_name: str) -> None:
    """Plots bar graph with percentages of missing values in the dataframe."""

    nr_col_nan = (df.isna().sum(axis=0) > 0).sum()
    print('-'*100)
    print(f"Number of columns in {df_name}: {df.shape[1]} columns")
    print(f"Number of columns with missing values in {df_name}: {nr_col_nan} columns")
    print('-'*100)
    # Data to plot
    plot_data = (df.isnull().mean().reset_index().rename(columns={'index': 'column', 0: 'fraction'})
    .sort_values(by='fraction', ascending=False)[:nr_col_nan])
    # Plotting the Bar-Plot for NaN percentages 
    plt.figure(figsize = (20, 8), tight_layout = True)
    sns.barplot(x='column', y='fraction', data=plot_data )
    plt.xticks(rotation = 90)
    plt.title(f'Fraction of NaN values in {df_name}')

In [4]:
def duplicates_check(df: DataFrame, df_name: str) -> DataFrame:
    """Checks if there are any duplicated values in the dataframe and removes them.
    Input: df(DataFrame), df_name(name of the DataFrame
    Returns: df"""
    
    print('-'*100)
    print(f"{df_name} had {df.shape[0]- df.drop_duplicates().shape[0]} rows removed.")
    print('-'*100)
    return df.drop_duplicates()

In [5]:

def pie_plot(df: DataFrame, col_name: str, labels: List) -> None:
    """Plots a pie chart of distribution of chosen column."""
    
    # Data to plot
    plot_data = df[col_name].value_counts() / df.shape[0]
    
    # Pie plot
    plt.title(f"Pie chart of {col_name} variable")
    plt.pie(plot_data, labels=labels, labeldistance=1.15, wedgeprops= { 'linewidth' : 3, 'edgecolor' : 'white' }, autopct='%1.1f%%')

In [6]:
def get_object_columns(df: DataFrame, target_name: str) -> List:
    """Gets all features which categorical features of object type from the dataset.
    Parameters: df (DataFrame), target_name (name of the target column)
    Returns: list of names of categorical columns of object type"""
    
    # Make a list
    obj_col = df.select_dtypes(include='O').columns.to_list()
    # Check if target column in list
    if target_name in obj_col:
        obj_col.remove(target_name)
    # Return the list
    return obj_col

def get_cat_encoded_columns(df: DataFrame, target_name: str) -> List:
    """Gets all categorical features which are 0-1 encoded.
    Parameteres: df (DataFrame), target_name (name of the target column)
    Returns: list of names of columns of categorical encoded features"""
    
    # Initialize the list
    encoded_col = []
    # Check if columns values in 0 and 1
    for col in df.columns:
        if set([0 , 1]).issuperset(df[col].dropna()):
            encoded_col.append(col)
    # Checks if target column in list      
    if target_name in encoded_col:
        encoded_col.remove(target_name)
    #  Return the list of encoded columns     
    return encoded_col

def get_num_col(df: DataFrame, encoded_col: List, target_name: str) -> List:
    """Gets numerical continuous column names from the dataset.
    Parameters: df (DataFrame), encoded_col (list of categorical 
    values encoded to 0-1, target_name (name of target colum)
    Returns: list of numerical continuous values."""
    
    # Make a list of all numerical values
    num_col = df.select_dtypes(exclude='O').columns.to_list()
    # Remove encoded categorical columns
    num_col = [col for col in num_col if col not in encoded_col]
    # Checks if target column in list      
    if target_name in num_col:
        num_col.remove(target_name)
    # Return numerical continuous list
    return num_col

In [7]:
def corr_check_num_cat(df: DataFrame, target_name: str, numerical_col: List) -> List:
    """Calculates p_value for anova test to check if the numerical variables are correlated
    with target. If p_value is less then significance level 0.05 rejcts the null hypothesis 
    that variables are not correlated. 
    Paramteres: df (DataFrame), target_name (name of the target column as str), 
    numerical_col (list of numerical continuous columns).
    Returns a list of potentially correlated columns."""
    
    from scipy.stats import f_oneway
    
    # Go through all variables and append results
    p_values = []
    for col in numerical_col:
        category_group_list = df[[col, target_name]].dropna().groupby(col)[target_name].apply(list)
        anova = f_oneway(*category_group_list)
        p_values.append(anova[1])
    anova_df = pd.DataFrame({'column_name': numerical_col, 'p_value': p_values}).sort_values(by='p_value')
    # List of column names where we can reject the null hypothesis that the variables are not correlated with each other
    corr_col = anova_df[anova_df["p_value"] < 0.05]['column_name'].to_list()
    # Prints columns that are correlated with target
    print('-'*100)
    print(f'The columns that are correlated with target column: {corr_col}')
    print('-'*100)
    # Return the list of correlated columns
    return corr_col

In [8]:
def corr_check_cat_cat(df: DataFrame, target_name:str, cat_col: List) -> List:
    """Performs Chi_Sq test for two categorical variables and finds
    the probability of null hypothesis.
    H0: The two columns are NOT related to each other
    alpha: 0.05
    Parameters: df (DataFrame), target_name (name of target column as str),
    cat_col (list of all categorical columns)
    Returns: a list of potentially correlated columns.
    """
    from scipy.stats import chi2_contingency

    # Go through all variables and append results
    p_values = []
    for col in cat_col:
        cross_result = pd.crosstab(index=df[col], columns=df[target_name])
        chi_square = chi2_contingency(cross_result)
        p_values.append(chi_square[1])
    # DataFrame with p_values and names of columns
    chi_square_df = pd.DataFrame({'column_name': cat_col, 'p_value': p_values}).sort_values(by='p_value')
    # List of column names where we can reject the null hypothesis that the variables are not correlated with each other
    corr_col = chi_square_df[chi_square_df["p_value"] < 0.05]['column_name'].to_list()
    # Print out the names of columns correlated with target
    print('-'*100)
    print(f'The columns that are correlated with target column: {corr_col}')
    print('-'*100)
    # Return correlated columns
    return corr_col

In [55]:
def magnify() -> List:
    '''Function that magnifies clicked value in DataFrame.'''
    return [dict(selector="th",
                 props=[("font-size", "7pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])]

def high_corr_background(cell_value: any) -> str:
    '''Assigns background of the cell based on the value. 
    Highlights highly correlated columns with correlation 
    higher than 0.7 to red, lower than -0.7 to blue and 
    1 to green.'''
    
    # Set the colors
    highlight_positive = 'background-color: red'
    highlight_negative = 'background-color: blue'
    default = 'background-color: white'
    onces = 'background-color: green'
    
    # Conditions
    if type(cell_value) in [float, int]:
        if (cell_value>=0.7 and cell_value<1.0):
            return highlight_positive
        elif cell_value <=-0.7:
            return highlight_negative
        elif cell_value == 1.0:
            return onces
    return default

def pearson_corr_df(df: DataFrame) -> None:
    '''Calculates Pearson correlations between features.
    Displays styled dataframes for visual clarity of correlations.'''
    
    for n in range(1, df.shape[1]//10+1):
        corr = df.corr().iloc[:, (n-1)*10:n*10]
        display(corr.style.applymap(high_corr_background)\
        .format(precision=2)\
        .set_properties(**{'max-width': '60px', 'font-size': '8pt'})\
        .set_caption("Pearson correaltions of numerical columns")\
        .set_table_styles(magnify()))

In [10]:
def plot_cat_bars(df: DataFrame, target_name: str) -> None:
    """Plots bars for all categorical features in dataframe
    in relation to target variable."""
    
    # Calculate the number of subplots
    categorical_columns = df.select_dtypes(include='O').columns.to_list()
    
    # Calculate the number of rows
    if len(categorical_columns)%2 == 0:
        rows = int(len(categorical_columns)/2)
    else:
        rows = int(len(categorical_columns)/2) + 1
     
    # Plot categories over target
    fig, ax = plt.subplots(rows, 2, figsize=(18, rows*4), tight_layout=True)
    fig.suptitle("Categorical features vs target", fontweight="bold", y=1.02)

    for i in range(rows):
        for j in range(2):
            col = categorical_columns[i * 2 + j]
            df_plot = pd.crosstab(df[col], df["TARGET"]).apply(
                lambda x: round(x / sum(x) * 100, 2), axis=0
            )
            df_plot.plot(kind="bar", ax=ax[i, j])
            ax[i, j].bar_label(
                ax[i, j].containers[0],
                fmt="%.0f%%",
                padding=5,
                size=7,
                color="black",
                fontweight="bold",
            )
            ax[i, j].bar_label(
                ax[i, j].containers[1],
                fmt="%.0f%%",
                padding=5,
                size=7,
                color="black",
                fontweight="bold",
            )
            ax[i, j].set_yticks([])
            ax[i, j].set_xticklabels(ax[i, j].get_xticklabels(), rotation=45, size=8)
            ax[i, j].set_xlabel("")
            ax[i, j].set_title(f"{col} vs {target_name}", fontsize=10, fontweight='bold')
            ax[i, j].legend()
            ax[i, j].get_legend().remove()
            sns.despine(left=True)

    fig.legend(labels=[0, 1], title=target_name, bbox_to_anchor=(1.05, 0.90));

In [11]:
def plot_num_box(df: DataFrame, target_name: str, col_drop: List) -> None:
    """Plots boxplots function of numerical columns in regards to target column.
    Parameteres: df (DataFrame), target_name (name of target column as str),
    col_drop (columns to drop before).
    Returns: None
    """
    
    # Get the names of features
    num_columns = df.drop(col_drop, axis=1).select_dtypes(exclude='O').columns.to_list()
    
    # Calculate the number of rows
    if len(num_columns)%2 == 0:
        rows = int(len(num_columns)/2)
    else:
        rows = int(len(num_columns)/2) + 1
     
    # Plot numerical features over target
    fig, ax = plt.subplots(rows, 2, figsize=(18, rows*4), tight_layout=True)
    fig.suptitle("Numerical features vs target", fontweight="bold", y=1.02)

    for i in range(rows):
        for j in range(2):
            col = num_columns[i * 2 + j]
            sns.boxplot(data=df[[target_name, col]].dropna(), x=target_name, y=col, ax=ax[i, j])
            
            ax[i, j].set_title(f"{col} vs {target_name}", fontsize=10, fontweight='bold')

## Getting the data

In [12]:
df = pd.read_csv('application_train.csv')
df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.02,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.08,0.26,0.14,0.02,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.63,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,reg oper account,block of flats,0.01,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.0,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.31,0.62,,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.05,0.0,0.01,0.09,0.05,0.99,0.8,0.05,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.0,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.01,reg oper account,block of flats,0.07,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


Let's check the information about the data.

In [14]:
df.info(), df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


(None, (307511, 122))

The application data has 307,511 entries and 122 columns. 

Let's reduce the space the data takes.

In [15]:
# Calling function to reduce the memory usage
df = reduce_memory_usage(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float32(65), int16(2), int32(2), int8(37), object(16)
memory usage: 128.2+ MB
None


By minimizing the data type we were able to reduce the size of dataframe by more than half.

Let's split the data into training and testing sets and prepare the data for explanatory data analysis.

In [30]:
from sklearn.model_selection import train_test_split
# Get features and target
X = df.drop('TARGET', axis=1)
y = df['TARGET']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

# Get the data for eda
df_eda = pd.merge(X_train, y_train, left_index=True, right_index=True)
df_eda = df_eda.drop('SK_ID_CURR', axis=1)

In [31]:
df_eda.shape

(246008, 121)

### Checking correlations between target and features

To check the correaltions we first need to get all categorical and numerical features. Then we will use anova test for correlations between numerical features and target and chi_square for categorical features. We will also check which of the numerical features are highly correlated to remove them from the dataset. 

#### Getting columns of different datatypes

In [38]:
# Categorical columns that are of object type
object_col = get_object_columns(df_eda, 'TARGET')

# Categorical features that are encoded
encoded_col = get_cat_encoded_columns(df_eda, 'TARGET')

# All categorical features
cat_col = object_col + encoded_col

# Numerical continuous variables
num_col = get_num_col(df_eda, encoded_col, 'TARGET')

In [47]:
assert len(cat_col) + len(num_col) == df.shape[1] - 2, "The condition is not true"

### Correlation between numerical features and target using anova test

In [48]:
# Getting columns that are correlated with target 
numerical_corr = corr_check_num_cat(df_eda, 'TARGET', num_col)

----------------------------------------------------------------------------------------------------
The columns that are correlated with target column: ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'REGION_POPULATION_RELATIVE', 'AMT_GOODS_PRICE', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'AMT_ANNUITY', 'AMT_CREDIT', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'OWN_CAR_AGE', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'EXT_SOURCE_1', 'HOUR_APPR_PROCESS_START', 'CNT_CHILDREN', 'ELEVATORS_MODE', 'CNT_FAM_MEMBERS', 'FLOORSMIN_MODE', 'AMT_REQ_CREDIT_BUREAU_QRT', 'ELEVATORS_MEDI', 'FLOORSMIN_MEDI', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'ENTRANCES_MEDI', 'FLOORSMAX_AVG', 'ENTRANCES_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE', 'YEARS_BUILD_MODE', 'ELEVATORS_AVG', 'YEARS_BUILD_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'AMT_REQ_CREDIT_BUREAU_MON', 'YEARS_BUILD_AVG', 'FLOORSMIN_AVG', 'OBS_60_CNT_SOCIAL_CIRCLE', 'YEARS_BEGINEXPLUATATION_AVG', 'OBS_30_

In [53]:
print(f'Number of numerical columns that are not correlated with target: {len(num_col) - len(numerical_corr)}')

Number of numerical columns that are not correlated with target: 32


### Correlations between numerical columns that are correlated to target

In [56]:
# Displaying Pearson correlations between numerical features correlated to target variable
df_pearson = df_eda[numerical_corr]
pearson_corr_df(df_pearson)

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,REGION_POPULATION_RELATIVE,AMT_GOODS_PRICE,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,AMT_ANNUITY,AMT_CREDIT,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE
EXT_SOURCE_3,1.0,0.11,-0.01,0.05,-0.01,-0.01,0.03,0.04,-0.04,-0.04
EXT_SOURCE_2,0.11,1.0,0.2,0.14,-0.29,-0.29,0.13,0.13,-0.03,-0.03
REGION_POPULATION_RELATIVE,-0.01,0.2,1.0,0.1,-0.53,-0.53,0.12,0.1,0.01,0.0
AMT_GOODS_PRICE,0.05,0.14,0.1,1.0,-0.11,-0.1,0.77,0.99,-0.02,-0.02
REGION_RATING_CLIENT_W_CITY,-0.01,-0.29,-0.53,-0.11,1.0,0.95,-0.14,-0.11,0.01,0.02
REGION_RATING_CLIENT,-0.01,-0.29,-0.53,-0.1,0.95,1.0,-0.13,-0.1,0.02,0.02
AMT_ANNUITY,0.03,0.13,0.12,0.77,-0.14,-0.13,1.0,0.77,-0.02,-0.02
AMT_CREDIT,0.04,0.13,0.1,0.99,-0.11,-0.1,0.77,1.0,-0.02,-0.02
DEF_30_CNT_SOCIAL_CIRCLE,-0.04,-0.03,0.01,-0.02,0.01,0.02,-0.02,-0.02,1.0,0.86
DEF_60_CNT_SOCIAL_CIRCLE,-0.04,-0.03,0.0,-0.02,0.02,0.02,-0.02,-0.02,0.86,1.0


Unnamed: 0,OWN_CAR_AGE,FLOORSMAX_MODE,FLOORSMAX_MEDI,EXT_SOURCE_1,HOUR_APPR_PROCESS_START,CNT_CHILDREN,ELEVATORS_MODE,CNT_FAM_MEMBERS,FLOORSMIN_MODE,AMT_REQ_CREDIT_BUREAU_QRT
EXT_SOURCE_3,-0.02,0.0,0.0,0.19,-0.04,-0.04,0.01,-0.03,0.0,-0.02
EXT_SOURCE_2,-0.08,0.13,0.13,0.21,0.16,-0.02,0.11,-0.0,0.1,-0.0
REGION_POPULATION_RELATIVE,-0.08,0.3,0.32,0.1,0.17,-0.02,0.25,-0.02,0.27,-0.0
AMT_GOODS_PRICE,-0.1,0.11,0.11,0.18,0.06,-0.0,0.08,0.06,0.08,0.01
REGION_RATING_CLIENT_W_CITY,0.09,-0.23,-0.25,-0.12,-0.27,0.02,-0.21,0.03,-0.22,0.0
REGION_RATING_CLIENT,0.09,-0.22,-0.23,-0.12,-0.29,0.03,-0.2,0.03,-0.21,0.0
AMT_ANNUITY,-0.1,0.12,0.13,0.12,0.05,0.02,0.09,0.08,0.09,0.01
AMT_CREDIT,-0.1,0.1,0.1,0.17,0.05,0.0,0.07,0.06,0.07,0.01
DEF_30_CNT_SOCIAL_CIRCLE,0.01,-0.03,-0.03,-0.03,-0.0,-0.0,-0.02,-0.0,-0.02,-0.0
DEF_60_CNT_SOCIAL_CIRCLE,0.01,-0.03,-0.03,-0.03,-0.01,-0.0,-0.02,-0.01,-0.02,-0.0


Unnamed: 0,ELEVATORS_MEDI,FLOORSMIN_MEDI,AMT_REQ_CREDIT_BUREAU_YEAR,ENTRANCES_MEDI,FLOORSMAX_AVG,ENTRANCES_MODE,YEARS_BEGINEXPLUATATION_MODE,DAYS_BIRTH,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE
EXT_SOURCE_3,0.01,0.0,-0.07,0.01,0.0,0.01,-0.0,-0.2,-0.13,-0.07
EXT_SOURCE_2,0.11,0.11,-0.02,0.03,0.13,0.02,0.01,-0.09,-0.05,-0.2
REGION_POPULATION_RELATIVE,0.27,0.29,0.0,0.03,0.32,0.01,-0.01,-0.03,-0.01,-0.04
AMT_GOODS_PRICE,0.08,0.08,-0.05,0.02,0.11,0.01,0.0,-0.05,-0.01,-0.08
REGION_RATING_CLIENT_W_CITY,-0.23,-0.23,0.01,-0.03,-0.25,-0.01,-0.0,0.01,-0.01,0.03
REGION_RATING_CLIENT,-0.22,-0.23,0.01,-0.02,-0.23,-0.01,0.0,0.01,-0.0,0.03
AMT_ANNUITY,0.1,0.1,-0.01,0.01,0.13,0.01,0.01,0.01,0.01,-0.06
AMT_CREDIT,0.08,0.08,-0.05,0.01,0.1,0.01,0.0,-0.06,-0.01,-0.07
DEF_30_CNT_SOCIAL_CIRCLE,-0.02,-0.02,0.02,-0.0,-0.03,-0.0,-0.0,-0.0,0.0,0.0
DEF_60_CNT_SOCIAL_CIRCLE,-0.02,-0.02,0.02,-0.01,-0.03,-0.01,-0.0,-0.0,0.0,0.0


Unnamed: 0,YEARS_BUILD_MODE,ELEVATORS_AVG,YEARS_BUILD_MEDI,YEARS_BEGINEXPLUATATION_MEDI,AMT_REQ_CREDIT_BUREAU_MON,YEARS_BUILD_AVG,FLOORSMIN_AVG,OBS_60_CNT_SOCIAL_CIRCLE,YEARS_BEGINEXPLUATATION_AVG,OBS_30_CNT_SOCIAL_CIRCLE
EXT_SOURCE_3,0.01,0.01,0.02,-0.0,-0.01,0.02,0.0,-0.0,-0.0,-0.0
EXT_SOURCE_2,0.01,0.11,0.01,0.01,0.05,0.01,0.11,-0.02,0.01,-0.02
REGION_POPULATION_RELATIVE,-0.07,0.28,-0.06,-0.01,0.08,-0.06,0.29,-0.01,-0.01,-0.01
AMT_GOODS_PRICE,0.04,0.08,0.04,0.01,0.06,0.04,0.08,0.0,0.01,0.0
REGION_RATING_CLIENT_W_CITY,0.04,-0.23,0.04,0.0,-0.07,0.04,-0.23,0.03,-0.0,0.03
REGION_RATING_CLIENT,0.05,-0.22,0.04,0.01,-0.07,0.04,-0.23,0.03,0.01,0.03
AMT_ANNUITY,0.03,0.1,0.03,0.01,0.04,0.03,0.1,-0.01,0.01,-0.01
AMT_CREDIT,0.03,0.08,0.03,0.01,0.06,0.03,0.08,0.0,0.01,0.0
DEF_30_CNT_SOCIAL_CIRCLE,-0.01,-0.02,-0.01,-0.0,0.0,-0.01,-0.02,0.34,-0.0,0.33
DEF_60_CNT_SOCIAL_CIRCLE,-0.01,-0.02,-0.01,-0.0,-0.0,-0.01,-0.02,0.26,-0.0,0.26


Looking at the correlations between numerical features we see correlations:
1) AMT_GOODS_PRICE
- AMT_ANNUITY
- AMT_CREDIT
2) REGION_RATING_CLIENT_W_CITY
- REGION_RATING_CLIENT
3) AMT_ANNUITY
- AMT_CREDIT
4) DEF_30_CNT_SOCIAL_CIRCLE
- DEF_60_CNT_SOCIAL_CIRCLE
5) FLOORSMAX_MODE 
- FLOORSMAX_MEDI
- FLOORSMIN_MODE
- FLOORSMIN_MEDI
- FLOORSMAX_AVG
- FLOORSMIN_AVG
6) FLOORSMAX_MEDI
- FLOORSMIN_MODE
- FLOORSMIN_MEDI
- FLOORSMAX_AVG
- FLOORSMIN_AVG
7) CNT_CHILDREN
- CNT_FAMILY_MEMBERS
8) ELEVATORS_MODE
- ELEVATORS_MEDI
- ELEVATORS_AVG
9) FLOORSMIN_MODE
- FLOORSMIN_MEDI
- FLOORSMIN_AVG
- FLOORSMAX_AVG
10) ELEVATORS_MEDI
- ELEVATORS_AVG
11) FLOORSMIN_MEDI
- FLOORSMAX_AVG
- FLOORSMIN_AVG
12) ENTRANCES_MEDI
- ENTRANCES_MODE
13) FLOORSMAX_AVG
- FLOORSMIN_AVG
14) YEARS_BEGINEXPLUATATION_MODE
- YEARS_BEGINXPLUATATION_MEDI
- YEARS_BEGINXPLUATATION_AVG
15) YEARS_BUILD_MODE
- YEARS_BUILD_MEDI
- YEARS_BUILD_AVG
16) YEARS_BUILD_MEDI
- YEARS_BUILD_AVG
17) YEARS_BEGINXPLUATATION_MEDI
- YEARS_BEGINXPLUATATION_AVG
18) OBS_60_CNT_SOCIAL_CIRCLE
- OBS_30_CNT_SOCIAL_CIRCLE


As we need to remove multicolinear features we will remove columns: 
['AMT_GOODS_PRICE', 'REGION_RATING_CLIENT_W_CITY',  'FLOORSMIN_AVG', 'FLOORSMIN_MODE', 'FLOORSMAX_AVG', 'FLOORSMAX_MODE', 'ELEVATORS_MODE', 'ELEVATORS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS', 'FLOORSMIN_MEDI', 'ENTRANCES_MODE']

In [61]:
multicollinear_features = ['AMT_GOODS_PRICE', 'REGION_RATING_CLIENT_W_CITY', 'FLOORSMIN_AVG', 'FLOORSMIN_MODE',
                           'FLOORSMAX_AVG', 'FLOORSMAX_MODE', 'ELEVATORS_MODE', 'ELEVATORS_AVG',
                           'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'YEARS_BUILD_MODE',
                           'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS', 'FLOORSMIN_MEDI', 'ENTRANCES_MODE']
numerical = [i for i in numerical_corr if i not in multicollinear_features]
df_pearson = df_eda[numerical]
pearson_corr_df(df_pearson)

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,AMT_ANNUITY,AMT_CREDIT,DEF_60_CNT_SOCIAL_CIRCLE,OWN_CAR_AGE,FLOORSMAX_MEDI,EXT_SOURCE_1
EXT_SOURCE_3,1.0,0.11,-0.01,-0.01,0.03,0.04,-0.04,-0.02,0.0,0.19
EXT_SOURCE_2,0.11,1.0,0.2,-0.29,0.13,0.13,-0.03,-0.08,0.13,0.21
REGION_POPULATION_RELATIVE,-0.01,0.2,1.0,-0.53,0.12,0.1,0.0,-0.08,0.32,0.1
REGION_RATING_CLIENT,-0.01,-0.29,-0.53,1.0,-0.13,-0.1,0.02,0.09,-0.23,-0.12
AMT_ANNUITY,0.03,0.13,0.12,-0.13,1.0,0.77,-0.02,-0.1,0.13,0.12
AMT_CREDIT,0.04,0.13,0.1,-0.1,0.77,1.0,-0.02,-0.1,0.1,0.17
DEF_60_CNT_SOCIAL_CIRCLE,-0.04,-0.03,0.0,0.02,-0.02,-0.02,1.0,0.01,-0.03,-0.03
OWN_CAR_AGE,-0.02,-0.08,-0.08,0.09,-0.1,-0.1,0.01,1.0,-0.08,-0.08
FLOORSMAX_MEDI,0.0,0.13,0.32,-0.23,0.13,0.1,-0.03,-0.08,1.0,0.09
EXT_SOURCE_1,0.19,0.21,0.1,-0.12,0.12,0.17,-0.03,-0.08,0.09,1.0


Unnamed: 0,HOUR_APPR_PROCESS_START,CNT_CHILDREN,AMT_REQ_CREDIT_BUREAU_QRT,ELEVATORS_MEDI,AMT_REQ_CREDIT_BUREAU_YEAR,ENTRANCES_MEDI,DAYS_BIRTH,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,YEARS_BUILD_MEDI
EXT_SOURCE_3,-0.04,-0.04,-0.02,0.01,-0.07,0.01,-0.2,-0.13,-0.07,0.02
EXT_SOURCE_2,0.16,-0.02,-0.0,0.11,-0.02,0.03,-0.09,-0.05,-0.2,0.01
REGION_POPULATION_RELATIVE,0.17,-0.02,-0.0,0.27,0.0,0.03,-0.03,-0.01,-0.04,-0.06
REGION_RATING_CLIENT,-0.29,0.03,0.0,-0.22,0.01,-0.02,0.01,-0.0,0.03,0.04
AMT_ANNUITY,0.05,0.02,0.01,0.1,-0.01,0.01,0.01,0.01,-0.06,0.03
AMT_CREDIT,0.05,0.0,0.01,0.08,-0.05,0.01,-0.06,-0.01,-0.07,0.03
DEF_60_CNT_SOCIAL_CIRCLE,-0.01,-0.0,-0.0,-0.02,0.02,-0.01,-0.0,0.0,0.0,-0.01
OWN_CAR_AGE,-0.07,0.01,-0.02,-0.06,-0.01,-0.02,0.01,0.01,0.0,-0.05
FLOORSMAX_MEDI,0.12,-0.01,0.0,0.68,-0.02,0.08,0.0,-0.01,-0.01,0.52
EXT_SOURCE_1,0.03,-0.14,-0.0,0.07,0.01,0.02,-0.6,-0.13,-0.13,0.02
