# Analyze Feature Correlation
- analyze high correlation between all numeric features
- analyze correlation with target variable specifically

In [4]:
# Import necessary libraries
from utils import pd, np

In [5]:
# Load the main application table into a DataFrame.
# The relative path points at a `data/` folder one level above the working directory.
train_applications = pd.read_csv('../data/application_train.csv')

In [26]:
# Function to analyze correlation in the dataset
def analyze_correlation(data, threshold=0.5, column=None):
    """
    Find strongly correlated numeric features in the dataset.

    Correlations are computed with ``DataFrame.corr()`` (Pearson by default) on
    the numeric columns only and are compared and reported as absolute values,
    so the sign of each relationship is not preserved in the output.

    Parameters:
    data (DataFrame): The input dataset; non-numeric columns are ignored.
    threshold (float): Only correlations strictly greater than this value are reported.
    column (str, optional): If provided, only analyze correlation of the other
        numeric columns with this column.

    Returns:
    DataFrame: Rows sorted by 'Correlation' in descending order with a fresh
        integer index. With ``column`` set the columns are 'Feature' and
        'Correlation'; otherwise every unordered pair appears once with the
        columns 'Feature 1', 'Feature 2' and 'Correlation'.

    Raises:
    ValueError: If ``column`` is given but is not one of the numeric columns.
    """
    numeric_data = data.select_dtypes(include=[np.number])

    # Validate before the (potentially expensive) correlation computation.
    if column is not None and column not in numeric_data.columns:
        raise ValueError(f"Column '{column}' not found in numeric columns.")

    corr_matrix = numeric_data.corr().abs()

    if column is not None:
        # Drop the column's correlation with itself (always 1.0).
        corr_series = corr_matrix[column].drop(column)
        high_corr = corr_series[corr_series > threshold]
        result = pd.DataFrame({
            'Feature': high_corr.index,
            'Correlation': high_corr.values,
        })
    else:
        names = corr_matrix.columns
        scores = corr_matrix.to_numpy()
        # Strict upper triangle (k=1): each unordered pair once, no diagonal,
        # enumerated in the same row-major order as a nested i < j loop.
        first_idx, second_idx = np.triu_indices(len(names), k=1)
        pair_scores = scores[first_idx, second_idx]
        # NaN correlations (e.g. constant columns) compare False and are skipped.
        keep = pair_scores > threshold
        result = pd.DataFrame({
            'Feature 1': names[first_idx[keep]],
            'Feature 2': names[second_idx[keep]],
            'Correlation': pair_scores[keep],
        })

    return result.sort_values(by='Correlation', ascending=False).reset_index(drop=True)


In [None]:
# Find numeric feature pairs in the training applications whose absolute correlation exceeds 0.85
correlation_results = analyze_correlation(train_applications, threshold=0.85)
display(correlation_results)

Unnamed: 0,Feature 1,Feature 2,Correlation
0,DAYS_EMPLOYED,FLAG_EMP_PHONE,0.999755
1,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,0.998495
2,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,0.998490
3,FLOORSMIN_AVG,FLOORSMIN_MEDI,0.997241
4,FLOORSMAX_AVG,FLOORSMAX_MEDI,0.997034
...,...,...,...
84,LIVINGAREA_MODE,LIVINGAPARTMENTS_MEDI,0.857379
85,ELEVATORS_MODE,LIVINGAREA_MODE,0.855978
86,ELEVATORS_MODE,LIVINGAREA_MEDI,0.855767
87,LIVINGAREA_AVG,ELEVATORS_MODE,0.852591


In [25]:
# Rank numeric features by their correlation with the target column.
# analyze_correlation reports absolute values, so the sign of each relationship is not shown.
target_column = 'TARGET'
correlation_with_target = analyze_correlation(train_applications, threshold=0.1, column=target_column)
display(correlation_with_target)

Unnamed: 0,Feature,Correlation
0,EXT_SOURCE_3,0.178919
1,EXT_SOURCE_2,0.160472
2,EXT_SOURCE_1,0.155317


# Key Takeaway
- 88 feature pairs have an absolute correlation above 0.85
- No feature is highly correlated with the target variable (the strongest, EXT_SOURCE_3, reaches only about 0.18 in absolute value). Additional features should probably be collected from the other data files in the dataset.