# Exploratory Data Analysis - Home Credit Default Risk (Selected Features)

## Goal
This notebook performs exploratory data analysis on the Home Credit training data, restricted to the pre-selected features considered most important for predicting loan default.

## Steps:
1. Load processed data with selected features
2. Analyze target variable
3. Statistical analysis of features
4. Correlation analysis
5. Visualize distributions
6. Find important features
7. Draw conclusions

---


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from sklearn.feature_selection import mutual_info_classif
import json

# Notebook-wide configuration
warnings.filterwarnings('ignore')        # keep library warnings out of the cell output
pd.options.display.max_columns = None    # never truncate columns when a frame is shown

# Plot appearance. Order matters: the style sheet is applied first, then the
# palette and the rcParams overrides are layered on top of it.
plt.style.use('default')
sns.set_palette("Set2")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

for status_line in ("Libraries imported successfully!", "Plot settings applied!"):
    print(status_line)


## 1. Load Processed Data with Selected Features


In [None]:
# Read the processed training table together with its feature metadata, then
# print a short overview (size, data quality, target balance).
print("Loading processed data with selected features...")

# Training table
df = pd.read_csv('../data_processed/train_processed_selected.csv')
print(f"Processed data loaded: {df.shape}")

# Feature metadata (JSON file stored next to the CSV)
with open('../data_processed/feature_info_selected.json', 'r') as info_file:
    feature_info = json.load(info_file)

total_rows = len(df)

print(f"Data information:")
print(f"  - Total records: {total_rows:,}")
print(f"  - Number of features: {df.shape[1]}")
print(f"  - Numerical features: {feature_info['numeric_features']}")
print(f"  - Categorical features: {feature_info['categorical_features']}")

# Data-quality counters: total missing cells and fully duplicated rows
print(f"\nBasic information:")
print(f"  - Missing values: {df.isnull().sum().sum()}")
print(f"  - Duplicates: {df.duplicated().sum()}")

# Class balance of TARGET; `target_stats` is reused by the conclusions cell
target_stats = df['TARGET'].value_counts()
print(f"\nTarget variable (TARGET):")
no_default_count = target_stats[0]
print(f"  - No default (0): {no_default_count:,} ({no_default_count/total_rows*100:.2f}%)")
default_count = target_stats[1]
print(f"  - Default (1): {default_count:,} ({default_count/total_rows*100:.2f}%)")
print(f"  - Imbalance ratio: {no_default_count/default_count:.1f}:1")


## 2. Target Variable Analysis


In [None]:
# Visualize target variable distribution (pie chart + bar chart side by side)

# value_counts() orders classes by frequency, not by label. Pinning the order to
# [0, 1] guarantees that the hard-coded labels and colours below always belong
# to the right class, even if class 1 were the more frequent one; a class that
# is absent from the data is shown with a count of 0.
class_labels = {0: 'No Default', 1: 'Default'}
class_names = list(class_labels.values())
target_counts = df['TARGET'].value_counts().reindex(list(class_labels), fill_value=0)

fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(10, 6))

# Share of each class
ax_pie.pie(target_counts.values, labels=class_names, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Target Variable Distribution')

# Absolute number of records per class
ax_bar.bar(class_names, target_counts.values, color=['lightblue', 'lightcoral'])
ax_bar.set_title('Target Variable Counts')
ax_bar.set_ylabel('Count')

fig.tight_layout()
plt.show()

print(f"Default rate: {df['TARGET'].mean()*100:.2f}%")
if target_counts[1] > 0:
    print(f"Class imbalance: {target_counts[0]/target_counts[1]:.1f}:1")
else:
    # Without any default case the ratio would be a division by zero.
    print("Class imbalance: undefined (no default cases in the data)")


## 3. Statistical Analysis of Selected Features


In [None]:
# Statistical analysis of numerical features
print("Statistical analysis of selected features:")

# Numerical features, excluding the ID and target columns.
# select_dtypes(include='number') accepts every numeric dtype (int32, float32,
# uint8, nullable integers, ...) instead of only the exact names 'int64' and
# 'float64', so down-cast or encoded columns are not silently dropped.
non_feature_columns = ['SK_ID_CURR', 'TARGET']
numeric_features = [
    col for col in df.select_dtypes(include='number').columns
    if col not in non_feature_columns
]

print(f"Number of numerical features: {len(numeric_features)}")

# Basic statistics
print("\nBasic statistics for numerical features:")
print(df[numeric_features].describe())

# Check for highly correlated features
print("\nChecking for highly correlated features...")
correlation_matrix = df[numeric_features].corr()

# Find highly correlated pairs.
# The comparison is done once on the underlying array instead of one .iloc
# lookup per cell. The strict upper triangle (k=1) visits every unordered pair
# exactly once and skips the diagonal; NaN correlations compare as False and
# are therefore left out.
high_corr_threshold = 0.8
corr_values = correlation_matrix.to_numpy()
corr_columns = correlation_matrix.columns
exceeds_threshold = np.triu(np.abs(corr_values) > high_corr_threshold, k=1)
high_corr_pairs = [
    (corr_columns[row], corr_columns[col], corr_values[row, col])
    for row, col in np.argwhere(exceeds_threshold)
]

print(f"Found {len(high_corr_pairs)} highly correlated pairs (>{high_corr_threshold}):")
# Show the 10 strongest pairs rather than whichever 10 come first in column order
strongest_pairs = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
for first_feature, second_feature, corr_value in strongest_pairs[:10]:
    print(f"  {first_feature} - {second_feature}: {corr_value:.3f}")


## 4. Correlation Analysis with Target


In [None]:
# Correlation analysis with target variable
print("Correlation analysis with target variable:")

# Absolute correlation of every numeric feature with TARGET, strongest first.
# The sign is dropped on purpose: features are ranked by strength of
# association, so a strongly negative feature ranks as high as a positive one.
target_correlations = df[numeric_features].corrwith(df['TARGET']).abs().sort_values(ascending=False)

print(f"\nTop 10 features most correlated with TARGET:")
print(target_correlations.head(10))

# Visualize correlation heatmap
plt.figure(figsize=(15, 12))

# Select top 15 features for heatmap (fewer if the data has fewer features)
top_features = target_correlations.head(15).index.tolist()
correlation_subset = df[top_features + ['TARGET']].corr()

# Create heatmap (signed correlations, colour scale centred on 0)
sns.heatmap(correlation_subset, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
plt.title(f'Correlation Heatmap - Top {len(top_features)} Features with Target')
plt.tight_layout()
plt.show()

# Bucket the features by correlation strength.
# target_correlations holds absolute values, so the buckets describe strength
# only; the labels say |r| and mirror the exact comparisons used.
strong_cutoff = 0.1
weak_cutoff = 0.05
strong_count = (target_correlations > strong_cutoff).sum()
weak_count = ((target_correlations > weak_cutoff) & (target_correlations <= strong_cutoff)).sum()
very_weak_count = (target_correlations <= weak_cutoff).sum()

print(f"\nCorrelation analysis summary:")
print(f"Strong correlation (|r| > {strong_cutoff}): {strong_count}")
print(f"Weak correlation ({weak_cutoff} < |r| <= {strong_cutoff}): {weak_count}")
print(f"Very weak correlation (|r| <= {weak_cutoff}): {very_weak_count}")


## 5. Feature Distribution Analysis


In [None]:
# Analyze distributions of top features
print("Feature distribution analysis:")

# Select top 6 features for detailed analysis (fewer if fewer are available)
top_6_features = target_correlations.head(6).index.tolist()

default_rows = df['TARGET'] == 1
no_default_rows = df['TARGET'] == 0

# Overlaid per-class histograms
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, feature in zip(axes, top_6_features):
    # Missing values are dropped because the histogram range cannot be
    # computed from NaN.
    default_data = df.loc[default_rows, feature].dropna()
    no_default_data = df.loc[no_default_rows, feature].dropna()

    # Both classes share one set of bin edges computed from the whole column;
    # with per-class bins the overlaid bars would cover different intervals
    # and could not be compared.
    bin_edges = np.histogram_bin_edges(df[feature].dropna(), bins=50)

    ax.hist(no_default_data, bins=bin_edges, alpha=0.7, label='No Default', color='lightblue', density=True)
    ax.hist(default_data, bins=bin_edges, alpha=0.7, label='Default', color='lightcoral', density=True)
    # target_correlations stores absolute values, hence the |Corr| label
    ax.set_title(f'{feature}\n(|Corr|: {target_correlations[feature]:.3f})')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels that received no feature
for unused_ax in axes[len(top_6_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Box plots for top features
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, feature in zip(axes, top_6_features):
    sns.boxplot(data=df, x='TARGET', y=feature, ax=ax)
    ax.set_title(f'{feature}\n(|Corr|: {target_correlations[feature]:.3f})')
    ax.set_xlabel('Target (0=No Default, 1=Default)')
    ax.set_ylabel('Value')

# Hide panels that received no feature
for unused_ax in axes[len(top_6_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


## 6. Important Features Analysis


In [None]:
# Feature importance analysis using mutual information
print("Feature importance analysis:")

# Prepare data for mutual information
X = df[numeric_features]
y = df['TARGET']

# Calculate mutual information (fixed random_state keeps the scores reproducible)
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_df = pd.DataFrame({
    'feature': numeric_features,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print(f"\nTop 10 features by Mutual Information:")
print(mi_df.head(10))

# Visualize feature importance: correlation ranking (left) vs. MI ranking (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

top_corr_features = target_correlations.head(10)
top_mi_features = mi_df.head(10)

ranking_panels = [
    (ax1, top_corr_features.index, top_corr_features.values,
     'Correlation with Target', 'Top 10 Features by Correlation with Target'),
    (ax2, top_mi_features['feature'], top_mi_features['mi_score'],
     'Mutual Information Score', 'Top 10 Features by Mutual Information'),
]
for ax, feature_names, scores, x_label, title in ranking_panels:
    positions = range(len(feature_names))
    ax.barh(positions, scores)
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names)
    # barh places position 0 at the bottom; flipping the axis puts the
    # highest-ranked feature at the top, as expected for a "top N" chart.
    ax.invert_yaxis()
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Compare correlation and mutual information for the top correlated features.
# A single feature -> score mapping replaces one boolean-mask scan of mi_df per
# feature; features without an MI score fall back to 0.
mi_by_feature = dict(zip(mi_df['feature'], mi_df['mi_score']))
print(f"\nComparison of top features:")
comparison_df = pd.DataFrame({
    'feature': top_corr_features.index,
    'correlation': top_corr_features.values,
    'mi_score': [mi_by_feature.get(name, 0) for name in top_corr_features.index]
})
print(comparison_df)


## 7. Conclusions and Insights


In [None]:
# Summary of findings.
# Relies on `df`, `target_stats` and `target_correlations` from earlier cells.
print("=== EDA CONCLUSIONS AND INSIGHTS ===\n")

print("1. DATASET OVERVIEW:")
print(f"   - Total records: {len(df):,}")
print(f"   - Number of features: {df.shape[1]}")
print(f"   - Default rate: {df['TARGET'].mean()*100:.2f}%")
print(f"   - Class imbalance: {target_stats[0]/target_stats[1]:.1f}:1")

missing_count = df.isnull().sum().sum()
duplicate_count = df.duplicated().sum()

print(f"\n2. DATA QUALITY:")
print(f"   - Missing values: {missing_count}")
print(f"   - Duplicates: {duplicate_count}")
# Only claim the data is clean when the two counters above actually are zero.
if missing_count == 0 and duplicate_count == 0:
    print(f"   - All data is clean and ready for modeling")
else:
    print(f"   - Data still contains missing values or duplicates; clean it before modeling")

print(f"\n3. KEY FINDINGS:")
# head(5) returns at most 5 entries, so a data set with fewer features no
# longer raises an IndexError.
top_ranked = target_correlations.head(5)
if top_ranked.empty:
    print(f"   - No feature correlations available")
else:
    print(f"   - Strongest correlation with default: {top_ranked.index[0]} ({top_ranked.iloc[0]:.3f})")
    print(f"   - Top {len(top_ranked)} most important features:")
    for rank, (feature, corr) in enumerate(top_ranked.items(), start=1):
        print(f"     {rank}. {feature} (corr: {corr:.3f})")

# Sections 4-6 are fixed text written by the author; they are not computed
# from the data in this notebook.
print(f"\n4. FEATURE CATEGORIES:")
print(f"   - External data sources (EXT_SOURCE_*) are highly predictive")
print(f"   - Financial ratios and amounts show strong patterns")
print(f"   - Customer demographics play important role")
print(f"   - Credit bureau information is crucial")

print(f"\n5. MODELING RECOMMENDATIONS:")
print(f"   - Use stratified sampling due to class imbalance")
print(f"   - Focus on top 15-20 features for initial models")
print(f"   - Consider feature engineering for external sources")
print(f"   - Apply appropriate evaluation metrics (AUC, Precision-Recall)")

print(f"\n6. NEXT STEPS FOR SIS2:")
print(f"   - Build multiple classification models")
print(f"   - Compare performance across algorithms")
print(f"   - Implement feature selection techniques")
print(f"   - Create ensemble methods for better predictions")

print(f"\n=== EDA ANALYSIS COMPLETED ===")
