# Exploratory Data Analysis - Home Credit Default Risk

## Goal
This notebook explores the processed Home Credit training data to find patterns that help predict loan defaults.

## Steps:
1. Load processed data
2. Analyze target variable
3. Statistical analysis of features
4. Correlation analysis
5. Visualize distributions
6. Find important features
7. Draw conclusions

---


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from sklearn.feature_selection import mutual_info_classif
import json

# Notebook-wide configuration: silence warnings, reset the matplotlib style,
# pick the seaborn palette and let pandas display every column.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("Set2")
pd.set_option('display.max_columns', None)

# Default figure size and font size, applied after the style reset above so
# that the reset does not overwrite them.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

for status_message in ("Libraries imported successfully!", "Plot settings applied!"):
    print(status_message)


## 1. Load Processed Data


In [None]:
# Load processed data
print("Loading processed data...")

# Load train data
df = pd.read_csv('../data_processed/train_processed.csv')
print(f"Processed data loaded: {df.shape}")

# Load feature information.
# The encoding is passed explicitly so that reading the JSON file does not
# depend on the platform's locale default encoding.
with open('../data_processed/feature_info.json', 'r', encoding='utf-8') as f:
    feature_info = json.load(f)

# NOTE(review): the two feature_info entries are printed verbatim; whether they
# hold counts or lists of column names is not visible here -- confirm.
print("Data information:")
print(f"  - Total records: {len(df):,}")
print(f"  - Number of features: {df.shape[1]}")
print(f"  - Numerical features: {feature_info['numeric_features']}")
print(f"  - Categorical features: {feature_info['categorical_features']}")

# Basic data info
print("\nBasic information:")
print(f"  - Missing values: {df.isnull().sum().sum()}")
print(f"  - Duplicates: {df.duplicated().sum()}")

# Analyze target variable.
# value_counts() only contains classes that actually occur, so reindex to the
# two expected labels; a class with no rows is then reported as 0 instead of
# raising KeyError on the lookups below.
target_stats = df['TARGET'].value_counts().reindex([0, 1], fill_value=0)
n_records = len(df)
n_no_default = target_stats[0]
n_default = target_stats[1]

print("\nTarget variable (TARGET):")
print(f"  - No default (0): {n_no_default:,} ({n_no_default/n_records*100:.2f}%)")
print(f"  - Default (1): {n_default:,} ({n_default/n_records*100:.2f}%)")
print(f"  - Imbalance ratio: {n_no_default/n_default:.1f}:1")


## 2. Target Variable Analysis


In [None]:
# Visualize target variable distribution
class_labels = ['No Default', 'Default']

# value_counts() orders its result by frequency, not by class value, while the
# labels and colours below are applied by position. Pin the order to [0, 1] so
# 'No Default' always refers to class 0 and 'Default' to class 1; a class with
# no rows is counted as 0 instead of raising KeyError further down.
target_counts = df['TARGET'].value_counts().reindex([0, 1], fill_value=0)

fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(10, 6))

# Pie chart: share of each class
ax_pie.pie(target_counts.values, labels=class_labels, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Target Variable Distribution')

# Bar chart: absolute number of records per class
ax_bar.bar(class_labels, target_counts.values, color=['lightblue', 'lightcoral'])
ax_bar.set_title('Target Variable Counts')
ax_bar.set_ylabel('Count')

fig.tight_layout()
plt.show()

print(f"Default rate: {df['TARGET'].mean()*100:.2f}%")
print(f"Class imbalance: {target_counts[0]/target_counts[1]:.1f}:1")
