In [None]:
# %% [markdown]
# # Home Credit Default Risk - Data Exploration
# 
# ## 📊 Overview
# This notebook explores the Home Credit dataset to understand:
# 1. Data structure and relationships
# 2. Missing values and data quality
# 3. Target variable distribution
# 4. Feature distributions and relationships
# 
# ## 🎯 Target Variable
# - **TARGET = 1**: Client with payment difficulties
# - **TARGET = 0**: Client without payment difficulties

# %% [markdown]
# ## 📦 Setup and Imports

# %%
# Standard-library modules
import sys
import warnings

# Third-party analysis and plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid style sheet plus the "husl" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Make the project's helper module importable; the relative entry '../src' is
# resolved against the working directory the kernel was started in.
sys.path.append('../src')
from utils import load_config, load_data, display_data_info, get_file_sizes

# %%
# Read the project configuration through the shared helper
config = load_config()
print("Configuration loaded successfully")

# Report the size of each data file, one line per entry returned by the helper
file_sizes = get_file_sizes()
print("\nFile sizes in MB:")
for file_name, size_mb in file_sizes.items():
    size_line = f"  {file_name}: {size_mb:.2f} MB"
    print(size_line)

# %% [markdown]
# ## 📁 Load Main Data Files

# %%
# Load application_train data.
# APP_TRAIN_NROWS is forwarded to load_data as `nrows`. None requests no row limit
# (presumably the whole file -- confirm against utils.load_data). Set it to an
# integer for a quicker, partial load. The progress message is derived from the
# same constant so it can no longer disagree with what is actually loaded.
APP_TRAIN_NROWS = None
if APP_TRAIN_NROWS is None:
    print("Loading application_train data (no row limit)...")
else:
    print(f"Loading application_train data (row limit: {APP_TRAIN_NROWS:,})...")
app_train = load_data(config['files']['application_train'], nrows=APP_TRAIN_NROWS)
print(f"\nTraining data shape: {app_train.shape}")

# %%
# Display basic information
# Hands the loaded training frame and a display label to the project helper.
# NOTE(review): what display_data_info prints is defined in the imported utils
# module (presumably ../src/utils.py), not in this notebook.
display_data_info(app_train, "Application Train")

# %% [markdown]
# ## 🎯 Target Variable Analysis

# %%
# Check target distribution
if 'TARGET' in app_train.columns:
    # value_counts() orders by frequency, not by label. Pin the order to [0, 1] so
    # the bar annotations and pie labels below always refer to the right class, and
    # so a class absent from a small sample is reported as 0 instead of raising.
    class_labels = [0, 1]
    target_dist = app_train['TARGET'].value_counts().reindex(class_labels, fill_value=0)
    target_pct = (
        app_train['TARGET'].value_counts(normalize=True).reindex(class_labels, fill_value=0.0) * 100
    )

    print("Target Distribution:")
    print(f"0 (No Payment Difficulties): {target_dist[0]:,} ({target_pct[0]:.2f}%)")
    print(f"1 (Payment Difficulties): {target_dist[1]:,} ({target_pct[1]:.2f}%)")

    # Left panel: absolute counts; right panel: share of each class
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Count plot -- `order` makes bar i correspond to class_labels[i]
    sns.countplot(x='TARGET', data=app_train, order=class_labels, ax=axes[0])
    axes[0].set_title('Target Variable Distribution (Count)')
    axes[0].set_xlabel('Target (0=No Default, 1=Default)')
    axes[0].set_ylabel('Count')

    # Annotate each bar at its top edge; anchoring with va='bottom' avoids a
    # hard-coded data-unit offset that would misplace labels on small samples.
    for position, count in enumerate(target_dist):
        axes[0].text(position, count, f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Percentage plot -- wedge order follows class_labels, matching the label list
    axes[1].pie(target_dist.values, labels=['No Default', 'Default'],
                autopct='%1.1f%%', startangle=90, colors=['lightgreen', 'lightcoral'])
    axes[1].set_title('Target Variable Distribution (Percentage)')

    plt.tight_layout()
    plt.show()

# %% [markdown]
# ## 🔍 Missing Values Analysis

# %%
# Null tally per column, and that tally as a share of the row count
missing = app_train.isnull().sum()
missing_percent = 100 * missing / len(app_train)

# Keep only columns that have at least one null, worst offenders first
has_nulls = missing > 0
missing_df = (
    pd.concat([missing, missing_percent], axis=1, keys=['missing_count', 'missing_percent'])
    .loc[has_nulls]
    .sort_values('missing_percent', ascending=False)
)

# Frame-wide totals
total_missing = missing.sum()
total_cells = app_train.shape[0] * app_train.shape[1]
print(f"Columns with missing values: {len(missing_df)}")
print(f"Total missing cells: {total_missing:,}")
print(f"Overall data missing: {100 * total_missing / total_cells:.2f}%")

# %%
# Horizontal bar chart of the 20 columns with the largest share of nulls
if not missing_df.empty:
    top_missing = missing_df.head(20)
    bar_positions = range(len(top_missing))

    missing_fig, missing_ax = plt.subplots(figsize=(12, 8))
    missing_ax.barh(bar_positions, top_missing['missing_percent'])
    missing_ax.set_yticks(bar_positions)
    missing_ax.set_yticklabels(top_missing.index)
    missing_ax.set_xlabel('Missing Percentage (%)')
    missing_ax.set_title('Top 20 Columns with Highest Missing Values')
    # Flip the axis so the column with the most nulls sits at the top
    missing_ax.invert_yaxis()
    missing_ax.grid(True, alpha=0.3)

    missing_fig.tight_layout()
    plt.show()

# %% [markdown]
# ## 📈 Numeric Features Analysis

# %%
# Get numeric columns.
# 'number' selects every numeric dtype. The previous ['int', 'float'] selector only
# matched the platform-default integer/float widths, so columns stored as e.g.
# int32 or float32 (after downcasting, or on some platforms) were silently skipped.
numeric_cols = app_train.select_dtypes(include='number').columns.tolist()

# Remove the label and the row identifier: they are not features
non_feature_cols = {'TARGET', 'SK_ID_CURR'}
numeric_cols = [col for col in numeric_cols if col not in non_feature_cols]

print(f"Number of numeric columns: {len(numeric_cols)}")
print(f"First 10 numeric columns: {numeric_cols[:10]}")

# %%
# Descriptive statistics, one row per numeric feature
print("Summary statistics for numeric columns:")
numeric_summary = app_train[numeric_cols].describe().T
print(numeric_summary)

# %%
# Histograms of the main monetary and day-count features
key_features = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED'
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for panel, feature in zip(axes, key_features):
    if feature not in app_train.columns:
        # Leave this panel blank so the remaining features keep their grid slot
        continue

    # Trim to the 1.5 * IQR fences so extreme values do not flatten the histogram
    values = app_train[feature].dropna()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    trimmed = values[values.between(q1 - fence, q3 + fence)]

    panel.hist(trimmed, bins=50, alpha=0.7, color='skyblue')
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 📊 Categorical Features Analysis

# %%
# Collect the names of all object-dtype columns
object_frame = app_train.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
print(f"Number of categorical columns: {len(categorical_cols)}")
print(f"Categorical columns: {categorical_cols}")

# %%
# Analyze key categorical features
key_categorical = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'
]

# Filter to columns that exist
key_categorical = [col for col in key_categorical if col in app_train.columns]

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.flatten()

# zip() stops at the shorter sequence, so at most one feature is drawn per panel
for ax, feature in zip(axes, key_categorical):
    value_counts = app_train[feature].value_counts()

    if len(value_counts) > 10:
        # Many categories: horizontal bars for the 10 most frequent ones
        top_values = value_counts.head(10)
        ax.barh(range(len(top_values)), top_values.values)
        ax.set_yticks(range(len(top_values)))
        ax.set_yticklabels(top_values.index)
        ax.invert_yaxis()          # most frequent category at the top
        ax.set_xlabel('Count')     # counts run along x for horizontal bars
    else:
        ax.bar(range(len(value_counts)), value_counts.values)
        ax.set_xticks(range(len(value_counts)))
        ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
        ax.set_ylabel('Count')     # counts run along y for vertical bars

    ax.set_title(f'{feature} Distribution')
    ax.grid(True, alpha=0.3)

# Hide grid slots that received no feature instead of showing empty axes
for ax in axes[len(key_categorical):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 🔗 Relationships with Target

# %%
# Analyze how key features relate to target


def _boxplot_by_target(ax, values, title, ylabel):
    """Draw box plots of `values` for TARGET == 0 and TARGET == 1 on `ax`.

    `values` is a Series aligned with app_train's index; nulls are dropped per group.
    """
    groups = [values[app_train['TARGET'] == label].dropna() for label in (0, 1)]
    ax.boxplot(groups)
    # Tick labels are set separately because boxplot's `labels=` keyword is
    # deprecated in newer matplotlib releases (renamed to `tick_labels`).
    ax.set_xticklabels(['No Default', 'Default'])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)


def _target_share_bars(ax, column, title, rotate_labels=False, sort_by_default=False):
    """Plot, per category of `column`, the stacked share of each TARGET class on `ax`.

    Returns the row-normalised crosstab (fractions summing to 1 per category).
    """
    shares = pd.crosstab(app_train[column], app_train['TARGET'], normalize='index')
    if sort_by_default:
        shares = shares.sort_values(by=1, ascending=False)  # highest default rate first
    # Scale to 0-100 so the plotted values match the 'Percentage' axis label
    (shares * 100).plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Percentage')
    ax.legend(['No Default', 'Default'])
    if rotate_labels:
        ax.tick_params(axis='x', rotation=45)
    return shares


fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Credit amount by target
if 'AMT_CREDIT' in app_train.columns:
    _boxplot_by_target(axes[0, 0], app_train['AMT_CREDIT'],
                       'Credit Amount by Target', 'Credit Amount')

# 2. Income by target (log1p-transformed for better visualization)
if 'AMT_INCOME_TOTAL' in app_train.columns:
    _boxplot_by_target(axes[0, 1], np.log1p(app_train['AMT_INCOME_TOTAL']),
                       'Income (log scale) by Target', 'Log(Income + 1)')

# 3. Age by target
if 'DAYS_BIRTH' in app_train.columns:
    # Convert to years in a standalone Series. Adding an 'AGE' column to app_train
    # here would change app_train.shape and skew the column counts and
    # missing-data percentages reported by later cells.
    age_years = -app_train['DAYS_BIRTH'] / 365.25
    _boxplot_by_target(axes[0, 2], age_years, 'Age by Target', 'Age (Years)')

# 4. Contract type by target
if 'NAME_CONTRACT_TYPE' in app_train.columns:
    contract_target = _target_share_bars(
        axes[1, 0], 'NAME_CONTRACT_TYPE', 'Contract Type by Target', rotate_labels=True)

# 5. Gender by target
if 'CODE_GENDER' in app_train.columns:
    gender_target = _target_share_bars(axes[1, 1], 'CODE_GENDER', 'Gender by Target')

# 6. Education by target
if 'NAME_EDUCATION_TYPE' in app_train.columns:
    edu_target = _target_share_bars(
        axes[1, 2], 'NAME_EDUCATION_TYPE', 'Education Type by Target',
        rotate_labels=True, sort_by_default=True)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 📊 Correlation Analysis

# %%
# Calculate correlation matrix for key numeric features
key_numeric = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
    'CNT_CHILDREN', 'TARGET'
]

# Filter to columns that exist
key_numeric = [col for col in key_numeric if col in app_train.columns]

if len(key_numeric) > 1:
    corr_matrix = app_train[key_numeric].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=0.5)
    plt.title('Correlation Matrix of Key Features')
    plt.tight_layout()
    plt.show()

    # Show features most / least correlated with target
    if 'TARGET' in corr_matrix.columns:
        # Drop TARGET's correlation with itself (always 1.0), then rank by
        # absolute strength: a strongly negative correlation is still a strong
        # relationship, so sorting signed values would mislabel it as "least".
        signed_corr = corr_matrix['TARGET'].drop('TARGET')
        strength_order = signed_corr.abs().sort_values(ascending=False).index
        target_corr = signed_corr.reindex(strength_order)

        # Cap the list length so the two listings do not overlap
        top_n = max(1, min(10, len(target_corr) // 2))
        print("Features most correlated with TARGET:")
        print(target_corr.head(top_n))
        print("\nFeatures least correlated with TARGET:")
        print(target_corr.tail(top_n))

# %% [markdown]
# ## 📁 Explore Other Data Files

# %%
# Load bureau data with a fixed row limit.
# BUREAU_NROWS is forwarded to load_data as `nrows`; adjust as needed. The message
# is derived from it rather than claiming a percentage the code does not compute.
BUREAU_NROWS = 50000
print(f"Loading bureau data (row limit: {BUREAU_NROWS:,})...")
bureau = load_data(config['files']['bureau'], nrows=BUREAU_NROWS)

# The following cell treats a None result as "not loaded", so do not
# dereference .shape unconditionally here.
if bureau is not None:
    print(f"Bureau data shape: {bureau.shape}")
else:
    print("Bureau data could not be loaded")

# %%
# Summarise the bureau table when it was loaded
if bureau is not None:
    display_data_info(bureau, "Bureau Data")

    # How the table links to the main application: distinct client and loan ids
    client_count = bureau['SK_ID_CURR'].nunique()
    print(f"\nUnique clients in bureau: {client_count}")
    loan_count = bureau['SK_ID_BUREAU'].nunique()
    print(f"Unique loans in bureau: {loan_count}")

    # Ten most frequent credit types, if the column is present
    if 'CREDIT_TYPE' in bureau.columns:
        credit_type_counts = bureau['CREDIT_TYPE'].value_counts()
        print("\nCredit Types in Bureau:")
        print(credit_type_counts.head(10))

# %%
# Load previous application data with a fixed row limit.
# PREV_APP_NROWS is forwarded to load_data as `nrows`. The message is derived from
# it rather than claiming a percentage the code does not compute.
PREV_APP_NROWS = 50000
print(f"\nLoading previous application data (row limit: {PREV_APP_NROWS:,})...")
prev_app = load_data(config['files']['previous_application'], nrows=PREV_APP_NROWS)

# The following cell treats a None result as "not loaded", so do not
# dereference .shape unconditionally here.
if prev_app is not None:
    print(f"Previous application data shape: {prev_app.shape}")
else:
    print("Previous application data could not be loaded")

# %%
# Summarise the previous-application table when it was loaded
if prev_app is not None:
    display_data_info(prev_app, "Previous Application Data")

    # Contract-status breakdown: printed counts followed by a pie chart
    if 'NAME_CONTRACT_STATUS' in prev_app.columns:
        print("\nPrevious Application Status:")
        status_counts = prev_app['NAME_CONTRACT_STATUS'].value_counts()
        print(status_counts)

        status_fig, status_ax = plt.subplots(figsize=(10, 6))
        status_counts.plot.pie(ax=status_ax, autopct='%1.1f%%', startangle=90)
        status_ax.set_title('Previous Application Status Distribution')
        # Blank out the y label that the pie plot would otherwise show
        status_ax.set_ylabel('')
        plt.show()

# %% [markdown]
# ## 📝 Key Insights

# %%
# Print a text recap of the exploration. Every figure is computed from the data
# in scope so the recap cannot drift from what the earlier cells showed.
print("=" * 60)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print("=" * 60)

# Guard all label-based statistics: the earlier target cell is itself conditional
has_target = 'TARGET' in app_train.columns

print("\n1. TARGET DISTRIBUTION:")
if has_target:
    default_pct = app_train['TARGET'].value_counts(normalize=True).get(1, 0.0) * 100
    print(f"   - Only {default_pct:.2f}% of clients have payment difficulties")
    print("   - This is a highly imbalanced classification problem")
else:
    print("   - TARGET column not available in the loaded data")

print("\n2. DATA QUALITY:")
# Use the columns that the missing-value scan actually covered (len(missing)),
# not the current app_train.shape[1]: columns added to the frame after that scan
# would otherwise inflate the column count and dilute the percentage.
n_rows = app_train.shape[0]
n_scanned_cols = len(missing)
total_missing = missing.sum()
print(f"   - Training data has {n_rows:,} rows and {n_scanned_cols} columns")
print(f"   - {len(missing_df)} columns have missing values")
print(f"   - Overall missing data: {100 * total_missing / (n_rows * n_scanned_cols):.2f}%")

print("\n3. KEY PATTERNS:")
print("   - Default rate varies by demographic factors:")
if has_target and 'CODE_GENDER' in app_train.columns:
    gender_default = app_train.groupby('CODE_GENDER')['TARGET'].mean() * 100
    for gender, rate in gender_default.items():
        print(f"     * {gender}: {rate:.2f}% default rate")

if has_target and 'NAME_EDUCATION_TYPE' in app_train.columns:
    # Report the measured rates instead of a hard-coded conclusion
    edu_default = (
        app_train.groupby('NAME_EDUCATION_TYPE')['TARGET'].mean() * 100
    ).sort_values(ascending=False)
    print("   - Default rate by education level (highest first):")
    for level, rate in edu_default.items():
        print(f"     * {level}: {rate:.2f}% default rate")

print("\n4. NEXT STEPS:")
print("   - Need to handle imbalanced data (oversampling/undersampling)")
print("   - Need to handle missing values")
print("   - Need feature engineering from multiple data sources")
print("   - Consider ensemble methods for better performance")

# %% [markdown]
# ## 💾 Save Exploration Results

# %%
# Save summary statistics
import json

# pandas/numpy scalars (e.g. the int64 returned by missing.sum(), or the integer
# class labels used as keys of target_pct) are rejected by the json encoder, so
# every value is converted to a built-in Python type first.
if 'TARGET' in app_train.columns:
    target_distribution = {int(label): float(pct) for label, pct in target_pct.items()}
else:
    target_distribution = {}

total_missing = int(missing.sum())
# Cells covered by the missing-value scan: rows x columns that were scanned
scanned_cells = len(app_train) * len(missing)

summary_stats = {
    'target_distribution': target_distribution,
    'data_shape': [int(dim) for dim in app_train.shape],
    'missing_values': total_missing,
    'missing_percentage': float(100 * total_missing / scanned_cells),
    'numeric_features': len(numeric_cols),
    'categorical_features': len(categorical_cols)
}

# Save to file (written to the kernel's current working directory)
with open('exploration_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=4)

print("Exploration summary saved to exploration_summary.json")