In [7]:
import pandas as pd
import numpy as np

# Load the main application data
df = pd.read_csv('/Users/nived/Documents/CreditRisk360/data/home-credit-default-risk/application_train.csv')

# Basic overview
print("Shape:", df.shape)
display(df.head())

# Handle missing values (remove columns with >40% nulls)
nulls = df.isnull().mean()
df = df.drop(columns=nulls[nulls > 0.4].index)

# Fill remaining missing values (numerical with median, categorical with mode)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

# Encode categoricals
df = pd.get_dummies(df, drop_first=True)

# Save preprocessed file
df.to_csv('../data/clean_application.csv', index=False)


Shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
# Load the cleaned dataset
df = pd.read_csv('../data/clean_application.csv')

# Check class balance
sns.countplot(x='TARGET', data=df)
plt.title('Loan Repayment Status (0 = Paid, 1 = Default)')
plt.show()

# Check correlation with target
correlations = df.corr()['TARGET'].sort_values(ascending=False)

print("Top Positive Correlations:\n", correlations.head(10))
print("\nTop Negative Correlations:\n", correlations.tail(10))

# Plot top features correlated with TARGET
top_corr_features = correlations.abs().sort_values(ascending=False)[1:11].index  # Skip 'TARGET' itself

plt.figure(figsize=(10, 6))
sns.heatmap(df[top_corr_features].corr(), annot=True, cmap='coolwarm')
plt.title('Top Features Correlated with Default Risk')
plt.show()

# Distribution of a key numerical feature
sns.kdeplot(df[df['TARGET'] == 0]['AMT_INCOME_TOTAL'], label='Paid', shade=True)
sns.kdeplot(df[df['TARGET'] == 1]['AMT_INCOME_TOTAL'], label='Default', shade=True)
plt.title('Income Distribution by Loan Status')
plt.legend()
plt.show()


NameError: name 'pd' is not defined