In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Input and output locations -------------------------------------------
file_path = "loan_data.csv"  # Replace with your CSV file path
save_dir = "samples/"  # Directory to save the plots

# Create the output directory up front; an existing directory is not an error.
os.makedirs(save_dir, exist_ok=True)

# --- Data ------------------------------------------------------------------
# Only these columns are read from the CSV; every plot below works on this
# subset, with TARGET as the variable each feature is compared against.
columns_to_consider = [
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
    'DAYS_BIRTH',
    'DAYS_LAST_PHONE_CHANGE',
    'REGION_RATING_CLIENT_W_CITY',
    'REGION_RATING_CLIENT',
    'NAME_CONTRACT_STATUS',
    'NAME_INCOME_TYPE',
    'DAYS_ID_PUBLISH',
    'TARGET',
]
df = pd.read_csv(file_path, usecols=columns_to_consider)

# 1. Correlation Heatmap (Numeric Columns vs TARGET)
# `numeric_cols` is reused by the later per-column plot sections, so it stays
# a module-level name.
numeric_cols = df.select_dtypes(include="number").columns

# Single-column table: correlation of every numeric column with TARGET
# (pandas' default correlation method), sorted from highest to lowest.
# The TARGET row itself is part of the table.
target_corr = (
    df[numeric_cols]
    .corr()[['TARGET']]
    .sort_values(by='TARGET', ascending=False)
)

plt.figure(figsize=(10, 8))
sns.heatmap(target_corr, annot=True, cmap="coolwarm")
plt.title("Correlation with TARGET")
# The row labels are full column names and can be wider than the default
# left margin; bbox_inches="tight" expands the saved area so they are not
# cropped in the PNG.
plt.savefig(
    os.path.join(save_dir, "correlation_with_target.png"),
    bbox_inches="tight",
)
plt.close()

# 2. Distribution Plot (Numeric Columns vs TARGET)
# One stacked histogram with a KDE overlay per numeric column, split by TARGET.
for col in numeric_cols:
    if col == 'TARGET':
        continue  # TARGET is the hue variable, not a feature to plot
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=df, x=col, hue='TARGET', kde=True, multiple="stack", ax=ax)
    ax.set_title(f"Distribution of {col} by TARGET")
    fig.savefig(os.path.join(save_dir, f"distribution_{col}_by_target.png"))
    plt.close(fig)  # release the figure before the next column

# 3. Box Plot (Numeric Columns vs TARGET)
# TARGET goes on the x-axis, so it is excluded from the columns being plotted.
box_features = [name for name in numeric_cols if name != 'TARGET']
for col in box_features:
    box_path = os.path.join(save_dir, f"box_plot_{col}_vs_target.png")
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='TARGET', y=col)
    plt.title(f"Box Plot of {col} vs TARGET")
    plt.savefig(box_path)
    plt.close()

# 4. Count Plot (Categorical Columns vs TARGET)
# `categorical_cols` is reused by the bar-plot section further down, so it
# stays a module-level name.
categorical_cols = df.select_dtypes(include="object").columns
for col in categorical_cols:
    plt.figure(figsize=(8, 6))
    # One bar per (category, TARGET value) pair, showing row counts.
    sns.countplot(x=col, hue='TARGET', data=df)
    plt.title(f"Count Plot of {col} by TARGET")
    # Category names are drawn horizontally by default and run into each
    # other when a column has many or long categories; rotate them so each
    # label stays readable.
    plt.xticks(rotation=45, ha="right")
    # Rotated labels extend below the default bottom margin;
    # bbox_inches="tight" keeps them inside the saved image.
    plt.savefig(
        os.path.join(save_dir, f"count_plot_{col}_by_target.png"),
        bbox_inches="tight",
    )
    plt.close()

# 5. Scatter Plot (Numeric Columns vs TARGET)
# Each numeric column on the x-axis against TARGET on the y-axis.
for col in numeric_cols:
    if col == 'TARGET':
        continue  # skip plotting TARGET against itself
    scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=df, x=col, y='TARGET', ax=scatter_ax)
    scatter_ax.set_title(f"Scatter Plot of {col} vs TARGET")
    scatter_fig.savefig(os.path.join(save_dir, f"scatter_plot_{col}_vs_target.png"))
    plt.close(scatter_fig)

# 6. Bar Plot (Categorical Columns vs TARGET)
# Uses `categorical_cols` computed in the count-plot section above.
for col in categorical_cols:
    plt.figure(figsize=(8, 6))
    # One bar per category; bar height is seaborn's default aggregate of
    # TARGET within that category.
    sns.barplot(x=col, y='TARGET', data=df)
    plt.title(f"Bar Plot of {col} vs TARGET")
    # Category names are drawn horizontally by default and run into each
    # other when a column has many or long categories; rotate them so each
    # label stays readable.
    plt.xticks(rotation=45, ha="right")
    # Rotated labels extend below the default bottom margin;
    # bbox_inches="tight" keeps them inside the saved image.
    plt.savefig(
        os.path.join(save_dir, f"bar_plot_{col}_vs_target.png"),
        bbox_inches="tight",
    )
    plt.close()

# 7. Violin Plot (Numeric Columns vs TARGET)
# Same layout as the box plots: TARGET on the x-axis, one figure per
# remaining numeric column.
for col in (name for name in numeric_cols if name != 'TARGET'):
    violin_path = os.path.join(save_dir, f"violin_plot_{col}_vs_target.png")
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=df, x='TARGET', y=col)
    plt.title(f"Violin Plot of {col} vs TARGET")
    plt.savefig(violin_path)
    plt.close()

# Final status line so the user knows where to find the images.
print(f"EDA plots saved to '{save_dir}'")

EDA plots saved to 'samples/'
