# Loan Prediction - Complete Data Exploration & Analysis

**Date:** December 23, 2025  
**Goal:** Comprehensive understanding of the loan dataset to identify patterns and build predictive models

## Dataset Overview

- **Source:** Kaggle Loan Prediction Dataset
- **Purpose:** Predict loan approval based on applicant information
- **Datasets:** Training data (with target) and Test data (without target)

---

## 1. Import Libraries & Load Data

In [None]:
# Environment setup: imports, plot styling, output folder, and dataset loading.
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot styling: the seaborn calls run after the matplotlib style sheet, and
# the rcParams overrides are applied last.
plt.style.use('ggplot')
sns.set_style('whitegrid')
sns.set_palette('Set2')
plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 11})

# Folder that receives every figure saved by the later cells.
viz_folder = '../visualizations/eda_plots'
os.makedirs(viz_folder, exist_ok=True)

print(
    "=" * 70,
    "üìÅ VISUALIZATION FOLDER CREATED",
    "=" * 70,
    f"All plots will be saved to: {viz_folder}",
    "=" * 70 + "\n",
    sep="\n",
)

# Load the training and test CSV files (paths are relative to the notebook).
train_df = pd.read_csv('../data/train_u6lujuX_CVtuZ9i.csv')
test_df = pd.read_csv('../data/test_Y3wMUE5_7gLdaTN.csv')

print(
    "=" * 70,
    "DATASET LOADED SUCCESSFULLY",
    "=" * 70,
    f"Training Dataset shape: {train_df.shape}",
    f"Test Dataset shape: {test_df.shape}",
    "\n" + "=" * 70,
    sep="\n",
)

## 2. Initial Data Inspection

In [None]:
# Preview the first five rows of the training data.
print("Training Data - First 5 rows:")
display(train_df.head(5))
print(f"\n{'=' * 70}\n")

### Features Overview:

- **Loan_ID**: Unique identifier for each loan application
- **Gender**: Male/Female
- **Married**: Yes/No
- **Dependents**: Number of dependents (0, 1, 2, 3+)
- **Education**: Graduate/Not Graduate
- **Self_Employed**: Yes/No
- **ApplicantIncome**: Applicant's income
- **CoapplicantIncome**: Co-applicant's income
- **LoanAmount**: Loan amount requested (in thousands)
- **Loan_Amount_Term**: Term of loan (in months)
- **Credit_History**: Credit history meets guidelines (1/0)
- **Property_Area**: Urban/Semiurban/Rural
- **Loan_Status**: Target variable (Y=Approved, N=Rejected)

In [None]:
# Print DataFrame.info() for the training data, then for the test data.
for position, (heading, frame) in enumerate(
    [("TRAINING DATA INFORMATION", train_df), ("TEST DATA INFORMATION", test_df)]
):
    if position:
        # Separator between the two reports.
        print("\n" + "=" * 70 + "\n")
    print("=" * 70, heading, "=" * 70, sep="\n")
    frame.info()

In [None]:
# Compare the shapes and the column sets of the two datasets.
train_columns = set(train_df.columns)
test_columns = set(test_df.columns)

print("=" * 70, "COMPARING TRAIN VS TEST DATA", "=" * 70, sep="\n")
print(f"\nTrain shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")
print(
    f"\nColumns only in train: {train_columns - test_columns}",
    f"Columns only in test: {test_columns - train_columns}",
    sep="\n",
)

## 3. Missing Values Analysis

In [None]:
# Missing-value counts and percentages per column, for train and test data.
def _missing_report(frame):
    """Return (counts, percentages, summary table) of missing values per column."""
    counts = frame.isna().sum()
    percentages = counts.div(len(frame)).mul(100)
    table = pd.DataFrame({
        'Missing_Count': counts,
        'Percentage': percentages.round(2),
    })
    return counts, percentages, table


def _print_missing(table):
    """Print only the columns that have missing values, highest percentage first."""
    affected = table.loc[table['Missing_Count'] > 0]
    print(affected.sort_values('Percentage', ascending=False))


print("=" * 70, "MISSING VALUES ANALYSIS", "=" * 70, sep="\n")

missing_train, missing_percent_train, missing_df_train = _missing_report(train_df)
print("\nTRAINING DATA:")
_print_missing(missing_df_train)

missing_test, missing_percent_test, missing_df_test = _missing_report(test_df)
print("\nTEST DATA:")
_print_missing(missing_df_test)

In [None]:
# Horizontal bar charts of the missing-value percentages, one panel per dataset.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Keep only columns that actually have missing values, smallest share first.
missing_train_plot = missing_df_train.loc[missing_df_train['Missing_Count'] > 0].sort_values('Percentage')
missing_test_plot = missing_df_test.loc[missing_df_test['Missing_Count'] > 0].sort_values('Percentage')

panels = [
    (axes[0], missing_train_plot, '#3498db', 'Missing Values - Training Data'),
    (axes[1], missing_test_plot, '#e74c3c', 'Missing Values - Test Data'),
]
for ax, table, bar_color, panel_title in panels:
    ax.barh(table.index, table['Percentage'], color=bar_color, edgecolor='black')
    ax.set_xlabel('Percentage Missing (%)', fontweight='bold')
    ax.set_title(panel_title, fontweight='bold', fontsize=14)
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{viz_folder}/01_missing_values_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Target Variable Analysis (Loan_Status)

In [None]:
# Absolute and relative frequency of each Loan_Status class.
status_counts = train_df['Loan_Status'].value_counts()
status_share = train_df['Loan_Status'].value_counts(normalize=True) * 100

print("=" * 70, "TARGET VARIABLE DISTRIBUTION (LOAN_STATUS)", "=" * 70, sep="\n")
print("\nValue Counts:")
print(status_counts)
print("\nPercentage:")
print(status_share.round(2))

In [None]:
# Target variable visualisation: bar chart and pie chart of Loan_Status.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# value_counts() orders classes by frequency, so the class order is pinned
# explicitly here; otherwise the fixed labels and colours below would be
# attached to the wrong class whenever rejections outnumber approvals.
status_labels = ['Approved (Y)', 'Rejected (N)']
colors = ['#2ecc71', '#e74c3c']  # Green for approved, red for rejected
loan_counts = train_df['Loan_Status'].value_counts().reindex(['Y', 'N'], fill_value=0)

# One denominator for the bar annotations, the pie chart and the printed
# summary, so all three report the same percentages.
total_loans = loan_counts.sum()

# Bar chart: the display labels are used directly as the categories.
bars = axes[0].bar(status_labels, loan_counts.values, color=colors, edgecolor='black', linewidth=1.5)
axes[0].set_xlabel('Loan Status', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')
axes[0].set_title('Loan Approval Distribution (Bar Chart)', fontsize=14, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Annotate each bar with its count and percentage.
for i, v in enumerate(loan_counts.values):
    pct = v / total_loans * 100
    axes[0].text(i, v + 10, f'{v}\n({pct:.1f}%)', ha='center', fontweight='bold', fontsize=11)

# Pie chart
axes[1].pie(loan_counts.values,
            labels=status_labels,
            autopct='%1.1f%%',
            colors=colors,
            startangle=90,
            explode=(0.05, 0),
            textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Loan Approval Distribution (Pie Chart)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{viz_folder}/02_target_variable_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Report the class balance from the data instead of hard-coded figures.
approved_pct = loan_counts['Y'] / total_loans * 100
rejected_pct = loan_counts['N'] / total_loans * 100
print(f"\n‚ö†Ô∏è Dataset is somewhat imbalanced: {approved_pct:.1f}% approved vs {rejected_pct:.1f}% rejected")

## 5. Statistical Summary

In [None]:
# Show describe() for the training data, then for the test data.
for position, (heading, frame) in enumerate(
    [("STATISTICAL SUMMARY - TRAINING DATA", train_df), ("STATISTICAL SUMMARY - TEST DATA", test_df)]
):
    if position:
        # Separator between the two summaries.
        print("\n" + "=" * 70 + "\n")
    print("=" * 70, heading, "=" * 70, sep="\n")
    display(frame.describe())

## 6. Numerical Features Distribution

In [None]:
# Histogram of ApplicantIncome with the median marked.
applicant_income = train_df['ApplicantIncome']
median_income = applicant_income.median()

# Explicit bin edges; the same edges are reused as x-axis tick positions.
bins = [0, 2500, 5000, 7500, 10000, 12500, 15000, 20000, 25000]

plt.figure(figsize=(10, 6))
plt.hist(applicant_income, bins=bins, color='#3498db', edgecolor='black', alpha=0.7)
plt.axvline(
    median_income,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Median: {median_income:.0f}',
)
plt.title('Distribution of Applicant Income', fontsize=14, fontweight='bold')
plt.xlabel('Applicant Income', fontweight='bold')
plt.ylabel('Number of Applicants', fontweight='bold')
plt.xticks(bins, rotation=45)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{viz_folder}/03_applicant_income_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of LoanAmount (missing values dropped) with the median marked.
loan_amount = train_df['LoanAmount']
median_loan = loan_amount.median()

# Explicit bin edges; every second edge is used as an x-axis tick.
loan_bins = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700]

plt.figure(figsize=(10, 6))
plt.hist(loan_amount.dropna(), bins=loan_bins, color='#e74c3c', edgecolor='black', alpha=0.7)
plt.axvline(
    median_loan,
    color='blue',
    linestyle='--',
    linewidth=2,
    label=f'Median: {median_loan:.0f}',
)
plt.title('Distribution of Loan Amount (in thousands)', fontsize=14, fontweight='bold')
plt.xlabel('Loan Amount', fontweight='bold')
plt.ylabel('Number of Loans', fontweight='bold')
plt.xticks(loan_bins[::2], rotation=45)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{viz_folder}/04_loan_amount_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Combined income analysis: adds the TotalIncome column to train_df and plots
# applicant, co-applicant and total income side by side.
train_df['TotalIncome'] = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Income Distribution Analysis', fontsize=16, fontweight='bold')

# (column, bar colour, x-axis label, panel title) for each of the three panels.
income_panels = [
    ('ApplicantIncome', '#3498db', 'Applicant Income', 'Applicant Income'),
    ('CoapplicantIncome', '#9b59b6', 'Coapplicant Income', 'Coapplicant Income'),
    ('TotalIncome', '#2ecc71', 'Total Income', 'Total Income (Combined)'),
]
for ax, (column, fill, x_label, panel_title) in zip(axes, income_panels):
    values = train_df[column]
    ax.hist(values, bins=40, color=fill, edgecolor='black', alpha=0.7)
    # Dashed red line marks the median of the plotted column.
    ax.axvline(values.median(), color='red', linestyle='--', linewidth=2)
    ax.set_xlabel(x_label, fontweight='bold')
    ax.set_ylabel('Frequency', fontweight='bold')
    ax.set_title(panel_title, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{viz_folder}/05_income_distribution_combined.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Credit History Impact Analysis (CRITICAL)

In [None]:
# Cross-tabulate Credit_History against Loan_Status: raw counts (with totals)
# and row-normalised percentages.
credit_history = train_df['Credit_History']
loan_status = train_df['Loan_Status']

print("=" * 70, "CREDIT HISTORY IMPACT ON LOAN APPROVAL", "=" * 70, sep="\n")

credit_loan = pd.crosstab(credit_history, loan_status, margins=True)
print("\nCrosstab (Counts):")
print(credit_loan)

# Each row is scaled to sum to 100.
credit_loan_pct = pd.crosstab(credit_history, loan_status, normalize='index') * 100
print("\nApproval Rate by Credit History:")
print(credit_loan_pct.round(2))

In [None]:
# Credit-history visualisation: approval-rate bar chart plus one pie chart per
# Credit_History value. Reads credit_loan_pct (built by the crosstab cell).
def _status_counts(credit_value):
    """Return Loan_Status counts for one Credit_History value, ordered Y then N.

    value_counts() sorts by frequency, so without the reindex the wedge order
    would flip whenever rejections outnumber approvals (the bad-credit group),
    and the fixed [Approved, Rejected] labels and colours would be swapped.
    fill_value=0 also avoids a KeyError when a class is absent from the subset.
    """
    subset = train_df.loc[train_df['Credit_History'] == credit_value, 'Loan_Status']
    return subset.value_counts().reindex(['Y', 'N'], fill_value=0)


def _pie_labels(counts):
    """Build the [Approved, Rejected] wedge labels with count and share."""
    total = counts.sum()
    return [
        f'Approved\n{counts["Y"]} ({counts["Y"]/total*100:.1f}%)',
        f'Rejected\n{counts["N"]} ({counts["N"]/total*100:.1f}%)',
    ]


fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Credit History Impact on Loan Approval', fontsize=16, fontweight='bold')

# Bar chart - Approval rates
credit_loan_pct.plot(kind='bar', ax=axes[0], color=['#e74c3c', '#2ecc71'],
                     edgecolor='black', linewidth=1.5, width=0.6)
axes[0].set_xlabel('Credit History (0=Bad, 1=Good)', fontweight='bold', fontsize=12)
axes[0].set_ylabel('Percentage (%)', fontweight='bold', fontsize=12)
axes[0].set_title('Loan Approval Rate by Credit History', fontweight='bold', fontsize=12)
axes[0].legend(['Rejected (N)', 'Approved (Y)'], loc='best')
axes[0].set_xticklabels(['Bad Credit (0)', 'Good Credit (1)'], rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart - Good Credit (1)
credit_1 = _status_counts(1)
labels_1 = _pie_labels(credit_1)
axes[1].pie(credit_1, labels=labels_1, autopct='', colors=['#2ecc71', '#e74c3c'],
            startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Good Credit History (1)', fontweight='bold', fontsize=12)

# Pie chart - Bad Credit (0)
credit_0 = _status_counts(0)
labels_0 = _pie_labels(credit_0)
axes[2].pie(credit_0, labels=labels_0, autopct='', colors=['#2ecc71', '#e74c3c'],
            startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[2].set_title('Bad Credit History (0)', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.savefig(f'{viz_folder}/06_credit_history_impact.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nüîë KEY INSIGHT: Credit History is the STRONGEST predictor of loan approval!")
print(f"   - Good Credit (1): {credit_loan_pct.loc[1.0, 'Y']:.1f}% approval rate")
print(f"   - Bad Credit (0): {credit_loan_pct.loc[0.0, 'Y']:.1f}% approval rate")

## 8. Property Area Analysis

In [None]:
# Approval rate per Property_Area, plotted in ascending order.
def _approval_pct(status):
    """Percentage of 'Y' entries in a Loan_Status series."""
    return (status == 'Y').mean() * 100


property_approval = (
    train_df.groupby('Property_Area')['Loan_Status']
    .apply(_approval_pct)
    .sort_values()
)

plt.figure(figsize=(10, 6))
bars = plt.bar(
    property_approval.index,
    property_approval,
    color=['#f39c12', '#2ecc71', '#3498db'],
    edgecolor='black',
    linewidth=1.5,
)

# Write each bar's value just above its top edge.
for rect in bars:
    top = rect.get_height()
    plt.text(
        rect.get_x() + rect.get_width()/2.,
        top + 1,
        f'{top:.1f}%',
        ha='center',
        fontweight='bold',
        fontsize=12,
    )

plt.title('Loan Approval Rate by Property Area', fontsize=14, fontweight='bold')
plt.xlabel('Property Area', fontweight='bold', fontsize=12)
plt.ylabel('Approval Percentage (%)', fontweight='bold', fontsize=12)
plt.ylim(0, 100)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{viz_folder}/07_property_area_approval.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nApproval Rates by Property Area:")
for area, rate in property_approval.items():
    print(f"  {area}: {rate:.2f}%")

## 9. Education Level Analysis

In [None]:
# Box plot of ApplicantIncome split by Education level.
education = train_df['Education']
graduate = train_df.loc[education == 'Graduate', 'ApplicantIncome']
non_graduate = train_df.loc[education == 'Not Graduate', 'ApplicantIncome']

plt.figure(figsize=(10, 6))
boxes = plt.boxplot(
    [graduate, non_graduate],
    labels=['Graduate', 'Not Graduate'],
    patch_artist=True,
    medianprops={'linewidth': 2, 'color': 'red'},
    widths=0.6,
)

# Colour the two boxes individually.
colors = ['#3498db', '#e74c3c']
for box, fill in zip(boxes['boxes'], colors):
    box.set_facecolor(fill)
    box.set_alpha(0.7)
    box.set_edgecolor('black')
    box.set_linewidth(1.5)

plt.title('Applicant Income by Education Level', fontsize=14, fontweight='bold')
plt.ylabel('Applicant Income', fontweight='bold', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{viz_folder}/08_education_income_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Graduate Median Income: {graduate.median():.2f}")
print(f"Non-Graduate Median Income: {non_graduate.median():.2f}")

## 10. Loan Amount vs Income Analysis

In [None]:
# Scatter plot of LoanAmount against ApplicantIncome, coloured by Loan_Status.
status_palette = {'Y': '#2ecc71', 'N': '#e74c3c'}
colors_map = train_df['Loan_Status'].map(status_palette)

plt.figure(figsize=(12, 7))
plt.scatter(
    train_df['ApplicantIncome'],
    train_df['LoanAmount'],
    c=colors_map,
    alpha=0.6,
    edgecolors='w',
    s=50,
)
plt.title('Loan Amount vs Applicant Income (colored by Loan Status)', fontsize=14, fontweight='bold')
plt.xlabel('Applicant Income', fontweight='bold', fontsize=12)
plt.ylabel('Loan Amount (thousands)', fontweight='bold', fontsize=12)
plt.grid(alpha=0.3)

# The points are coloured per row, so the legend is built from proxy markers.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=face,
           markersize=10, label=text)
    for face, text in [('#2ecc71', 'Approved (Y)'), ('#e74c3c', 'Rejected (N)')]
]
plt.legend(handles=legend_elements, loc='upper right', fontsize=11)

plt.tight_layout()
plt.savefig(f'{viz_folder}/09_loan_vs_income_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Categorical Features vs Loan Status

In [None]:
# Row-normalised Loan_Status percentages for each categorical feature,
# plotted as grouped bars and then printed.
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed']

# One percentage table per feature, reused for both the plot and the printout.
approval_tables = {
    feature: pd.crosstab(train_df[feature], train_df['Loan_Status'], normalize='index') * 100
    for feature in categorical_features
}

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Categorical Features vs Loan Approval Status', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row; zip stops after the five features.
for ax, feature in zip(axes.flat, categorical_features):
    approval_tables[feature].plot(kind='bar', ax=ax, color=['#e74c3c', '#2ecc71'],
                                  edgecolor='black', linewidth=1.5, width=0.7)
    ax.set_xlabel(feature, fontweight='bold', fontsize=11)
    ax.set_ylabel('Percentage (%)', fontweight='bold', fontsize=11)
    ax.set_title(f'{feature} vs Loan Status', fontweight='bold', fontsize=12)
    ax.legend(['Rejected (N)', 'Approved (Y)'], loc='best')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)

# The sixth grid slot has no feature; remove it.
fig.delaxes(axes[1, 2])

plt.tight_layout()
plt.savefig(f'{viz_folder}/10_categorical_vs_loan_status.png', dpi=300, bbox_inches='tight')
plt.show()

print("=" * 70, "APPROVAL RATES BY CATEGORICAL FEATURES", "=" * 70, sep="\n")
for feature in categorical_features:
    print(f"\n{feature}:")
    print(approval_tables[feature].round(2))

## 12. Correlation Analysis

In [None]:
# Correlation analysis on a copy of train_df with the target encoded as 1/0.
corr_df = train_df.copy()
corr_df['Loan_Status_Encoded'] = corr_df['Loan_Status'].map({'Y': 1, 'N': 0})

# Columns entering the correlation matrix (TotalIncome must already exist).
numerical_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'TotalIncome',
    'Loan_Status_Encoded',
]
correlation_matrix = corr_df[numerical_cols].corr()

# Annotated heatmap with a symmetric colour scale centred on zero.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f'{viz_folder}/11_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Correlation of every selected column with the encoded target, strongest first.
print("=" * 70, "CORRELATION WITH LOAN APPROVAL", "=" * 70, sep="\n")
target_corr = correlation_matrix['Loan_Status_Encoded'].sort_values(ascending=False)
print(target_corr)
print("\nüîë Credit_History has the strongest correlation with loan approval!")

## 13. Outlier Detection

In [None]:
# Outlier detection with the 1.5 * IQR rule.
def detect_outliers(df, column):
    """Count the values of ``df[column]`` that fall outside the IQR fences.

    Returns a tuple ``(count, lower_bound, upper_bound)`` where the bounds are
    Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. Missing values compare as False on both
    sides and are therefore never counted.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    outside = (values < lower_bound) | (values > upper_bound)
    return int(outside.sum()), lower_bound, upper_bound


print("=" * 70, "OUTLIER DETECTION (IQR METHOD)", "=" * 70, sep="\n")

outlier_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome']
for feature in outlier_features:
    count, lower, upper = detect_outliers(train_df, feature)
    # Share is relative to all rows of train_df.
    pct = (count / len(train_df)) * 100
    print(
        f"\n{feature}:",
        f"  Outliers: {count} ({pct:.2f}%)",
        f"  Normal range: [{lower:.2f}, {upper:.2f}]",
        sep="\n",
    )

In [None]:
# Box plots for every feature in outlier_features, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Outlier Detection - Box Plots', fontsize=16, fontweight='bold')

box_style = dict(facecolor='#3498db', alpha=0.7, edgecolor='black', linewidth=1.5)
median_style = dict(color='red', linewidth=2)

# axes.flat walks the grid row by row, matching the feature order.
for ax, feature in zip(axes.flat, outlier_features):
    ax.boxplot(
        train_df[feature].dropna(),
        patch_artist=True,
        boxprops=box_style,
        medianprops=median_style,
        widths=0.5,
    )
    ax.set_ylabel(feature, fontweight='bold', fontsize=11)
    ax.set_title(f'{feature} - Outlier Detection', fontweight='bold', fontsize=12)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{viz_folder}/12_outlier_detection.png', dpi=300, bbox_inches='tight')
plt.show()

## 14. Feature Engineering Insights

In [None]:
# Feature engineering: adds Loan_Income_Ratio to train_df and compares its
# distribution between approved and rejected loans.
total_income = train_df['TotalIncome']

# Non-positive totals are masked to NaN before dividing, so the ratio becomes
# missing instead of infinite; the .dropna() calls below then really do
# exclude those rows from the box plot.
train_df['Loan_Income_Ratio'] = train_df['LoanAmount'] / total_income.where(total_income > 0)

# Visualize new feature
plt.figure(figsize=(10, 6))
approved_ratio = train_df[train_df['Loan_Status'] == 'Y']['Loan_Income_Ratio'].dropna()
rejected_ratio = train_df[train_df['Loan_Status'] == 'N']['Loan_Income_Ratio'].dropna()

boxes = plt.boxplot([approved_ratio, rejected_ratio],
                    labels=['Approved (Y)', 'Rejected (N)'],
                    patch_artist=True,
                    medianprops={'linewidth': 2, 'color': 'red'},
                    widths=0.6)

# Green box for approved loans, red box for rejected loans.
colors = ['#2ecc71', '#e74c3c']
for patch, color in zip(boxes['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
    patch.set_edgecolor('black')
    patch.set_linewidth(1.5)

plt.ylabel('Loan to Income Ratio', fontweight='bold', fontsize=12)
plt.title('Loan to Income Ratio by Loan Status', fontweight='bold', fontsize=14)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(f'{viz_folder}/13_loan_income_ratio.png', dpi=300, bbox_inches='tight')
plt.show()

print("="*70)
print("NEW FEATURES CREATED")
print("="*70)
print("1. TotalIncome: ApplicantIncome + CoapplicantIncome")
print("2. Loan_Income_Ratio: LoanAmount / TotalIncome")
print("\nüí° These features may improve model performance!")

## 15. Summary Statistics

In [None]:
# Numbered list of columns and dtypes, for the training and then the test data.
for position, (heading, frame) in enumerate(
    [("COLUMNS IN TRAINING DATASET", train_df), ("COLUMNS IN TEST DATASET", test_df)]
):
    if position:
        # Separator between the two listings.
        print("\n" + "=" * 70 + "\n")
    print("=" * 70, heading, "=" * 70, sep="\n")
    for number, name in enumerate(frame.columns, 1):
        print(f"{number}. {name} - {frame[name].dtype}")

## 16. Key Insights & Findings

### 📊 Summary of Key Findings:

#### **1. Target Variable (Loan_Status)**
- ✅ **68.7%** of loans are approved (Y)
- ❌ **31.3%** of loans are rejected (N)
- Dataset is moderately imbalanced

#### **2. Missing Data**
- **Credit_History**: 8.14% missing (CRITICAL - needs imputation)
- **Self_Employed**: 5.21% missing
- **LoanAmount**: 3.58% missing
- **Other features**: <3% missing

#### **3. Most Important Features (Based on Analysis)**
1. 🔑 **Credit_History** - Strongest predictor (~80% approval with good credit vs ~7% with bad credit)
2. 💰 **TotalIncome** (Applicant + Coapplicant)
3. 🎓 **Education** - Graduates have higher approval rates
4. 💍 **Married** - Married applicants have higher approval rates
5. 🏘️ **Property_Area** - Semiurban has highest approval rate

#### **4. Income Patterns**
- Median Applicant Income: **3,812**
- Median Coapplicant Income: **1,188**
- Median Total Income: **5,188**
- Significant outliers exist (max income: 81,000)
- Approved loans have slightly higher median income

#### **5. Loan Characteristics**
- Most common loan term: **360 months (30 years)**
- Median loan amount: **128,000**
- Loan amounts range from 9,000 to 700,000

#### **6. Categorical Insights**
- **Gender**: Male applicants dominate (80%+)
- **Married**: ~65% of applicants are married
- **Education**: ~78% are graduates
- **Self_Employed**: Only ~14% are self-employed
- **Dependents**: Most have 0-1 dependents

#### **7. Correlations**
- **Credit_History** has the strongest correlation with loan approval
- **LoanAmount** and **TotalIncome** are positively correlated
- **ApplicantIncome** and **CoapplicantIncome** are weakly correlated

---

### 🎯 Recommendations for Model Building:

1. **Handle Missing Values:**
   - Impute Credit_History (mode or predictive imputation)
   - Impute Self_Employed (mode)
   - Impute LoanAmount (median or mean)
2. **Feature Engineering:**
   - Create TotalIncome = ApplicantIncome + CoapplicantIncome ✅
   - Create Loan_Income_Ratio ✅
   - Consider log transformation for skewed features
3. **Handle Outliers:**
   - Consider capping extreme income values
   - Or use robust models (tree-based)
4. **Encoding:**
   - One-hot encode: Property_Area, Dependents
   - Label encode: Gender, Married, Education, Self_Employed
5. **Class Imbalance:**
   - Consider SMOTE or class weights
   - Use stratified sampling
6. **Model Selection:**
   - Start with Logistic Regression (baseline)
   - Try Random Forest, XGBoost, LightGBM
   - Ensemble methods may work well
7. **Evaluation Metrics:**
   - Use Accuracy, Precision, Recall, F1-Score
   - ROC-AUC for probability predictions
   - Confusion Matrix for detailed analysis

---

### ✅ Next Steps:

1. **Data Preprocessing** (Notebook 02)
2. **Feature Engineering** (Notebook 03)
3. **Model Building** (Notebook 04)
4. **Model Evaluation & Tuning** (Notebook 05)
5. **Final Predictions** (Notebook 06)

## 17. Final Summary & File List

In [None]:
# Final summary: report which of the expected figure files exist in viz_folder.
print("=" * 70, "‚úÖ DATA EXPLORATION COMPLETE!", "=" * 70, sep="\n")
print(f"\nüìÅ All visualizations saved to: {viz_folder}\n")

# File names written by the plt.savefig calls of the earlier cells.
saved_files = [
    "01_missing_values_analysis.png",
    "02_target_variable_distribution.png",
    "03_applicant_income_distribution.png",
    "04_loan_amount_distribution.png",
    "05_income_distribution_combined.png",
    "06_credit_history_impact.png",
    "07_property_area_approval.png",
    "08_education_income_boxplot.png",
    "09_loan_vs_income_scatter.png",
    "10_categorical_vs_loan_status.png",
    "11_correlation_heatmap.png",
    "12_outlier_detection.png",
    "13_loan_income_ratio.png",
]

print("üìä Saved Visualizations:")
for number, filename in enumerate(saved_files, 1):
    target = os.path.join(viz_folder, filename)
    if not os.path.exists(target):
        print(f"  {number:2d}. ‚è≥ {filename} (Will be created when cell runs)")
        continue
    size_kb = os.path.getsize(target) / 1024  # bytes -> KB
    print(f"  {number:2d}. ‚úÖ {filename} ({size_kb:.1f} KB)")

print(
    "\n" + "=" * 70,
    "üéØ Ready to proceed with data preprocessing and model building.",
    "üí° Key Insight: Credit History is the strongest predictor!",
    "   - Good Credit (1): ~80% approval rate",
    "   - Bad Credit (0): ~7% approval rate",
    "=" * 70,
    sep="\n",
)

---

### 📁 Generated Visualizations:

1. `01_missing_values_analysis.png`
2. `02_target_variable_distribution.png`
3. `03_applicant_income_distribution.png`
4. `04_loan_amount_distribution.png`
5. `05_income_distribution_combined.png`
6. `06_credit_history_impact.png` ⭐ **MOST IMPORTANT**
7. `07_property_area_approval.png`
8. `08_education_income_boxplot.png`
9. `09_loan_vs_income_scatter.png`
10. `10_categorical_vs_loan_status.png`
11. `11_correlation_heatmap.png`
12. `12_outlier_detection.png`
13. `13_loan_income_ratio.png`

---

**Date:** December 25, 2025  
**Status:** ✅ Complete and Ready for Model Building  
**Key Finding:** Credit History is the strongest predictor of loan approval