# EDA for Credit Card Dataset

This notebook is organized into the following sections:

1. Install Dependencies
2. Imports
3. Load Data
4. Data Inspection
5. Data Cleaning
6. Descriptive Statistics
7. Correlation Analysis
8. Correlation Heatmap
9. Collinearity Analysis (VIF)
10. Feature-Target Separation and Train/Validation/Test Split

## 1) Install Dependencies

In [None]:
%pip install scikit-learn statsmodels

## 2) Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

## 3) Load Data

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
data_path = 'data/UCI_Credit_card.csv'
df = pd.read_csv(data_path)

# First look: (rows, columns) followed by the leading records
print('Data shape:', df.shape)
print(df.head())

## 4) Data Inspection

In [None]:
# Column dtypes
print(df.dtypes)

# Count of missing values per column
print('\nMissing values:')
print(df.isnull().sum())

# Duplicate rows.
# ID is presumably a unique per-row identifier (it is dropped as irrelevant in
# the cleaning step), so comparing whole rows with ID included could never flag
# a duplicate. Compare on every column except ID instead.
# NOTE(review): two distinct customers with identical attributes are treated as
# duplicates here and only the first is kept - confirm this is intended.
dedup_cols = [col for col in df.columns if col != 'ID']
# `or None` falls back to whole-row comparison if no other column exists
num_duplicates = df.duplicated(subset=dedup_cols or None).sum()
print(f'\nNumber of duplicate rows: {num_duplicates}')
df = df.drop_duplicates(subset=dedup_cols or None)

## 5) Data Cleaning

In [None]:
# Drop irrelevant columns.
# errors='ignore' keeps this cell re-runnable: `df` is rebound in place, so on a
# second execution ID is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['ID'], errors='ignore')

# Recode EDUCATION: fold the codes 0, 5 and 6 into code 4
# (presumably the catch-all "other" level - confirm against the data dictionary)
df['EDUCATION'] = df['EDUCATION'].replace({0: 4, 5: 4, 6: 4})

# Recode MARRIAGE: fold code 0 into code 3
# (presumably the catch-all "other" level - confirm against the data dictionary)
df['MARRIAGE'] = df['MARRIAGE'].replace({0: 3})

# Inspect cleaned data
print('After cleaning, shape:', df.shape)
print(df.head())

## 6) Descriptive Statistics

In [None]:
# Summary statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

## 7) Correlation Analysis

In [None]:
# Pairwise correlations across every numeric column of the frame
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
corr_matrix = df[numeric_cols].corr()
print('Correlation Matrix:') 
print(corr_matrix)

# Collect column pairs whose absolute correlation exceeds the threshold.
# Only the strict upper triangle is visited, so each unordered pair appears
# once and the diagonal (self-correlation) is skipped.
threshold = 0.8
upper_rows, upper_cols = np.triu_indices(len(numeric_cols), k=1)
high_corr_pairs = [
    (numeric_cols[i], numeric_cols[j], corr_matrix.iloc[i, j])
    for i, j in zip(upper_rows, upper_cols)
    if abs(corr_matrix.iloc[i, j]) > threshold
]

if not high_corr_pairs:
    print(f'\nNo pairs with |corr| > {threshold}')
else:
    print(f'\nHighly correlated pairs (|corr| > {threshold}):')
    for var1, var2, val in high_corr_pairs:
        print(f'{var1} - {var2}: {val:.2f}')

## 8) Correlation Heatmap

In [None]:
# Draw the correlation matrix on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=False,
    fmt='.2f',
    xticklabels=numeric_cols,
    yticklabels=numeric_cols,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap', fontsize=16)
# Rotate the x tick labels to vertical; both axes use the same label size
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=10)
plt.setp(ax.get_yticklabels(), fontsize=10)
fig.tight_layout()
plt.show()

## 9) Collinearity Analysis (VIF)

In [None]:
# Calculate the Variance Inflation Factor (VIF) for each numeric predictor.
#
# Two corrections relative to a naive call on df[numeric_cols]:
#   * the target column is excluded - VIF measures collinearity among the
#     predictors, and the response must not act as a regressor;
#   * an explicit intercept column is prepended - variance_inflation_factor
#     regresses each column on the other columns of the matrix it is given and
#     does not add a constant itself, so without one the values are inflated.
vif_target = 'default.payment.next.month'
vif_cols = [col for col in numeric_cols if col != vif_target]
vif_inputs = df[vif_cols].dropna()

# Column 0 is the intercept; predictor k sits at column k + 1
vif_design = np.column_stack([
    np.ones(len(vif_inputs)),
    vif_inputs.to_numpy(dtype=float),
])

vif_data = pd.DataFrame({
    'Feature': vif_cols,
    # Start at 1 so no VIF is reported for the intercept column itself
    'VIF': [
        variance_inflation_factor(vif_design, col_idx)
        for col_idx in range(1, vif_design.shape[1])
    ],
})
print(vif_data)

## 10) Feature-Target Separation and Train/Validation/Test Split

In [None]:
# Separate the label column from the feature columns
target_col = 'default.payment.next.month'
y = df[target_col]
X = df.drop(columns=[target_col])

# Stage 1: hold back 60% for training; the other 40% goes to a temporary pool.
# Stratifying on the label keeps the class proportions similar in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.6, stratify=y, random_state=42
)

# Stage 2: halve the temporary pool into validation and test (20% / 20% overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the resulting shapes for each partition
partitions = (
    ('Training', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
)
for part_name, part_X, part_y in partitions:
    print(f'{part_name} set: X={part_X.shape}, y={part_y.shape}')