# Data Preprocessing - Home Credit Default Risk

## Goal
This notebook shows how to clean and prepare data for machine learning.

## Steps:
1. Load data
2. Handle missing values (analysis)
3. Process missing values (drop sparse columns, impute the rest)
4. Encode categorical variables
5. Scale numerical features
6. Save processed data

---


In [None]:
# Import libraries
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Notebook-wide settings
# NOTE: every warning category is silenced for the whole session.
warnings.filterwarnings('ignore')
# Show all columns when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Plot appearance: apply the matplotlib style first, then the seaborn palette.
plt.style.use('default')
sns.set_palette("Set2")

# Report the library versions used for this run.
pandas_version = pd.__version__
numpy_version = np.__version__
print("Libraries imported successfully!")
print(f"Pandas version: {pandas_version}")
print(f"NumPy version: {numpy_version}")


Matplotlib is building the font cache; this may take a moment.


Libraries imported successfully!
Pandas version: 2.3.2
NumPy version: 2.3.3


## 1. Load Data


In [None]:
# Read the three raw CSV files and print a short overview of the training set
print("Loading data...")

# Training applications (includes the TARGET column summarised below)
train_path = '../data_raw/application_train.csv'
train_df = pd.read_csv(train_path)
print(f"Training data loaded: {train_df.shape}")

# Test applications
test_path = '../data_raw/application_test.csv'
test_df = pd.read_csv(test_path)
print(f"Test data loaded: {test_df.shape}")

# Column descriptions
columns_desc_path = '../data_raw/HomeCredit_columns_description.csv'
columns_desc = pd.read_csv(columns_desc_path)
print(f"Column descriptions loaded: {columns_desc.shape}")

# Overview: row/column counts, class counts and the mean of TARGET as a percentage
train_rows, train_cols = train_df.shape
target_series = train_df['TARGET']

print("\nData overview:")
print(f"Total records: {train_rows:,}")
print(f"Number of features: {train_cols}")
print(f"Target variable: {target_series.value_counts().to_dict()}")
print(f"Default rate: {target_series.mean()*100:.2f}%")


In [None]:
# Summarise how many columns there are of each dtype
dtype_counts = train_df.dtypes.value_counts()
print("Data types:")
print(dtype_counts)

# The final expression is rendered by the notebook as a table
print("\nFirst 5 rows:")
train_df.head(5)


## 2. Handle Missing Values


In [None]:
# Check missing values
print("Missing values analysis:")

# Per-column count and percentage of missing entries in the training frame
missing_data = train_df.isnull().sum()
missing_percent = (missing_data / len(train_df)) * 100

# Create DataFrame with missing values info
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percent': missing_percent.values
})

# Keep only columns that actually have gaps, worst first
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)

print(f"\nColumns with missing values ({len(missing_df)} out of {len(train_df.columns)}):")
print(missing_df.head(20))

# Plot missing values
if len(missing_df) > 0:
    top_missing = missing_df.head(15)
    # The figure is created only when there is something to draw. Creating it
    # before the check left an empty figure open in the "no missing values" case.
    plt.figure(figsize=(12, 8))
    plt.barh(range(len(top_missing)), top_missing['Missing_Percent'])
    plt.yticks(range(len(top_missing)), top_missing['Column'])
    plt.xlabel('Missing Values (%)')
    plt.title('Top 15 Columns with Most Missing Values')
    # Put the column with the highest missing share at the top of the chart
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("No missing values found!")


## 3. Process Missing Values


In [None]:
# Work on copies so that train_df / test_df keep the data as loaded
train_processed, test_processed = train_df.copy(), test_df.copy()

print("Processing missing values...")

# Function to handle missing values
def handle_missing_values(df, is_train=True, reference_df=None):
    """
    Drop sparse columns and impute the remaining missing values.

    Steps, applied to a copy of ``df`` (the input frame is never modified):

    1. Drop every column whose share of missing values is above 50%.
    2. Fill missing values in numeric columns with the column median.
    3. Fill missing values in ``object`` columns with the column mode
       (``'Unknown'`` if no mode can be computed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    is_train : bool, default True
        Accepted for backward compatibility; the function does not use it.
    reference_df : pandas.DataFrame, optional
        Frame from which the missing ratios, medians and modes are taken.
        When omitted, they are computed from ``df`` itself (the original
        behaviour). Passing the training frame here when cleaning the test
        frame makes both frames drop the same columns and use the same fill
        values. Columns that ``reference_df`` does not contain fall back to
        statistics computed from ``df``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.

    Notes
    -----
    Progress messages are printed for each step that changes the data.
    """
    df_processed = df.copy()
    # Frame that supplies the statistics: the reference when given, else df itself.
    stats_df = df_processed if reference_df is None else reference_df

    def _stats_column(col):
        # Use the reference column when available, otherwise the frame's own.
        return stats_df[col] if col in stats_df.columns else df_processed[col]

    # 1. Remove columns with >50% missing values
    high_missing_cols = [
        col for col in df_processed.columns
        if _stats_column(col).isnull().mean() > 0.5
    ]

    if high_missing_cols:
        print(f"Removing columns with >50% missing values: {high_missing_cols}")
        df_processed = df_processed.drop(columns=high_missing_cols)

    # 2. Fill numerical columns with median
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
    numeric_missing = [col for col in numeric_cols if df_processed[col].isnull().any()]

    if numeric_missing:
        print(f"Filling numerical columns with median: {len(numeric_missing)} columns")
        for col in numeric_missing:
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # that chained form is deprecated and is a no-op under Copy-on-Write.
            df_processed[col] = df_processed[col].fillna(_stats_column(col).median())

    # 3. Fill categorical columns with mode
    categorical_cols = df_processed.select_dtypes(include=['object']).columns
    categorical_missing = [col for col in categorical_cols if df_processed[col].isnull().any()]

    if categorical_missing:
        print(f"Filling categorical columns with mode: {len(categorical_missing)} columns")
        for col in categorical_missing:
            modes = _stats_column(col).mode()
            mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
            df_processed[col] = df_processed[col].fillna(mode_value)

    return df_processed

# Clean both frames; each call derives its statistics from the frame it receives
train_processed = handle_missing_values(train_processed, is_train=True)
test_processed = handle_missing_values(test_processed, is_train=False)

print("\nProcessing completed!")
print(f"Train size after processing: {train_processed.shape}")
print(f"Test size after processing: {test_processed.shape}")

# Count missing cells still left in each frame after cleaning
remaining_missing_train = train_processed.isna().sum().sum()
remaining_missing_test = test_processed.isna().sum().sum()

print("\nRemaining missing values:")
print(f"Train: {remaining_missing_train}")
print(f"Test: {remaining_missing_test}")


## 4. Encode Categorical Variables


In [None]:
# List the object-dtype columns of the cleaned training frame with their cardinality
print("Categorical variables analysis:")

categorical_cols = list(train_processed.select_dtypes(include=['object']).columns)
print(f"Found {len(categorical_cols)} categorical columns:")

# Number of distinct non-null values per categorical column
for col in categorical_cols:
    distinct_values = train_processed[col].dropna().unique()
    unique_count = len(distinct_values)
    print(f"  - {col}: {unique_count} unique values")

# Function to encode categorical variables
def encode_categorical_variables(df_train, df_test):
    """
    Label-encode the object columns of the train and test frames.

    For each object-dtype column of ``df_train`` that also exists in
    ``df_test``, one LabelEncoder is fitted on the string-cast values of
    both frames together and then applied to each frame. Object columns
    that are missing from ``df_test`` are left unchanged.

    Returns a tuple ``(train_encoded, test_encoded, encoders)`` where the
    first two items are encoded copies of the inputs and ``encoders`` maps
    each encoded column name to its fitted LabelEncoder.
    """
    train_out = df_train.copy()
    test_out = df_test.copy()

    object_cols = train_out.select_dtypes(include=['object']).columns.tolist()
    print(f"\nEncoding {len(object_cols)} categorical columns...")

    fitted_encoders = {}
    for name in object_cols:
        # Only columns present in both frames are encoded.
        if name not in test_out.columns:
            continue

        # Cast to str so every value (including NaN) becomes a plain label.
        train_labels = train_out[name].astype(str)
        test_labels = test_out[name].astype(str)

        # Fit on the union so labels that appear in only one frame are known.
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_labels, test_labels]))

        train_out[name] = encoder.transform(train_labels)
        test_out[name] = encoder.transform(test_labels)

        fitted_encoders[name] = encoder
        print(f"  Done {name}: {len(encoder.classes_)} classes")

    return train_out, test_out, fitted_encoders

# Encode the cleaned frames; keep the fitted encoders for later inspection
encoding_result = encode_categorical_variables(train_processed, test_processed)
train_encoded, test_encoded, encoders = encoding_result

print("\nEncoding completed!")
print(f"Train size after encoding: {train_encoded.shape}")
print(f"Test size after encoding: {test_encoded.shape}")


## 5. Scale Numerical Features


In [None]:
# Decide which columns are passed to the scaler
print("Preparing data for scaling...")

# The ID and the target variable are never scaled
exclude_cols = ['SK_ID_CURR', 'TARGET']
keep_mask = ~train_encoded.columns.isin(exclude_cols)
feature_cols = train_encoded.columns[keep_mask].tolist()

print(f"Number of features to scale: {len(feature_cols)}")

# Of the candidate features, keep those with a numeric dtype
candidate_frame = train_encoded[feature_cols]
numeric_features = candidate_frame.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical features: {len(numeric_features)}")

# Function to scale features
def scale_features(df_train, df_test, feature_cols):
    """
    Standardise the given columns of the train and test frames.

    A StandardScaler is fitted on ``df_train[feature_cols]`` and the same
    fitted scaler is applied to ``df_test[feature_cols]``. Columns outside
    ``feature_cols`` are carried over unchanged.

    Returns a tuple ``(train_scaled, test_scaled, scaler)`` with scaled
    copies of both inputs and the fitted StandardScaler.
    """
    scaler = StandardScaler()

    # Fit on the training data only, then reuse the fitted scaler for test.
    train_values = scaler.fit_transform(df_train[feature_cols])
    test_values = scaler.transform(df_test[feature_cols])

    # Write the scaled values into copies so the inputs are left untouched.
    df_train_scaled = df_train.copy()
    df_train_scaled[feature_cols] = train_values

    df_test_scaled = df_test.copy()
    df_test_scaled[feature_cols] = test_values

    return df_train_scaled, df_test_scaled, scaler

# Scale the numeric feature columns of both encoded frames
scaling_result = scale_features(train_encoded, test_encoded, numeric_features)
train_scaled, test_scaled, scaler = scaling_result

print("\nScaling completed!")
print(f"Train size after scaling: {train_scaled.shape}")
print(f"Test size after scaling: {test_scaled.shape}")

# Summary statistics of the first five scaled columns
preview_cols = numeric_features[:5]
print("\nStatistics after scaling (first 5 numerical features):")
print(train_scaled[preview_cols].describe())


## 6. Save Processed Data


In [None]:
# Save processed data
print("Saving processed data...")

# Create directory for processed data (no error if it already exists)
os.makedirs('../data_processed', exist_ok=True)

# Save train data (index=False: the row index is not written as a column)
train_scaled.to_csv('../data_processed/train_processed.csv', index=False)
print("train_processed.csv saved")

# Save test data
test_scaled.to_csv('../data_processed/test_processed.csv', index=False)
print("test_processed.csv saved")

# Save feature information: counts and names of the feature groups built above
feature_info = {
    'total_features': len(feature_cols),
    'numeric_features': len(numeric_features),
    'categorical_features': len(categorical_cols),
    'feature_names': feature_cols,
    'numeric_feature_names': numeric_features,
    'categorical_feature_names': categorical_cols
}

# `json` is imported with the other modules in the notebook's import cell.
# An explicit encoding keeps the file independent of the platform default.
with open('../data_processed/feature_info.json', 'w', encoding='utf-8') as f:
    json.dump(feature_info, f, indent=2)
print("feature_info.json saved")

# Before/after shapes and the mean of TARGET as a percentage
print(f"\nFinal statistics:")
print(f"Original train size: {train_df.shape}")
print(f"Processed train size: {train_scaled.shape}")
print(f"Original test size: {test_df.shape}")
print(f"Processed test size: {test_scaled.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Target variable (default): {train_scaled['TARGET'].mean()*100:.2f}%")

print(f"\nData preprocessing completed successfully!")
print(f"Processed data saved in: ../data_processed/")
