# Cleaning the Dataset

This notebook continues from the initial data preparation process and performs systematic cleaning, feature engineering, and scaling to ready the dataset for use in predictive modeling pipelines. It includes standardization of categorical fields, correction of numeric anomalies, creation of derived features, and temporal splitting into training, testing, and explanation subsets. The cleaned outputs are optimized for graph- and tabular-based credit risk models.

Note: This notebook is intended for academic reference only.

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from datetime import datetime
from dateutil.relativedelta import relativedelta

import os

def _encode_binary_flag(values: pd.Series, mapping: dict) -> pd.Series:
    """Turn a categorical column into an integer indicator column.

    Values found in ``mapping`` are replaced by their mapped number. Missing
    values and any category that is not in ``mapping`` (and is not already
    numeric) become 0 instead of making the integer cast fail.
    """
    replaced = values.replace(mapping)
    return pd.to_numeric(replaced, errors='coerce').fillna(0).astype(int)


def clean_and_engineer_features(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Clean a prepared loan DataFrame and derive the modelling features.

    Works on a copy of ``dataframe`` and performs, in order:

    1. Drops a fixed list of unused columns (only those that are present).
    2. Builds the ``dt_orig`` string (``YYYYMM``) from ``year`` and ``month``,
       removing rows whose year/month are outside 1900..next year / 1..12.
    3. Drops ``dt_first_pi`` if present.
    4. Replaces six categorical columns by 0/1 indicator columns and turns
       ``loan_defaulted`` into the integer ``default`` column.
    5. Coerces the known numerical columns to numbers, replaces negative
       ``orig_upb`` values by the median of the non-negative ones, and fills
       missing values (mode for ``fico``/``cnt_units``/``cnt_borr``, which are
       also cast to int; median for the rest). A column with no usable value
       at all is filled with 0.
    6. Derives ``area`` (first two digits of the 5-digit ZIP code) and drops
       ``zipcode``. Unparseable ZIP codes yield area ``'00'``.
    7. Renames ``seller_name`` -> ``provider`` and ``orig_loan_term`` ->
       ``loan_term``, adds a 1-based ``id`` column if missing, casts
       ``id_loan`` to string and removes duplicated column labels.

    Progress is reported with ``print``.

    Args:
        dataframe: Input frame; it is not modified.

    Returns:
        The cleaned copy.
    """
    print("\nStarting cleaning and feature engineering")
    df = dataframe.copy()
    print(f"Initial DataFrame shape: {df.shape}")

    cols_to_drop = [
        'dt_matr', 'cd_msa', 'cltv', 'st',
        'servicer_name', 'prod_type', 'ppmt_pnlty'
    ]
    existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
    if existing_cols_to_drop:
        df.drop(columns=existing_cols_to_drop, inplace=True)
        print(f"Dropped columns: {', '.join(existing_cols_to_drop)}")

    print("Creating 'dt_orig' from 'year' and 'month' for temporal splitting.")
    if 'year' in df.columns and 'month' in df.columns:
        # Unparseable values become 0 and are removed by the range filter below.
        df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)
        df['month'] = pd.to_numeric(df['month'], errors='coerce').fillna(0).astype(int)

        initial_rows = len(df)
        valid_year = df['year'].between(1900, datetime.now().year + 1)
        valid_month = df['month'].between(1, 12)
        df = df[valid_year & valid_month].copy()
        if len(df) < initial_rows:
            print(f"Removed {initial_rows - len(df)} rows with invalid year/month combinations for 'dt_orig'.")

        df['dt_orig'] = df['year'].astype(str) + df['month'].astype(str).str.zfill(2)
        print(f"'dt_orig' created. Sample values: {df['dt_orig'].head().tolist()}")
    else:
        print("'year' or 'month' columns not found. Cannot create 'dt_orig'.")

    if 'dt_first_pi' in df.columns:
        df.drop(columns=['dt_first_pi'], inplace=True)
        print("Dropped original 'dt_first_pi' column as 'dt_orig' is now created.")

    print("Processing categorical features to binary/renaming:")
    # (source column, indicator column, value mapping, print raw values first)
    binary_flags = [
        ('flag_fthb', 'if_fthb', {'Y': 1, 'N': 0, 'Other': 0}, True),
        ('occpy_sts', 'if_prim_res', {'P': 1, 'S': 0, 'I': 0, 'U': 0}, True),
        ('channel', 'if_corr', {'C': 1, 'R': 0, 'B': 0, 'T': 0}, False),
        ('prop_type', 'if_sf', {'SF': 1, 'CO': 0, 'PU': 0, 'MH': 0, 'CP': 0}, False),
        ('loan_purpose', 'if_purc', {'P': 1, 'N': 0, 'C': 0}, False),
        ('flag_sc', 'if_sc', {'Y': 1, 'N': 0}, False),
    ]
    for source, target, mapping, show_raw in binary_flags:
        if source not in df.columns:
            continue
        if show_raw:
            print(f"Original {source} values: {df[source].unique()}")
        df[target] = _encode_binary_flag(df[source], mapping)
        df.drop(columns=[source], inplace=True)
        print(f"'{source}' -> '{target}': {df[target].value_counts().to_dict()}")

    if 'loan_defaulted' in df.columns:
        df['default'] = df['loan_defaulted'].fillna(0).astype(int)
        df.drop(columns=['loan_defaulted'], inplace=True)
        print(f"'loan_defaulted' -> 'default': {df['default'].value_counts().to_dict()}")

    print("  - Processing numerical features:")
    numerical_features = [
        'fico', 'mi_pct', 'cnt_units', 'dti', 'ltv',
        'cnt_borr', 'orig_upb', 'current_upb'
    ]
    # Filled with the mode and stored as integers; the others use the median.
    integer_features = ('fico', 'cnt_units', 'cnt_borr')

    for col in numerical_features:
        if col not in df.columns:
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce')

        if col == 'orig_upb':
            negative = df[col] < 0
            if negative.any():
                neg_count = negative.sum()
                median_val = df.loc[df[col] >= 0, col].median()
                # Work in float so a fractional median can be stored even
                # when the column was parsed with an integer dtype.
                df[col] = df[col].astype(float)
                df.loc[negative, col] = median_val if not pd.isna(median_val) else 0
                print(f"Corrected {neg_count} negative values in '{col}'")

        if col in integer_features:
            modes = df[col].mode()
            # mode() is empty when every value is missing.
            fill_val = modes.iloc[0] if not modes.empty else 0
        else:
            fill_val = df[col].median()
            if pd.isna(fill_val):
                fill_val = 0
        df[col] = df[col].fillna(fill_val)
        if col in integer_features:
            df[col] = df[col].round().astype(int)
        print(f"Processed '{col}': min={df[col].min():.2f}, max={df[col].max():.2f}")

    if 'zipcode' in df.columns:
        # A ZIP code parsed as a number shows up as e.g. 2100 or 2100.0:
        # strip the float suffix and restore the leading zeros before
        # extracting the 5-digit code. zfill only produces a 5-digit run for
        # all-digit strings, so other short values still fall back to '00000'.
        zip_text = (
            df['zipcode']
            .astype(str)
            .str.strip()
            .str.replace(r'\.0+$', '', regex=True)
            .str.zfill(5)
        )
        df['zipcode'] = zip_text.str.extract(r'(\d{5})')[0].fillna('00000')
        df['area'] = df['zipcode'].str[:2].replace('', '00').fillna('00')
        df.drop('zipcode', axis=1, inplace=True)
        print(f"Created 'area' with values: {sorted(df['area'].unique())}")

    rename_map = {
        'seller_name': 'provider',
        'orig_loan_term': 'loan_term'
    }
    df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True)

    if 'id' not in df.columns:
        df['id'] = np.arange(1, len(df) + 1)

    if 'id_loan' in df.columns:
        df['id_loan'] = df['id_loan'].astype(str)

    # Keep only the first occurrence of any repeated column label.
    df = df.loc[:, ~df.columns.duplicated()]

    print(f"Final DataFrame shape after cleaning: {df.shape}")
    print("### Finished cleaning and feature engineering ###")
    return df

def apply_minmax_scaling(df_train: pd.DataFrame, df_test: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """Min-max scale the numeric feature columns of a train/test pair.

    The scaler is fitted on the training frame only and then applied to the
    test frame; scaled test values are clipped to [0, 1]. The columns listed
    in ``unscaled_cols`` (plus ``d_timer`` when the training frame has it)
    are passed through unchanged and placed first in the returned frames.
    Non-numeric columns outside that list are not part of the output.

    Values that cannot be parsed as numbers, and feature columns that the
    test frame lacks, are treated as 0 before scaling.

    Args:
        df_train: Training data; used to choose and fit the scaled columns.
        df_test: Test data. When empty, only its metadata columns (if any)
            are returned.

    Returns:
        ``(df_train_scaled, df_test_scaled)``.

    Raises:
        ValueError: If a selected column is still non-numeric after
            conversion, or if the scaler rejects the training features.
    """
    print("\nStarting MinMax Scaling for Main Data")

    scaler = MinMaxScaler()

    unscaled_cols = [
        'id', 'id_loan', 'year', 'month', 'provider',
        'area', 'default'
    ]
    if 'd_timer' in df_train.columns:
        unscaled_cols.append('d_timer')

    print("Processing training data:")
    df_train_processed = df_train.copy()

    numeric_cols = [
        col for col in df_train_processed.columns
        if col not in unscaled_cols
        and pd.api.types.is_numeric_dtype(df_train_processed[col])
    ]
    print(f"Numeric columns identified for scaling (Training): {numeric_cols}")

    features_to_scale_train = df_train_processed[numeric_cols].copy()
    for col in numeric_cols:
        features_to_scale_train[col] = pd.to_numeric(
            features_to_scale_train[col],
            errors='coerce'
        ).fillna(0)

    # Sanity check by dtype: cheaper than inspecting every cell and does not
    # depend on the deprecated DataFrame.applymap.
    still_non_numeric = [
        col for col in numeric_cols
        if not pd.api.types.is_numeric_dtype(features_to_scale_train[col])
    ]
    if still_non_numeric:
        raise ValueError("Non-numeric values (strings) still present after conversion in features_to_scale_train. Check `fillna` logic.")

    try:
        scaled_features_train = pd.DataFrame(
            scaler.fit_transform(features_to_scale_train),
            columns=features_to_scale_train.columns,
            index=df_train_processed.index
        )
    except ValueError:
        print("Error during scaling. Problematic columns in training data:")
        for col in numeric_cols:
            unique_vals = df_train_processed[col].dropna().unique()
            if any(isinstance(x, str) for x in unique_vals):
                print(f"- {col}: Contains string values - {unique_vals}")
        # Bare raise keeps the original traceback.
        raise

    print("Processing testing data:")
    df_test_processed = df_test.copy()
    scaled_features_test = pd.DataFrame()

    if not df_test_processed.empty:
        missing_in_test = [col for col in numeric_cols if col not in df_test_processed.columns]
        if missing_in_test:
            print(f"Warning: test data lacks columns {missing_in_test}; they are filled with 0 before scaling.")
        # The scaler must see exactly the columns it was fitted on, in the
        # same order; reindex adds absent ones as NaN (turned into 0 below).
        features_to_scale_test = df_test_processed.reindex(columns=numeric_cols)
        for col in numeric_cols:
            features_to_scale_test[col] = pd.to_numeric(
                features_to_scale_test[col],
                errors='coerce'
            ).fillna(0)

        scaled_features_test = pd.DataFrame(
            scaler.transform(features_to_scale_test),
            columns=features_to_scale_test.columns,
            index=df_test_processed.index
        )
        # Test values outside the training range would leave [0, 1].
        scaled_features_test = scaled_features_test.clip(lower=0, upper=1)

    unscaled_metadata_train = df_train_processed[[col for col in unscaled_cols if col in df_train_processed.columns]]
    unscaled_metadata_test = df_test_processed[[col for col in unscaled_cols if col in df_test_processed.columns]]

    df_train_scaled = pd.concat([unscaled_metadata_train, scaled_features_train], axis=1)
    df_test_scaled = pd.concat([unscaled_metadata_test, scaled_features_test], axis=1)

    print("### Finished MinMax Scaling for Main Data ###")
    return df_train_scaled, df_test_scaled

def handle_missing_values(df, missing_threshold=0.8):
    """Drop mostly-empty columns and impute the remaining missing values.

    Columns whose fraction of missing values is strictly greater than
    ``missing_threshold`` are dropped. In the remaining columns, missing
    values are filled with the median (numeric dtype) or with the first mode
    (other dtypes); a non-numeric column without any mode is filled with the
    string ``'missing_category'``.

    The input frame is never modified: a new frame is returned whenever a
    column is dropped or a value is filled. If nothing needs changing the
    input object itself is returned.

    Args:
        df: DataFrame to clean.
        missing_threshold: Maximum tolerated fraction of missing values per
            column (0.8 means columns with more than 80% missing are dropped).

    Returns:
        The cleaned DataFrame.
    """
    print("\nHandling missing values")

    # Fraction of missing values per column, computed in one pass.
    missing_fraction = df.isnull().mean()
    high_missing_cols = missing_fraction[missing_fraction > missing_threshold].index.tolist()

    if high_missing_cols:
        df = df.drop(columns=high_missing_cols)
        print(f"Dropped {len(high_missing_cols)} columns with >{missing_threshold*100}% missing values: {high_missing_cols}")

    # Collect one fill value per column and apply them together through
    # DataFrame.fillna, which returns a new frame instead of assigning into
    # the caller's one.
    fill_values = {}
    for column in df.columns:
        null_count = df[column].isnull().sum()
        if null_count == 0:
            continue

        if pd.api.types.is_numeric_dtype(df[column]):
            median_val = df[column].median()
            # The median is NaN when the column has no value at all; there
            # is nothing to fill with in that case.
            if not pd.isna(median_val):
                fill_values[column] = median_val
            print(f"Filled {null_count} missing values in '{column}' with median")
            continue

        mode_val = df[column].mode()
        if not mode_val.empty:
            fill_values[column] = mode_val.iloc[0]
            print(f"Filled {null_count} missing values in '{column}' with mode")
        else:
            fill_values[column] = 'missing_category'
            print(f"Filled {null_count} missing values in '{column}' with 'missing_category'")

    if fill_values:
        df = df.fillna(value=fill_values)

    print("### Finished handling missing values ###")
    return df

if __name__ == "__main__":
    # Script entry point. Steps:
    #   1. Load the prepared 2015 (training) and 2016 (testing) CSV files.
    #   2. Impute/clean both frames with handle_missing_values and
    #      clean_and_engineer_features.
    #   3. Split each year by origination month into a "graph" window
    #      (Jan-Jun) and an "explanation" window (July); the test graph set is
    #      capped at 5000 rows by a seeded random sample.
    #   4. Fit a MinMaxScaler on the training graph window only and apply it
    #      to all four subsets.
    #   5. Write the four scaled subsets as CSV files.

    pd.set_option('future.no_silent_downcasting', True)
    PREPARED_DATA_DIR = '../data/prepared_data'
    OUTPUT_CLEANED_DATA_DIR = '../data/cleaned_data'

    if not os.path.exists(OUTPUT_CLEANED_DATA_DIR):
        # exist_ok avoids a crash if the directory appears between the check
        # and the creation.
        os.makedirs(OUTPUT_CLEANED_DATA_DIR, exist_ok=True)
        print(f"Created output directory for cleaned data: {OUTPUT_CLEANED_DATA_DIR}")

    train_main_file_path = os.path.join(PREPARED_DATA_DIR, 'prepared_2015_full_year_data.csv')
    test_main_file_path = os.path.join(PREPARED_DATA_DIR, 'prepared_2016_full_year_data.csv')

    # Empty frames act as the "not loaded" marker checked further down.
    df_train_main_raw = pd.DataFrame()
    df_test_main_raw = pd.DataFrame()

    print("\n### Loading raw Main Data (Origination + Latest Performance) ###")
    if os.path.exists(train_main_file_path):
        df_train_main_raw = pd.read_csv(train_main_file_path, low_memory=False)
        df_train_main_raw = df_train_main_raw.reset_index(drop=True)
        print(f"Loaded training main data (2015) from {train_main_file_path}. Shape: {df_train_main_raw.shape}")
        if 'svcg_cycle' not in df_train_main_raw.columns:
            print(f"'svcg_cycle' column not found in training main data: {train_main_file_path} ***")
    else:
        print(f"Error: Training main data file (2015) not found at {train_main_file_path}.")

    if os.path.exists(test_main_file_path):
        df_test_main_raw = pd.read_csv(test_main_file_path, low_memory=False)
        df_test_main_raw = df_test_main_raw.reset_index(drop=True)
        print(f"Loaded testing main data (2016) from {test_main_file_path}. Shape: {df_test_main_raw.shape}")
        if 'svcg_cycle' not in df_test_main_raw.columns:
            print(f"Warning: 'svcg_cycle' column not found in testing main data: {test_main_file_path}")
    else:
        print(f"Error: Testing main data file (2016) not found at {test_main_file_path}.")

    if not df_train_main_raw.empty and not df_test_main_raw.empty:

        df_train_main_raw = handle_missing_values(df_train_main_raw)
        df_test_main_raw = handle_missing_values(df_test_main_raw)
        df_train_cleaned_full = clean_and_engineer_features(df_train_main_raw)
        df_test_cleaned_full = clean_and_engineer_features(df_test_main_raw)

        print("\n### Applying temporal train/test/graph/explanation splits for Main Data ###")

        # Parse 'YYYYMM' into a month-start timestamp; unparseable values
        # become NaT and those rows are removed.
        df_train_cleaned_full['dt_orig_dt'] = pd.to_datetime(df_train_cleaned_full['dt_orig'], format='%Y%m', errors='coerce').dt.to_period('M').dt.to_timestamp()
        df_test_cleaned_full['dt_orig_dt'] = pd.to_datetime(df_test_cleaned_full['dt_orig'], format='%Y%m', errors='coerce').dt.to_period('M').dt.to_timestamp()
        df_train_cleaned_full = df_train_cleaned_full.dropna(subset=['dt_orig_dt']).copy()
        df_test_cleaned_full = df_test_cleaned_full.dropna(subset=['dt_orig_dt']).copy()

        train_graph_start_date = datetime(2015, 1, 1)
        train_graph_end_date = datetime(2015, 6, 30)
        train_explanation_start_date = datetime(2015, 7, 1)
        train_explanation_end_date = datetime(2015, 7, 31)
        test_graph_start_date = datetime(2016, 1, 1)
        test_graph_end_date = datetime(2016, 6, 30)
        test_explanation_start_date = datetime(2016, 7, 1)
        test_explanation_end_date = datetime(2016, 7, 31)

        def select_window(frame, start_date, end_date):
            # Rows whose origination month lies in [start_date, end_date],
            # returned as an independent frame with a fresh 0..n-1 index.
            in_window = (frame['dt_orig_dt'] >= start_date) & (frame['dt_orig_dt'] <= end_date)
            return frame[in_window].copy().reset_index(drop=True)

        df_train_graph_cleaned = select_window(df_train_cleaned_full, train_graph_start_date, train_graph_end_date)
        print(f"Train Graph Data (2015 Jan-Jun) raw shape: {df_train_graph_cleaned.shape}")

        df_test_graph_cleaned = select_window(df_test_cleaned_full, test_graph_start_date, test_graph_end_date)
        print(f"Test Graph Data (2016 Jan-Jun) raw shape: {df_test_graph_cleaned.shape}")

        target_test_graph_nodes = 5000
        if len(df_test_graph_cleaned) > target_test_graph_nodes:
            # Fixed seed keeps the sampled test graph reproducible.
            df_test_graph_cleaned = df_test_graph_cleaned.sample(n=target_test_graph_nodes, random_state=42).copy()
            df_test_graph_cleaned = df_test_graph_cleaned.reset_index(drop=True)
            print(f"Downsampled Test Graph Data (2016 Jan-Jun) to {target_test_graph_nodes} nodes.")
        elif len(df_test_graph_cleaned) < target_test_graph_nodes:
            print(f"Test Graph Data (2016 Jan-Jun) has {len(df_test_graph_cleaned)} nodes, which is less than target {target_test_graph_nodes}. Using all available.")

        print(f"### Train Graph Data (2015 Jan-Jun) final shape: {df_train_graph_cleaned.shape} ###")
        print(f"### Test Graph Data (2016 Jan-Jun) final shape: {df_test_graph_cleaned.shape} ###")

        df_train_explanation_cleaned = select_window(df_train_cleaned_full, train_explanation_start_date, train_explanation_end_date)
        print(f"Train Explanation Data (2015 July) shape: {df_train_explanation_cleaned.shape}")

        df_test_explanation_cleaned = select_window(df_test_cleaned_full, test_explanation_start_date, test_explanation_end_date)
        print(f"Test Explanation Data (2016 July) shape: {df_test_explanation_cleaned.shape}")

        # The helper timestamp was only needed for the split.
        for subset in (
            df_train_graph_cleaned,
            df_test_graph_cleaned,
            df_train_explanation_cleaned,
            df_test_explanation_cleaned,
        ):
            subset.drop(columns=['dt_orig_dt'], inplace=True)

        print("\n### Applying MinMax Scaling to prepared data ###")

        scaler = MinMaxScaler()

        unscaled_cols = [
            'id', 'id_loan', 'year', 'month', 'provider',
            'area', 'default', 'd_timer'
        ]

        numeric_cols_for_scaling = [
            col for col in df_train_graph_cleaned.columns
            if col not in unscaled_cols
            and pd.api.types.is_numeric_dtype(df_train_graph_cleaned[col])
        ]
        print(f"Master numeric columns for scaling: {numeric_cols_for_scaling}")

        train_graph_features = df_train_graph_cleaned[numeric_cols_for_scaling].copy()
        for col in numeric_cols_for_scaling:
            train_graph_features[col] = pd.to_numeric(train_graph_features[col], errors='coerce').fillna(0)
        # Fit on the training graph window only, so the other subsets do not
        # influence the scaling parameters.
        scaler.fit(train_graph_features)
        print("Scaler fitted on df_train_graph_cleaned (Jan-June 2015 data).")

        def transform_dataframe(df_to_transform, scaler, numeric_cols, unscaled_cols_list):
            # Scale `numeric_cols` of one subset with the fitted scaler, clip
            # to [0, 1] and put the unscaled metadata columns in front.
            df_transformed = df_to_transform.copy()

            # Train and test are cleaned independently, so a subset may lack
            # a column the scaler was fitted on. reindex supplies it as NaN
            # (turned into 0 below) instead of raising KeyError.
            absent_cols = [col for col in numeric_cols if col not in df_transformed.columns]
            if absent_cols:
                print(f"Warning: columns missing before scaling, filled with 0: {absent_cols}")
            features_to_scale = df_transformed.reindex(columns=numeric_cols)

            for col in numeric_cols:
                features_to_scale[col] = pd.to_numeric(features_to_scale[col], errors='coerce').fillna(0)

            scaled_features = pd.DataFrame(
                scaler.transform(features_to_scale),
                columns=features_to_scale.columns,
                index=df_transformed.index
            )

            scaled_features = np.clip(scaled_features, 0, 1)
            unscaled_metadata = df_transformed[[col for col in unscaled_cols_list if col in df_transformed.columns]]

            return pd.concat([unscaled_metadata, scaled_features], axis=1)

        df_train_graph_scaled = transform_dataframe(df_train_graph_cleaned, scaler, numeric_cols_for_scaling, unscaled_cols)
        df_test_graph_scaled = transform_dataframe(df_test_graph_cleaned, scaler, numeric_cols_for_scaling, unscaled_cols)
        df_train_explanation_scaled = transform_dataframe(df_train_explanation_cleaned, scaler, numeric_cols_for_scaling, unscaled_cols)
        df_test_explanation_scaled = transform_dataframe(df_test_explanation_cleaned, scaler, numeric_cols_for_scaling, unscaled_cols)

        print("\ndtypes from scaled data (before saving)")
        sample_cols_to_check = ['fico', 'mi_pct', 'cnt_units', 'cnt_borr', 'orig_upb', 'current_upb', 'd_timer']

        for df_name, df_data in [
            ("df_train_graph_scaled", df_train_graph_scaled),
            ("df_test_graph_scaled", df_test_graph_scaled),
            ("df_train_explanation_scaled", df_train_explanation_scaled),
            ("df_test_explanation_scaled", df_test_explanation_scaled)
        ]:
            present_cols = [col for col in sample_cols_to_check if col in df_data.columns]
            if present_cols:
                print(f"{df_name} dtypes (selected):")
                print(df_data[present_cols].dtypes)
            else:
                print(f"No selected columns found in {df_name} for dtype check.")

        # (frame, output file name, message prefix)
        outputs = [
            (df_train_graph_scaled, 'df_origination_train_graph_scaled.csv',
             "\nSaved scaled training graph data (Jan-Jun 2015)"),
            (df_test_graph_scaled, 'df_origination_test_graph_scaled.csv',
             "Saved scaled testing graph data (Jan-Jun 2016)"),
            (df_train_explanation_scaled, 'df_origination_train_explanation_scaled.csv',
             "Saved scaled training explanation data (July 2015)"),
            (df_test_explanation_scaled, 'df_origination_test_explanation_scaled.csv',
             "Saved scaled testing explanation data (July 2016)"),
        ]
        for scaled_frame, file_name, saved_message in outputs:
            output_path = os.path.join(OUTPUT_CLEANED_DATA_DIR, file_name)
            scaled_frame.to_csv(output_path, index=False)
            print(f"{saved_message} to {output_path}")

        # Reported only on success so it never follows the error below.
        print("\n### Data Cleaning and Temporal Splitting complete! ###")

    else:
        print("Error: Cannot proceed with cleaning and scaling due to missing input data.")