In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import glob
from pathlib import Path
from tqdm import tqdm

import dask.dataframe as dd

# 1. Import Dataset

In [2]:
data_folder = str(os.getcwd()) + "\data"
orig_filename = "/sample_orig_*.txt"


orig_headers = ['CREDIT_SCORE','FIRST_PAYMENT_DATE','FIRST_TIME_HOMEBUYER_FLAG','MATURITY_DATE','MSA','MI_PCT',
                'NUMBER_OF_UNITS','OCCUPANCY_STATUS','CLTV','DTI','ORIGINAL_UPB','LTV','ORIGINAL_INTEREST_RATE',
                'CHANNEL','PPM','AMORTIZATION_TYPE','PROPERTY_STATE', 'PROPERTY_TYPE','POSTAL_CODE',
                'LOAN_SEQUENCE_NUMBER','LOAN_PURPOSE', 'ORIGINAL_LOAN_TERM','NUMBER_OF_BORROWERS','SELLER_NAME',
                'SERVICER_NAME','SUPER_CONFORMING_FLAG','PRE-RELIEF_REFINANCE_LOAN_SEQUENCE_NUMBER', 
                'PROGRAM_INDICATOR', 'RELIEF_REFINANCE_INDICATOR', 'PROPERTY_VALUATION_METHOD', 'IO_INDICATOR']

raw_orig_types = {'CREDIT_SCORE':float, 'FIRST_PAYMENT_DATE':str, 'FIRST_TIME_HOMEBUYER_FLAG':str, 'MATURITY_DATE':str,
              'MSA':str, 'MI_PCT':float, 'NUMBER_OF_UNITS':float, 'OCCUPANCY_STATUS':str, 'CLTV':float, 'DTI':float,
              'ORIGINAL_UPB':float, 'LTV':float, 'ORIGINAL_INTEREST_RATE':float, 'CHANNEL':str, 'PPM': str,
              'AMORTIZATION_TYPE':str ,'PROPERTY_STATE':str, 'PROPERTY_TYPE':str ,'POSTAL_CODE':str,
              'LOAN_SEQUENCE_NUMBER':str, 'LOAN_PURPOSE':str, 'ORIGINAL_LOAN_TERM':float, 'NUMBER_OF_BORROWERS':float,
              'SELLER_NAME': str, 'SERVICER_NAME':str, 'SUPER_CONFORMING_FLAG':str,
              'PRE-RELIEF_REFINANCE_LOAN_SEQUENCE_NUMBER':str, 'PROGRAM_INDICATOR':str, 'RELIEF_REFINANCE_INDICATOR':str,
              'PROPERTY_VALUATION_METHOD':str, 'IO_INDICATOR':str}

mutated_orig_types = raw_orig_types.copy()

orig_dates = ['FIRST_PAYMENT_DATE', 'MATURITY_DATE']

data_folder = str(os.getcwd()) + "\data"
perf_filename = "\sample_svcg_*.txt"


perf_headers = ['LOAN_SEQUENCE_NUMBER','MONTHLY_REPORTING_PERIOD','CURRENT_ACTUAL_UPB',
                'CURRENT_LOAN_DELINQUENCY_STATUS','LOAN_AGE','REMAINING_MONTHS_TO_LEGAL_MATURITY', 
                'DEFECT_SETTLEMENT_DATE','MODIFICATION_FLAG', 'ZERO_BALANCE_CODE', 
                'ZERO_BALANCE_EFFECTIVE_DATE','CURRENT_INTEREST_RATE','CURRENT_DEFERRED_UPB','DDLPI',
                'MI_RECOVERIES', 'NET_SALE_PROCEEDS','NON_MI_RECOVERIES','EXPENSES', 'LEGAL_COSTS',
                'MAINTENANCE_AND_PRESERVATION_COSTS','TAXES_AND_INSURANCE','MISCELLANEOUS_EXPENSES',
                'ACTUAL_LOSS_CALCULATION', 'MODIFICATION_COST','STEP_MODIFICATION_FLAG','DEFERRED_PAYMENT_PLAN',
                'ELTV','ZERO_BALANCE_REMOVAL_UPB','DELINQUENT_ACCRUED_INTEREST','DELINQUENCY_DUE_TO_DISASTER',
                'BORROWER_ASSISTANCE_STATUS_CODE','CURRENT_MONTH_MODIFICATION_COST','INTEREST_BEARING_UPB']

raw_perf_types = {'LOAN_SEQUENCE_NUMBER':str, 'MONTHLY_REPORTING_PERIOD':str, 'CURRENT_ACTUAL_UPB':float,
              'CURRENT_LOAN_DELINQUENCY_STATUS': str, 'LOAN_AGE':float, 'REMAINING_MONTHS_TO_LEGAL_MATURITY':float, 
              'DEFECT_SETTLEMENT_DATE':str, 'MODIFICATION_FLAG':str, 'ZERO_BALANCE_CODE':str,
              'ZERO_BALANCE_EFFECTIVE_DATE':str, 'CURRENT_INTEREST_RATE':float, 'CURRENT_DEFERRED_UPB':float,
              'DDLPI': str,'MI_RECOVERIES':float, 'NET_SALE_PROCEEDS':str ,'NON_MI_RECOVERIES':float,'EXPENSES':float,
              'LEGAL_COSTS':float ,'MAINTENANCE_AND_PRESERVATION_COSTS':float, 'TAXES_AND_INSURANCE':float,
              'MISCELLANEOUS_EXPENSES':float,'ACTUAL_LOSS_CALCULATION':float, 'MODIFICATION_COST':float,
              'STEP_MODIFICATION_FLAG':str, 'DEFERRED_PAYMENT_PLAN':str, 'ELTV':float, 'ZERO_BALANCE_REMOVAL_UPB':float,
              'DELINQUENT_ACCRUED_INTEREST':float, 'DELINQUENCY_DUE_TO_DISASTER':str,'BORROWER_ASSISTANCE_STATUS_CODE':str,
              'CURRENT_MONTH_MODIFICATION_COST':float, 'INTEREST_BEARING_UPB':float}

mutated_perf_types = raw_perf_types.copy()
mutated_perf_types['CURRENT_LOAN_DELINQUENCY_STATUS'] = float



perf_dates = ['DEFECT_SETTLEMENT_DATE', 'ZERO_BALANCE_EFFECTIVE_DATE', 'DDLPI']

In [None]:
from FREDDIEMAC_offline_preprocessing import pipeline_from_raw_data

pipeline_from_raw_data(orig_filename, 
                        perf_filename,
                        orig_headers,
                        perf_headers,
                        raw_orig_types, 
                        raw_perf_types,
                        mutated_orig_types,
                        mutated_perf_types,
                        data_folder)

In [3]:
from FREDDIEMAC_classification import assign_labels_to_orig, assign_labels_to_perf, cutoff_sequence_according_to_label
from FREDDIEMAC_reducing import drop_short_sequences

offline_preprocessed_orig_filename = "/sample_orig_*.parquet.gzip"
offline_preprocessed_perf_filename = "/sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + offline_preprocessed_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + offline_preprocessed_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    annual_df_orig = assign_labels_to_orig(annual_df_orig, annual_df_perf)
    annual_df_perf = assign_labels_to_perf(annual_df_orig, annual_df_perf)
    annual_df_orig, annual_df_perf = cutoff_sequence_according_to_label(annual_df_orig, annual_df_perf)
    annual_df_orig, annual_df_perf = drop_short_sequences(annual_df_orig, annual_df_perf, 12)

    annual_df_orig.to_parquet(Path(data_folder) / ("labeled_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")
    annual_df_perf.to_parquet(Path(data_folder) / ("labeled_" + str(Path(path_perf).stem) + ".gzip"), compression="gzip")


0it [00:00, ?it/s]


### 1.1 Data Exploration

In [4]:
df_orig = dd.read_parquet(data_folder + "/labeled_sample_orig_*.parquet.gzip")
df_perf = dd.read_parquet(data_folder + "/labeled_sample_svcg_*.parquet.gzip")

In [5]:
print(df_perf["ORIGINAL_LOAN_SEQUENCE_NUMBER"].nunique().compute())
print(df_orig["ORIGINAL_LOAN_SEQUENCE_NUMBER"].nunique().compute())

KeyError: 'ORIGINAL_LOAN_SEQUENCE_NUMBER'

# 2. Aggregating data like Blumenstock et al. 2020 experiment 4.1

In [None]:
from FREDDIEMAC_reducing import reduce_length_of_sequence
from FREDDIEMAC_aggregation import aggregate_to_blumenstock_exp4

labeled_orig_filename = "/labeled_sample_orig_*.parquet.gzip"
labeled_perf_filename = "/labeled_sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + labeled_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + labeled_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    shortened_annual_df_perf = reduce_length_of_sequence(annual_df_perf, length=12)
    df_blumenstock = aggregate_to_blumenstock_exp4(annual_df_orig, shortened_annual_df_perf)

    df_blumenstock.to_parquet(Path(data_folder) / ("blumenstock_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")

In [None]:
blumenstock_types = {'LOAN_SEQUENCE_NUMBER': str, 'INT_RATE': float, 'ORIG_UPB': float, 'FICO_SCORE': float,
                    'DTI_R': float, 'LTV_R': float, 'FIRST_PAYMENT_DATE': str, 'BAL_REPAID': float, 'T_ACT_12M': float, 'T_DEL_30D': float, 
                    'T_DEL_60D': float, 'LABEL': str, 'REMAINING_MONTHS_TO_LEGAL_MATURITY': float, "TIME_TO_EVENT": float, 'TOTAL_OBSERVED_LENGTH': float}


df_blumenstock = dd.read_parquet(data_folder + "/blumenstock_labeled_sample_orig_*.parquet.gzip")
df_blumenstock = df_blumenstock.astype(blumenstock_types)

# 3. Expanding Blumenstock et al. 2020 experiment 4.1 to dynamic data

In [None]:
from FREDDIEMAC_aggregation import aggregate_to_blumenstock_exp4_dynamic

labeled_orig_filename = "/labeled_sample_orig_*.parquet.gzip"
labeled_perf_filename = "/labeled_sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + labeled_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + labeled_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    df_blumenstock_dynamic = aggregate_to_blumenstock_exp4_dynamic(annual_df_orig, annual_df_perf)

    df_blumenstock_dynamic.to_parquet(Path(data_folder) / ("blumenstock_dynamic_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")

In [None]:
blumenstock_dynamic_types = {'LOAN_SEQUENCE_NUMBER': str, 'MONTHLY_REPORTING_PERIOD': str,'CURRENT_ACTUAL_UPB': float, 'CURRENT_LOAN_DELINQUENCY_STATUS': float, 
                    'CURRENT_INTEREST_RATE':float,'ELTV': float ,'LOAN_AGE': float, 'REMAINING_MONTHS_TO_LEGAL_MATURITY': float, 'CREDIT_SCORE': float,
                    'DTI': float, 'LTV': float, 'BAL_REPAID': float, 
                    'LABEL': str, "TIME_TO_EVENT": float, 'ORIGINAL_INTEREST_RATE': float, 'ORIGINAL_UPB': float, 'TOTAL_OBSERVED_LENGTH': float}

df_blumenstock_dynamic = dd.read_parquet(data_folder + "/blumenstock_dynamic_labeled_sample_orig_*.parquet.gzip")
df_blumenstock_dynamic = df_blumenstock_dynamic.astype(blumenstock_dynamic_types)

In [None]:
df_blumenstock_dynamic.head(10)