In [1]:
%load_ext autoreload
%autoreload 2

import os
import glob
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import dask.dataframe as dd

# 1. Import Dataset

In [2]:
data_folder = str(os.getcwd()) + "\data"
orig_filename = "/sample_orig_*.txt"


orig_headers = ['CREDIT_SCORE','FIRST_PAYMENT_DATE','FIRST_TIME_HOMEBUYER_FLAG','MATURITY_DATE','MSA','MI_PCT',
                'NUMBER_OF_UNITS','OCCUPANCY_STATUS','CLTV','DTI','ORIGINAL_UPB','LTV','ORIGINAL_INTEREST_RATE',
                'CHANNEL','PPM','AMORTIZATION_TYPE','PROPERTY_STATE', 'PROPERTY_TYPE','POSTAL_CODE',
                'LOAN_SEQUENCE_NUMBER','LOAN_PURPOSE', 'ORIGINAL_LOAN_TERM','NUMBER_OF_BORROWERS','SELLER_NAME',
                'SERVICER_NAME','SUPER_CONFORMING_FLAG','PRE-RELIEF_REFINANCE_LOAN_SEQUENCE_NUMBER', 
                'PROGRAM_INDICATOR', 'RELIEF_REFINANCE_INDICATOR', 'PROPERTY_VALUATION_METHOD', 'IO_INDICATOR']

raw_orig_types = {'CREDIT_SCORE':float, 'FIRST_PAYMENT_DATE':str, 'FIRST_TIME_HOMEBUYER_FLAG':str, 'MATURITY_DATE':str,
              'MSA':str, 'MI_PCT':float, 'NUMBER_OF_UNITS':float, 'OCCUPANCY_STATUS':str, 'CLTV':float, 'DTI':float,
              'ORIGINAL_UPB':float, 'LTV':float, 'ORIGINAL_INTEREST_RATE':float, 'CHANNEL':str, 'PPM': str,
              'AMORTIZATION_TYPE':str ,'PROPERTY_STATE':str, 'PROPERTY_TYPE':str ,'POSTAL_CODE':str,
              'LOAN_SEQUENCE_NUMBER':str, 'LOAN_PURPOSE':str, 'ORIGINAL_LOAN_TERM':float, 'NUMBER_OF_BORROWERS':float,
              'SELLER_NAME': str, 'SERVICER_NAME':str, 'SUPER_CONFORMING_FLAG':str,
              'PRE-RELIEF_REFINANCE_LOAN_SEQUENCE_NUMBER':str, 'PROGRAM_INDICATOR':str, 'RELIEF_REFINANCE_INDICATOR':str,
              'PROPERTY_VALUATION_METHOD':str, 'IO_INDICATOR':str}

mutated_orig_types = raw_orig_types.copy()

orig_dates = ['FIRST_PAYMENT_DATE', 'MATURITY_DATE']


perf_filename = "\sample_svcg_*.txt"


perf_headers = ['LOAN_SEQUENCE_NUMBER','MONTHLY_REPORTING_PERIOD','CURRENT_ACTUAL_UPB',
                'CURRENT_LOAN_DELINQUENCY_STATUS','LOAN_AGE','REMAINING_MONTHS_TO_LEGAL_MATURITY', 
                'DEFECT_SETTLEMENT_DATE','MODIFICATION_FLAG', 'ZERO_BALANCE_CODE', 
                'ZERO_BALANCE_EFFECTIVE_DATE','CURRENT_INTEREST_RATE','CURRENT_DEFERRED_UPB','DDLPI',
                'MI_RECOVERIES', 'NET_SALE_PROCEEDS','NON_MI_RECOVERIES','EXPENSES', 'LEGAL_COSTS',
                'MAINTENANCE_AND_PRESERVATION_COSTS','TAXES_AND_INSURANCE','MISCELLANEOUS_EXPENSES',
                'ACTUAL_LOSS_CALCULATION', 'MODIFICATION_COST','STEP_MODIFICATION_FLAG','DEFERRED_PAYMENT_PLAN',
                'ELTV','ZERO_BALANCE_REMOVAL_UPB','DELINQUENT_ACCRUED_INTEREST','DELINQUENCY_DUE_TO_DISASTER',
                'BORROWER_ASSISTANCE_STATUS_CODE','CURRENT_MONTH_MODIFICATION_COST','INTEREST_BEARING_UPB']

raw_perf_types = {'LOAN_SEQUENCE_NUMBER':str, 'MONTHLY_REPORTING_PERIOD':str, 'CURRENT_ACTUAL_UPB':float,
              'CURRENT_LOAN_DELINQUENCY_STATUS': str, 'LOAN_AGE':float, 'REMAINING_MONTHS_TO_LEGAL_MATURITY':float, 
              'DEFECT_SETTLEMENT_DATE':str, 'MODIFICATION_FLAG':str, 'ZERO_BALANCE_CODE':str,
              'ZERO_BALANCE_EFFECTIVE_DATE':str, 'CURRENT_INTEREST_RATE':float, 'CURRENT_DEFERRED_UPB':float,
              'DDLPI': str,'MI_RECOVERIES':float, 'NET_SALE_PROCEEDS':str ,'NON_MI_RECOVERIES':float,'EXPENSES':float,
              'LEGAL_COSTS':float ,'MAINTENANCE_AND_PRESERVATION_COSTS':float, 'TAXES_AND_INSURANCE':float,
              'MISCELLANEOUS_EXPENSES':float,'ACTUAL_LOSS_CALCULATION':float, 'MODIFICATION_COST':float,
              'STEP_MODIFICATION_FLAG':str, 'DEFERRED_PAYMENT_PLAN':str, 'ELTV':float, 'ZERO_BALANCE_REMOVAL_UPB':float,
              'DELINQUENT_ACCRUED_INTEREST':float, 'DELINQUENCY_DUE_TO_DISASTER':str,'BORROWER_ASSISTANCE_STATUS_CODE':str,
              'CURRENT_MONTH_MODIFICATION_COST':float, 'INTEREST_BEARING_UPB':float}

mutated_perf_types = raw_perf_types.copy()
mutated_perf_types['CURRENT_LOAN_DELINQUENCY_STATUS'] = float

perf_dates = ['DEFECT_SETTLEMENT_DATE', 'ZERO_BALANCE_EFFECTIVE_DATE', 'DDLPI']

In [3]:
from FREDDIEMAC_offline_preprocessing import pipeline_from_raw_data

pipeline_from_raw_data(orig_filename, 
                        perf_filename,
                        orig_headers,
                        perf_headers,
                        raw_orig_types, 
                        raw_perf_types,
                        mutated_orig_types,
                        mutated_perf_types,
                        data_folder)

Working on sample_orig_2011 and sample_svcg_2011: : 2it [01:40, 50.08s/it]


In [4]:
df_orig = dd.read_parquet(data_folder + "/sample_orig_*.parquet.gzip")
df_perf = dd.read_parquet(data_folder + "/sample_svcg_*.parquet.gzip")

In [5]:
print(df_perf["LOAN_SEQUENCE_NUMBER"].nunique().compute())
print(df_orig["LOAN_SEQUENCE_NUMBER"].nunique().compute())

99997
100000


In [6]:
df_orig["ORIGINAL_LOAN_TERM"].value_counts().compute()

360.0    61290
180.0    26672
240.0     7881
120.0     2587
300.0      784
         ...  
242.0        1
256.0        1
261.0        1
262.0        1
266.0        1
Name: ORIGINAL_LOAN_TERM, Length: 141, dtype: int64

In [7]:
from FREDDIEMAC_classification import assign_labels_to_orig, assign_labels_to_perf, cutoff_sequence_according_to_label
from FREDDIEMAC_reducing import drop_short_sequences, select_specific_original_loan_term

offline_preprocessed_orig_filename = "/sample_orig_*.parquet.gzip"
offline_preprocessed_perf_filename = "/sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + offline_preprocessed_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + offline_preprocessed_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    annual_df_orig, annual_df_perf = select_specific_original_loan_term(annual_df_orig, annual_df_perf, loan_terms_list=[180])

    annual_df_orig = assign_labels_to_orig(annual_df_orig, annual_df_perf)
    annual_df_perf = assign_labels_to_perf(annual_df_orig, annual_df_perf)
    annual_df_orig, annual_df_perf = cutoff_sequence_according_to_label(annual_df_orig, annual_df_perf)
    annual_df_orig, annual_df_perf = drop_short_sequences(annual_df_orig, annual_df_perf, 12)

    annual_df_orig.to_parquet(Path(data_folder) / ("labeled_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")
    annual_df_perf.to_parquet(Path(data_folder) / ("labeled_" + str(Path(path_perf).stem) + ".gzip"), compression="gzip")


Working on sample_orig_2011.parquet and sample_svcg_2011.parquet: : 2it [02:45, 82.93s/it]


### 1.1 Data Exploration

In [8]:
df_orig = dd.read_parquet(data_folder + "/labeled_sample_orig_*.parquet.gzip")
df_perf = dd.read_parquet(data_folder + "/labeled_sample_svcg_*.parquet.gzip")

In [9]:
print(df_perf["LOAN_SEQUENCE_NUMBER"].nunique().compute())
print(df_orig["LOAN_SEQUENCE_NUMBER"].nunique().compute())

24614
24615


In [10]:
df_orig["ORIGINAL_LOAN_TERM"].value_counts().compute()

180.0    24615
Name: ORIGINAL_LOAN_TERM, dtype: int64

In [11]:
print(df_orig["TOTAL_OBSERVED_LENGTH"].min().compute())
print(df_orig["TOTAL_OBSERVED_LENGTH"].max().compute())

12.0
143.0


# 2. Aggregating data like Blumenstock et al. 2020 experiment 4.1

In [12]:
from FREDDIEMAC_reducing import reduce_length_of_sequence
from FREDDIEMAC_aggregation import aggregate_to_blumenstock_exp4

labeled_orig_filename = "/labeled_sample_orig_*.parquet.gzip"
labeled_perf_filename = "/labeled_sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + labeled_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + labeled_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    shortened_annual_df_perf = reduce_length_of_sequence(annual_df_perf, length=12)
    df_blumenstock = aggregate_to_blumenstock_exp4(annual_df_orig, shortened_annual_df_perf)

    df_blumenstock.to_parquet(Path(data_folder) / ("blumenstock_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")

Working on labeled_sample_orig_2011.parquet and labeled_sample_svcg_2011.parquet: : 2it [01:18, 39.46s/it]


In [13]:
blumenstock_types = {'LOAN_SEQUENCE_NUMBER': str, 'INT_RATE': float, 'ORIG_UPB': float, 'FICO_SCORE': float,
                    'DTI_R': float, 'LTV_R': float, 'FIRST_PAYMENT_DATE': str, 'BAL_REPAID': float, 'T_ACT_12M': float, 'T_DEL_30D': float, 
                    'T_DEL_60D': float, 'LABEL': str, 'REMAINING_MONTHS_TO_LEGAL_MATURITY': float, "TIME_TO_EVENT": float, 'TOTAL_OBSERVED_LENGTH': float}


df_blumenstock = dd.read_parquet(data_folder + "/blumenstock_labeled_sample_orig_*.parquet.gzip")
df_blumenstock = df_blumenstock.astype(blumenstock_types)

In [14]:
df_blumenstock.head(20)

Unnamed: 0,LOAN_SEQUENCE_NUMBER,INT_RATE,ORIG_UPB,FICO_SCORE,DTI_R,LTV_R,FIRST_PAYMENT_DATE,REMAINING_MONTHS_TO_LEGAL_MATURITY,TOTAL_OBSERVED_LENGTH,TIME_TO_EVENT,LABEL,BAL_REPAID,T_ACT_12M,T_DEL_30D,T_DEL_60D
0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,201005,156.0,25.0,25.0,0,0.901226,12.0,0.0,0.0
1,F10Q10000069,4.5,200000.0,795.0,35.0,67.0,201003,156.0,25.0,25.0,0,0.836209,12.0,0.0,0.0
2,F10Q10000089,4.5,146000.0,784.0,47.0,55.0,201003,120.0,61.0,61.0,0,0.724131,12.0,0.0,0.0
3,F10Q10000115,4.5,68000.0,794.0,30.0,55.0,201003,115.0,66.0,66.0,0,0.71144,12.0,0.0,0.0
4,F10Q10000302,4.375,250000.0,726.0,55.0,60.0,201003,143.0,38.0,38.0,0,0.844081,12.0,0.0,0.0
5,F10Q10000332,4.75,70000.0,760.0,24.0,42.0,201003,62.0,119.0,119.0,0,0.073537,12.0,0.0,0.0
6,F10Q10000658,4.0,361000.0,755.0,44.0,57.0,201003,145.0,36.0,36.0,0,0.847537,12.0,0.0,0.0
7,F10Q10000681,4.375,124000.0,719.0,45.0,80.0,201003,67.0,114.0,114.0,0,0.344512,12.0,0.0,0.0
8,F10Q10000913,4.375,165000.0,721.0,35.0,59.0,201003,98.0,83.0,83.0,0,0.596179,12.0,0.0,0.0
9,F10Q10000944,4.375,142000.0,792.0,10.0,32.0,201003,113.0,68.0,68.0,0,0.677846,12.0,0.0,0.0


# 3. Expanding Blumenstock et al. 2020 experiment 4.1 to dynamic data

In [15]:
from FREDDIEMAC_aggregation import aggregate_to_blumenstock_exp4_dynamic

labeled_orig_filename = "/labeled_sample_orig_*.parquet.gzip"
labeled_perf_filename = "/labeled_sample_svcg_*.parquet.gzip"

annual_dataset_paths_orig = glob.glob(data_folder + labeled_orig_filename)
annual_dataset_paths_perf = glob.glob(data_folder + labeled_perf_filename)

annual_dataset_iterator = tqdm(zip(annual_dataset_paths_orig, annual_dataset_paths_perf))

for path_orig, path_perf  in annual_dataset_iterator:
    annual_dataset_iterator.set_description("Working on %s and %s" % (Path(path_orig).stem, Path(path_perf).stem))

    annual_df_orig = pd.read_parquet(path_orig)
    annual_df_orig = annual_df_orig.astype(mutated_orig_types)

    annual_df_perf = pd.read_parquet(path_perf)
    annual_df_perf = annual_df_perf.astype(mutated_perf_types)

    df_blumenstock_dynamic = aggregate_to_blumenstock_exp4_dynamic(annual_df_orig, annual_df_perf)

    df_blumenstock_dynamic.to_parquet(Path(data_folder) / ("blumenstock_dynamic_" + str(Path(path_orig).stem) + ".gzip"), compression="gzip")

Working on labeled_sample_orig_2011.parquet and labeled_sample_svcg_2011.parquet: : 2it [00:12,  6.49s/it]


In [16]:
blumenstock_dynamic_types = {'LOAN_SEQUENCE_NUMBER': str, 'MONTHLY_REPORTING_PERIOD': str,'CURRENT_ACTUAL_UPB': float, 'CURRENT_LOAN_DELINQUENCY_STATUS': float, 
                    'CURRENT_INTEREST_RATE':float,'ELTV': float ,'LOAN_AGE': float, 'REMAINING_MONTHS_TO_LEGAL_MATURITY': float, 'CREDIT_SCORE': float,
                    'DTI': float, 'LTV': float, 'BAL_REPAID': float, 
                    'LABEL': str, "TIME_TO_EVENT": float, 'ORIGINAL_INTEREST_RATE': float, 'ORIGINAL_UPB': float, 'TOTAL_OBSERVED_LENGTH': float}

df_blumenstock_dynamic = dd.read_parquet(data_folder + "/blumenstock_dynamic_labeled_sample_orig_*.parquet.gzip")
df_blumenstock_dynamic = df_blumenstock_dynamic.astype(blumenstock_dynamic_types)

In [17]:
df_blumenstock_dynamic.head(20)

Unnamed: 0,CURRENT_INTEREST_RATE,ELTV,CURRENT_ACTUAL_UPB,CURRENT_LOAN_DELINQUENCY_STATUS,MONTHLY_REPORTING_PERIOD,REMAINING_MONTHS_TO_LEGAL_MATURITY,LOAN_AGE,TOTAL_OBSERVED_LENGTH,TIME_TO_EVENT,LABEL,LOAN_SEQUENCE_NUMBER,ORIGINAL_INTEREST_RATE,ORIGINAL_UPB,CREDIT_SCORE,DTI,LTV,BAL_REPAID
0,4.375,0.0,216000.0,0.0,201004,180.0,0.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,1.0
1,4.375,0.0,215000.0,0.0,201005,179.0,1.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.99537
2,4.375,0.0,214000.0,0.0,201006,178.0,2.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.990741
3,4.375,0.0,213000.0,0.0,201007,177.0,3.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.986111
4,4.375,0.0,213000.0,0.0,201008,176.0,4.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.986111
5,4.375,0.0,212000.0,0.0,201009,175.0,5.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.981481
6,4.375,0.0,211000.0,0.0,201010,174.0,6.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.976852
7,4.375,0.0,209976.6,0.0,201011,173.0,7.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.972114
8,4.375,0.0,209103.52,0.0,201012,172.0,8.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.968072
9,4.375,0.0,208227.26,0.0,201101,171.0,9.0,25.0,25.0,0,F10Q10000014,4.375,216000.0,784.0,38.0,80.0,0.964015


In [18]:
df_perf_first_loan_age = df_blumenstock_dynamic.groupby("LOAN_SEQUENCE_NUMBER").apply(lambda x: x.iloc[0])

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  df_perf_first_loan_age = df_blumenstock_dynamic.groupby("LOAN_SEQUENCE_NUMBER").apply(lambda x: x.iloc[0])


In [19]:
df_perf_first_loan_age[df_perf_first_loan_age["LOAN_AGE"] == 2].compute()

Unnamed: 0_level_0,CURRENT_INTEREST_RATE,ELTV,CURRENT_ACTUAL_UPB,CURRENT_LOAN_DELINQUENCY_STATUS,MONTHLY_REPORTING_PERIOD,REMAINING_MONTHS_TO_LEGAL_MATURITY,LOAN_AGE,TOTAL_OBSERVED_LENGTH,TIME_TO_EVENT,LABEL,LOAN_SEQUENCE_NUMBER,ORIGINAL_INTEREST_RATE,ORIGINAL_UPB,CREDIT_SCORE,DTI,LTV,BAL_REPAID
LOAN_SEQUENCE_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
F10Q10240052,4.500,0.0,76000.0,0.0,201004,178.0,2.0,51.0,51.0,0,F10Q10240052,4.500,77000.0,779.0,41.0,66.0,0.987013
F10Q10304844,5.000,0.0,196000.0,0.0,201004,178.0,2.0,15.0,15.0,0,F10Q10304844,5.000,197000.0,702.0,35.0,60.0,0.994924
F10Q10311213,4.250,0.0,139000.0,0.0,201004,179.0,2.0,38.0,38.0,0,F10Q10311213,4.250,140000.0,816.0,16.0,56.0,0.992857
F10Q10327269,4.250,0.0,139000.0,0.0,201005,178.0,2.0,83.0,83.0,0,F10Q10327269,4.250,140000.0,794.0,34.0,60.0,0.992857
F10Q10327302,4.375,0.0,187000.0,0.0,201005,178.0,2.0,35.0,35.0,0,F10Q10327302,4.375,188000.0,789.0,44.0,76.0,0.994681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F11Q40419971,3.250,0.0,131000.0,0.0,201202,178.0,2.0,108.0,108.0,0,F11Q40419971,3.250,134000.0,769.0,11.0,20.0,0.977612
F11Q40419972,3.375,0.0,307000.0,0.0,201202,178.0,2.0,65.0,65.0,0,F11Q40419972,3.375,310000.0,797.0,16.0,52.0,0.990323
F11Q40424825,3.500,0.0,279000.0,0.0,201203,178.0,2.0,82.0,82.0,0,F11Q40424825,3.500,282000.0,634.0,28.0,90.0,0.989362
F11Q40424831,3.375,0.0,334000.0,0.0,201203,178.0,2.0,118.0,118.0,3,F11Q40424831,3.375,338000.0,775.0,27.0,47.0,0.988166


In [20]:
df_blumenstock_dynamic[df_blumenstock_dynamic["LOAN_SEQUENCE_NUMBER"] == "F10Q10238728"].compute()

Unnamed: 0,CURRENT_INTEREST_RATE,ELTV,CURRENT_ACTUAL_UPB,CURRENT_LOAN_DELINQUENCY_STATUS,MONTHLY_REPORTING_PERIOD,REMAINING_MONTHS_TO_LEGAL_MATURITY,LOAN_AGE,TOTAL_OBSERVED_LENGTH,TIME_TO_EVENT,LABEL,LOAN_SEQUENCE_NUMBER,ORIGINAL_INTEREST_RATE,ORIGINAL_UPB,CREDIT_SCORE,DTI,LTV,BAL_REPAID
100156,4.625,0.0,203000.0,0.0,201004,178.0,2.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.995098
100157,4.625,0.0,202000.0,0.0,201005,177.0,3.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.990196
100158,4.625,0.0,201000.0,0.0,201006,176.0,4.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.985294
100159,4.625,0.0,200000.0,0.0,201007,175.0,5.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.980392
100160,4.625,0.0,200000.0,0.0,201008,174.0,6.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.980392
100161,4.625,0.0,198715.89,0.0,201009,173.0,7.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.974098
100162,4.625,0.0,197905.81,0.0,201010,172.0,8.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.970127
100163,4.625,0.0,197092.61,0.0,201011,171.0,9.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.96614
100164,4.625,0.0,196276.28,0.0,201012,170.0,10.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.962139
100165,4.625,0.0,195456.8,0.0,201101,169.0,11.0,31.0,31.0,0,F10Q10238728,4.625,204000.0,802.0,999.0,89.0,0.958122


In [21]:
df_perf_first_loan_age["LABEL"].value_counts().compute()

0    19621
3     4788
1      205
Name: LABEL, dtype: int64