#### data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

In [None]:
train = pd.read_pickle('./data/010_train.pkl')  # train dataset, not cleaned
train.columns

Index(['loanSequenceNumber', 'monthlyReportingPeriod', 'currentActualUpb',
       'currentLoanDelinquencyStatus', 'loanAge',
       'remainingMonthsToLegalMaturity', 'defectSettlementDate',
       'modificationFlag', 'zeroBalanceCode', 'zeroBalanceEffectiveDate',
       'currentInterestRate', 'currentNonInterestBearingUpb',
       'dueDateOfLastPaidInstallment', 'miRecoveries', 'netSaleProceeds',
       'nonMiRecoveries', 'totalExpenses', 'legalCosts',
       'maintenanceAndPreservationCosts', 'taxesAndInsurance',
       'miscellaneousExpenses', 'actualLossCalculation',
       'cumulativeModificationCost', 'stepModificationFlag', 'paymentDeferral',
       'estimatedLoanToValue', 'zeroBalanceRemovalUpb',
       'delinquentAccruedInterest', 'delinquencyDueToDisaster',
       'borrowerAssistanceStatusCode', 'currentMonthModificationCost',
       'interestBearingUpb', 'target', 'upbPctChange', 'nonPmts_3m',
       'delinquencyDueToDisaster_hist', 'interestBearingUpb_ratio',
       'creditS

create target variable, LGD

In [None]:
# denominator cannot be currentActualUpb or interestBearingUpb, mostly 0 post-default
train['LGD'] = train['actualLossCalculation'] / train['zeroBalanceRemovalUpb']
train.loc[train.LGD < 0, 'LGD'] = 0    # trim negative LGD, which is a gain not a loss

defaulted = train.loc[train['LGD'].notna()]  # for training, 964rows

processing

In [None]:
# features not relevant or not known at the prediction time
var1 = ['loanSequenceNumber', 'defectSettlementDate', 'miRecoveries',
       'netSaleProceeds', 'nonMiRecoveries', 'totalExpenses', 'legalCosts', 
       'maintenanceAndPreservationCosts', 'taxesAndInsurance', 'miscellaneousExpenses',
       'actualLossCalculation', 'target','preReliefRefinanceLoanSeqNumber']

# drop features only have one unqiue value
var2 = [col for col in defaulted.columns 
        if defaulted[col].nunique() == 1 and defaulted[col].notna().all()]

# high cardinality, considering 'defaulted' only has 964 rows for training, or use target encoding
var3 = ['seller_name', 'servicer_name']

defaulted = defaulted.drop(columns=np.unique(var1 + var2 + var3))

In [5]:
# nulls are a category by themselves, thus replace with values.
defaulted['modificationFlag'] = defaulted['modificationFlag'].fillna('N')

# too many real nulls, drop
defaulted = defaulted.drop(columns=['cumulativeModificationCost', 'stepModificationFlag', 'areaCode'])

defaulted['paymentDeferral'] = defaulted['paymentDeferral'].fillna('N')
defaulted['delinquencyDueToDisaster'] = defaulted['delinquencyDueToDisaster'].fillna('N')
defaulted['borrowerAssistanceStatusCode'] = defaulted['borrowerAssistanceStatusCode'].fillna('N')
defaulted['currentMonthModificationCost'] = defaulted['currentMonthModificationCost'].fillna(0)
defaulted['upbPctChange'] = defaulted['upbPctChange'].fillna(0)
defaulted['superConformingFlag'] = defaulted['superConformingFlag'].fillna('N')
defaulted['reliefRefinanceIndicator'] = defaulted['reliefRefinanceIndicator'].fillna('N') 

In [6]:
# more cleaning
defaulted['propertyValuationMethod'] = defaulted.propertyValuationMethod.astype(str)
defaulted['zeroBalanceCode'] = defaulted.zeroBalanceCode.astype(str)
defaulted = defaulted.drop(columns='postalCode')

defaulted['estimatedLoanToValue'] = defaulted.estimatedLoanToValue.replace(999, np.nan)
defaulted['creditScore'] = defaulted['creditScore'].replace(to_replace=9999, value= np.nan)
defaulted['originalDebtToIncomeRatio'] = defaulted['originalDebtToIncomeRatio'].replace(to_replace = 999, value = np.nan)

In [7]:
defaulted.to_csv('./data/040_defaulted.csv', index=False)

#### apply the same processing to the test set

In [8]:
test = pd.read_pickle('./data/010_test.pkl')  # dataset, not cleaned
test.columns

Index(['loanSequenceNumber', 'monthlyReportingPeriod', 'currentActualUpb',
       'currentLoanDelinquencyStatus', 'loanAge',
       'remainingMonthsToLegalMaturity', 'defectSettlementDate',
       'modificationFlag', 'zeroBalanceCode', 'zeroBalanceEffectiveDate',
       'currentInterestRate', 'currentNonInterestBearingUpb',
       'dueDateOfLastPaidInstallment', 'miRecoveries', 'netSaleProceeds',
       'nonMiRecoveries', 'totalExpenses', 'legalCosts',
       'maintenanceAndPreservationCosts', 'taxesAndInsurance',
       'miscellaneousExpenses', 'actualLossCalculation',
       'cumulativeModificationCost', 'stepModificationFlag', 'paymentDeferral',
       'estimatedLoanToValue', 'zeroBalanceRemovalUpb',
       'delinquentAccruedInterest', 'delinquencyDueToDisaster',
       'borrowerAssistanceStatusCode', 'currentMonthModificationCost',
       'interestBearingUpb', 'target', 'upbPctChange', 'nonPmts_3m',
       'delinquencyDueToDisaster_hist', 'interestBearingUpb_ratio',
       'creditS

In [None]:
# denominator cannot be currentActualUpb or interestBearingUpb, mostly 0
test['LGD'] = test['actualLossCalculation'] / test['zeroBalanceRemovalUpb']
test.loc[test.LGD < 0, 'LGD'] = 0    # trim negative LGD, which is a gain not a loss

In [10]:
defaulted_test = test.loc[test['LGD'].notna()]

In [None]:
defaulted_test = defaulted_test.drop(columns=np.unique(var1 + var2 + var3))

In [12]:
# nulls are a category by themselves, thus replace with values.
defaulted_test['modificationFlag'] = defaulted_test['modificationFlag'].fillna('N')

# too many real nulls, drop
defaulted_test = defaulted_test.drop(columns=['cumulativeModificationCost', 'stepModificationFlag', 'areaCode'])

defaulted_test['paymentDeferral'] = defaulted_test['paymentDeferral'].fillna('N')
defaulted_test['delinquencyDueToDisaster'] = defaulted_test['delinquencyDueToDisaster'].fillna('N')
defaulted_test['borrowerAssistanceStatusCode'] = defaulted_test['borrowerAssistanceStatusCode'].fillna('N')
defaulted_test['currentMonthModificationCost'] = defaulted_test['currentMonthModificationCost'].fillna(0)
defaulted_test['upbPctChange'] = defaulted_test['upbPctChange'].fillna(0)
defaulted_test['superConformingFlag'] = defaulted_test['superConformingFlag'].fillna('N')
defaulted_test['reliefRefinanceIndicator'] = defaulted_test['reliefRefinanceIndicator'].fillna('N') 

In [13]:
# more cleaning
defaulted_test['propertyValuationMethod'] = defaulted_test.propertyValuationMethod.astype(str)
defaulted_test['zeroBalanceCode'] = defaulted_test.zeroBalanceCode.astype(str)
defaulted_test = defaulted_test.drop(columns='postalCode')

defaulted_test['estimatedLoanToValue'] = defaulted_test.estimatedLoanToValue.replace(999, np.nan)
defaulted_test['creditScore'] = defaulted_test['creditScore'].replace(to_replace=9999, value= np.nan)
defaulted_test['originalDebtToIncomeRatio'] = defaulted_test['originalDebtToIncomeRatio'].replace(to_replace = 999, value = np.nan)

In [14]:
defaulted_test.to_csv('./data/040_defaulted_test.csv', index=False)

#### OOT

In [15]:
full = pd.read_pickle('./data/010_full.pkl')  # full dataset, not cleaned
oot = full[full.monthlyReportingPeriod == 202406]  # oot
oot.columns

Index(['loanSequenceNumber', 'monthlyReportingPeriod', 'currentActualUpb',
       'currentLoanDelinquencyStatus', 'loanAge',
       'remainingMonthsToLegalMaturity', 'defectSettlementDate',
       'modificationFlag', 'zeroBalanceCode', 'zeroBalanceEffectiveDate',
       'currentInterestRate', 'currentNonInterestBearingUpb',
       'dueDateOfLastPaidInstallment', 'miRecoveries', 'netSaleProceeds',
       'nonMiRecoveries', 'totalExpenses', 'legalCosts',
       'maintenanceAndPreservationCosts', 'taxesAndInsurance',
       'miscellaneousExpenses', 'actualLossCalculation',
       'cumulativeModificationCost', 'stepModificationFlag', 'paymentDeferral',
       'estimatedLoanToValue', 'zeroBalanceRemovalUpb',
       'delinquentAccruedInterest', 'delinquencyDueToDisaster',
       'borrowerAssistanceStatusCode', 'currentMonthModificationCost',
       'interestBearingUpb', 'target', 'upbPctChange', 'nonPmts_3m',
       'delinquencyDueToDisaster_hist', 'interestBearingUpb_ratio',
       'creditS

In [16]:
oot['LGD'] = oot['actualLossCalculation'] / oot['zeroBalanceRemovalUpb']
oot.loc[oot.LGD < 0, 'LGD'] = 0    # trim negative LGD, which is a gain not a loss

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oot['LGD'] = oot['actualLossCalculation'] / oot['zeroBalanceRemovalUpb']


In [None]:
oot = oot.drop(columns=np.unique(var1 + var2 + var3))

In [18]:
# nulls are a category by themselves, thus replace with values.
oot['modificationFlag'] = oot['modificationFlag'].fillna('N')

# too many real nulls, drop
oot = oot.drop(columns=['cumulativeModificationCost', 'stepModificationFlag', 'areaCode'])

oot['paymentDeferral'] = oot['paymentDeferral'].fillna('N')
oot['delinquencyDueToDisaster'] = oot['delinquencyDueToDisaster'].fillna('N')
oot['borrowerAssistanceStatusCode'] = oot['borrowerAssistanceStatusCode'].fillna('N')
oot['currentMonthModificationCost'] = oot['currentMonthModificationCost'].fillna(0)
oot['upbPctChange'] = oot['upbPctChange'].fillna(0)
oot['superConformingFlag'] = oot['superConformingFlag'].fillna('N')
oot['reliefRefinanceIndicator'] = oot['reliefRefinanceIndicator'].fillna('N') 

# more cleaning
oot['propertyValuationMethod'] = oot.propertyValuationMethod.astype(str)
oot['zeroBalanceCode'] = oot.zeroBalanceCode.astype(str)
oot = oot.drop(columns='postalCode')

oot['estimatedLoanToValue'] = oot.estimatedLoanToValue.replace(999, np.nan)
oot['creditScore'] = oot['creditScore'].replace(to_replace=9999, value= np.nan)
oot['originalDebtToIncomeRatio'] = oot['originalDebtToIncomeRatio'].replace(to_replace = 999, value = np.nan)

In [19]:
oot.to_csv('./data/040_oot.csv', index=False)