Perform data cleaning on the test set.

In [2]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scorecardpy as sc
import seaborn as sns
from scipy.stats import entropy
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# Load the test split from its pickle file.
test = pd.read_pickle(filepath_or_buffer='./data/010_test.pkl')

#### Missing values

In [None]:
# Impute sentinel-coded missing values with the median of the *observed* values.
# 9999 (creditScore) and 999 (originalDebtToIncomeRatio) are the codes this cell
# treats as missing. They are excluded before taking the median; otherwise the
# sentinel itself would pull the imputed value upward.
# NOTE(review): for strict train/test consistency these medians should presumably
# come from the training set -- confirm against the training notebook.
test['creditScore'] = test['creditScore'].replace(
    to_replace=9999,
    value=test.loc[test['creditScore'] != 9999, 'creditScore'].median())

test['originalDebtToIncomeRatio'] = test['originalDebtToIncomeRatio'].replace(
    to_replace=999,
    value=test.loc[test['originalDebtToIncomeRatio'] != 999,
                   'originalDebtToIncomeRatio'].median())

# due to high mutual information value w/ postalCode
test = test.drop(columns='areaCode')

# due to 0.98 correlation with originalLoanToValue
test = test.drop(columns='originalCombinedLoanToValue')

# not useful
test = test.drop(columns=['sellerName', 'servicerName'])

# superConformingFlag
# according to the user guide, nulls are actually Not Super Conforming
test['superConformingFlag'] = test['superConformingFlag'].fillna('N')

# this is a seq number which has no meaning & 98% are missing, so drop it
# (reassignment instead of inplace=True, consistent with the other drops in this cell)
test = test.drop(columns='preReliefRefinanceLoanSeqNumber')

# nulls are actually 'N', as opposed to 'Y' for this feature
test['reliefRefinanceIndicator'] = test['reliefRefinanceIndicator'].fillna('N')

# miscellaneous
# '9' means Not available or not applicable, replace for readability
test['programIndicator'] = test['programIndicator'].replace(to_replace='9', value='NA')
test['propertyValuationMethod'] = test['propertyValuationMethod'].replace(9, 'NA')

In [None]:
# These two columns have been used to create the target (see document), so remove them.
test = test.drop(['zeroBalanceCode', 'zeroBalanceEffectiveDate'], axis='columns')

# Nulls form a category of their own in these columns; give each an explicit label.
null_labels = {
    'modificationFlag': 'N',
    'stepModificationFlag': 'NotModified',
    'paymentDeferral': 'N',
    'borrowerAssistanceStatusCode': 'N',
    'delinquencyDueToDisaster': 'N',
}
for flag_col, label in null_labels.items():
    test[flag_col] = test[flag_col].fillna(label)

# Not useful for now; dropped for ease of use.
test = test.drop(
    ['defectSettlementDate', 'miRecoveries', 'netSaleProceeds',
     'nonMiRecoveries', 'actualLossCalculation', 'zeroBalanceRemovalUpb',
     'delinquentAccruedInterest'],
    axis='columns',
)

test = test.drop(
    ['totalExpenses', 'legalCosts', 'maintenanceAndPreservationCosts',
     'taxesAndInsurance', 'miscellaneousExpenses'],
    axis='columns',
)

# Peek at the rows around the 4th record that has a cumulativeModificationCost.
has_cum_cost = test['cumulativeModificationCost'].notna()
idx = test.index[has_cum_cost.to_numpy()][3]
peek_cols = ['loanSequenceNumber', 'monthlyReportingPeriod',
             'modificationFlag', 'paymentDeferral',
             'cumulativeModificationCost', 'currentMonthModificationCost']
test.loc[idx - 6: idx + 4, peek_cols]

Unnamed: 0,loanSequenceNumber,monthlyReportingPeriod,modificationFlag,paymentDeferral,cumulativeModificationCost,currentMonthModificationCost
7458,F14Q10190473,202101,P,N,-953.02,-27.15
7459,F14Q10190515,202101,N,N,,


In [None]:
# Drop the cumulative cost column and treat a missing monthly modification cost as 0.
test = test.drop('cumulativeModificationCost', axis='columns')
test = test.assign(
    currentMonthModificationCost=test['currentMonthModificationCost'].fillna(0)
)

In [None]:
# Inspect the date/term related columns for index labels 43 through 76.
date_term_cols = ['loanSequenceNumber',
                  'remainingMonthsToLegalMaturity', 'dueDateOfLastPaidInstallment',
                  'firstPaymentDate', 'maturityDate', 'originalLoanTerm']
test.loc[43:76, date_term_cols]

In [None]:
# this variable's information is captured by maturityDate, so drop
test = test.drop(columns=['dueDateOfLastPaidInstallment'])

# estimatedLoanToValue 999 means Unknown
# if currentActualUpb==0, then it is likely Unknown
# not vice versa
eltv_unknown = test['estimatedLoanToValue'] == 999
upb_is_zero = test['currentActualUpb'] == 0
# vectorized .sum() instead of the builtin sum(), which iterates row by row
print(eltv_unknown.sum())
print(upb_is_zero.sum())
print((upb_is_zero & eltv_unknown).sum())

# Impute the Unknown code with the median of the known values only; including
# the 999 sentinel in the median would bias the imputed value upward.
# NOTE(review): presumably the training-set median should be used here for
# train/test consistency -- confirm against the training notebook.
test['estimatedLoanToValue'] = test['estimatedLoanToValue'].replace(
    to_replace=999,
    value=test.loc[~eltv_unknown, 'estimatedLoanToValue'].median())

# clean the variables just created
test['upbPctChange'] = test['upbPctChange'].fillna(0)

test.isnull().sum()

#### Outliers

In [None]:
# Names of all numeric-dtype columns in the test frame.
numeric_cols = test.select_dtypes(include=np.number).columns

def vis_features_hist(df, cate_cols, first):
    """Plot up to 15 consecutive columns of ``df`` on a 5x3 grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. If it contains a ``'target'`` column,
        histograms are split by it (``hue='target'``); otherwise they are
        drawn without a hue.
    cate_cols : collection of str
        Column names to draw with ``sns.countplot``; every other column is
        drawn with ``sns.histplot`` (with a KDE overlay).
    first : int
        Position of the first column to plot; columns
        ``df.columns[first:first + 15]`` are shown.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``. When the requested slice
        holds no columns, nothing is drawn and the function returns at once
        (the original raised ``UnboundLocalError`` on the loop variable here).
    """
    n_rows, n_cols = 5, 3
    capacity = n_rows * n_cols
    columns = list(df.columns[first:first + capacity])
    if not columns:
        # Nothing to plot -- avoid creating an empty figure.
        return

    # Only split histograms by target when the column is actually available.
    hue = 'target' if 'target' in df.columns else None

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 10))
    flat_axes = axes.ravel()  # row-major order, same as axes[i // 3, i % 3]
    for ax, column in zip(flat_axes, columns):
        if column in cate_cols:
            sns.countplot(data=df, x=column, ax=ax)
        else:
            sns.histplot(data=df, x=column, hue=hue, kde=True, ax=ax)
        ax.set_ylabel('')  # Remove y-axis label

    for unused_ax in flat_axes[len(columns):]:
        fig.delaxes(unused_ax)  # delete empty subplots
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.2, hspace=0.4)  # reduce space between subplots
    plt.show()
    
# First page of plots: numeric columns at positions 0-14, none treated as categorical.
vis_features_hist(df=test[numeric_cols], cate_cols=[], first=0)

In [None]:
# Second page of plots: numeric columns from position 15 onward.
vis_features_hist(df=test[numeric_cols], cate_cols=[], first=15)

In [None]:
# Outlier removal: keep rows whose interest-bearing UPB ratio exceeds 0.4
# and whose currentActualUpb is below 1,250,000.
ratio_in_range = test['interestBearingUpb_ratio'] > 0.4
upb_in_range = test['currentActualUpb'] < 1250000
test = test.loc[ratio_in_range & upb_in_range]

test.isna().sum()

In [None]:
# Persist the cleaned test set as a pickle file.
test.to_pickle(path="./data/011_test_cleaned.pkl")

#### Apply the existing WoE binning to the test set

In [None]:
# Reload the cleaned test set and check which columns still contain nulls.
test = pd.read_pickle(filepath_or_buffer="./data/011_test_cleaned.pkl")
test.isna().any()

loanSequenceNumber                False
monthlyReportingPeriod            False
currentActualUpb                  False
currentLoanDelinquencyStatus      False
loanAge                           False
remainingMonthsToLegalMaturity    False
modificationFlag                  False
currentInterestRate               False
currentNonInterestBearingUpb      False
stepModificationFlag              False
paymentDeferral                   False
estimatedLoanToValue              False
delinquencyDueToDisaster          False
borrowerAssistanceStatusCode      False
currentMonthModificationCost      False
interestBearingUpb                False
target                            False
upbPctChange                      False
nonPmts_3m                        False
delinquencyDueToDisaster_hist     False
interestBearingUpb_ratio          False
creditScore                       False
firstPaymentDate                  False
firstTimeHomebuyerFlag            False
maturityDate                      False


In [10]:
# Identifier columns are dropped before binning.
test = test.drop(columns=['loanSequenceNumber', 'monthlyReportingPeriod'])

# for binning to work well:
test['target'] = test['target'].astype(int)

# Convert object (string) and category type columns to string
for col in test.select_dtypes(include=['object', 'category']).columns:
    test[col] = test[col].astype(str)

# these are below 0.02
# NOTE(review): presumably the information value measured on the training set
# -- confirm against the training notebook.
# (named `low_value_cols` rather than `vars`, which shadowed the Python builtin)
low_value_cols = [
    'delinquencyDueToDisaster_hist',
    'superConformingFlag',
    'programIndicator',
    'channel',
    'currentMonthModificationCost',
    'interestBearingUpb_ratio',
    'reliefRefinanceIndicator',
    'paymentDeferral',
    'stepModificationFlag',
    'modificationFlag',
    'numberOfUnits',
    'borrowerAssistanceStatusCode',
    'currentNonInterestBearingUpb',
    'delinquencyDueToDisaster',
    'currentLoanDelinquencyStatus',
]
test = test.drop(columns=low_value_cols)

# these only have 1 unique value
single_value_cols = [
    'prepaymentPenaltyMortgageFlag',
    'amortizationType',
    'interestOnlyIndicator',
]
test = test.drop(columns=single_value_cols)

In [11]:
# Load the saved WoE bin definitions and apply them to the test set.
# `pickle` is imported in the notebook's top import cell.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load bin files that this project produced itself.
with open('./data/010_bins.pkl', 'rb') as bins_file:
    bins = pickle.load(bins_file)

test_woe = sc.woebin_ply(test, bins)
test_woe.head()

[INFO] converting into woe values ...
Woe transformating on 369339 rows and 26 columns in 00:00:15


Unnamed: 0,target,creditScore_woe,firstTimeHomebuyerFlag_woe,originalDebtToIncomeRatio_woe,firstPaymentDate_woe,currentActualUpb_woe,postalCode_woe,propertyState_woe,originalUpb_woe,loanAge_woe,...,currentInterestRate_woe,originalLoanTerm_woe,propertyType_woe,upbPctChange_woe,maturityDate_woe,interestBearingUpb_woe,nonPmts_3m_woe,loanPurpose_woe,remainingMonthsToLegalMaturity_woe,originalLoanToValue_woe
3,0,0.243703,-0.230913,-0.923512,0.914414,-0.246671,-0.232034,-0.362608,-0.293348,0.781824,...,1.136875,0.14583,-0.261624,-1.706176,1.017115,-0.256712,-1.474355,-0.480342,0.881992,-0.146579
4,0,0.243703,-0.230913,-0.923512,0.914414,-0.246671,-0.232034,-0.362608,-0.293348,0.781824,...,1.136875,0.14583,-0.261624,-1.706176,1.017115,-0.256712,-1.474355,-0.480342,0.881992,-0.146579
13,0,0.243703,-0.230913,-0.923512,0.914414,-0.246671,-0.232034,-0.362608,-0.293348,0.781824,...,1.136875,0.14583,-0.261624,-1.706176,1.017115,-0.256712,-1.474355,-0.480342,0.881992,-0.146579
14,0,0.243703,-0.230913,-0.923512,0.914414,-0.246671,-0.232034,-0.362608,-0.293348,0.781824,...,1.136875,0.14583,-0.261624,-1.706176,1.017115,-0.256712,-1.474355,-0.480342,0.881992,-0.146579
16,0,0.243703,-0.230913,-0.923512,0.914414,-0.246671,-0.232034,-0.362608,-0.293348,0.781824,...,1.136875,0.14583,-0.261624,-1.706176,1.017115,-0.256712,-1.474355,-0.480342,0.881992,-0.146579


#### Filter according to correlation, consistent with training

In [12]:
# Remove the three WoE features listed below, then persist the WoE-encoded test set.
test_woe = test_woe.drop(
    ['interestBearingUpb_woe', 'postalCode_woe', 'originalInterestRate_woe'],
    axis='columns',
)

test_woe.to_pickle(path="./data/011_test_woe.pkl")

In [13]:
# Display the column labels currently present in test_woe.
test_woe.columns

Index(['target', 'creditScore_woe', 'firstTimeHomebuyerFlag_woe',
       'originalDebtToIncomeRatio_woe', 'firstPaymentDate_woe',
       'currentActualUpb_woe', 'propertyState_woe', 'originalUpb_woe',
       'loanAge_woe', 'miCancellationIndicator_woe', 'occupancyStatus_woe',
       'numberOfBorrowers_woe', 'propertyValuationMethod_woe',
       'estimatedLoanToValue_woe', 'mortgageInsurancePct_woe',
       'currentInterestRate_woe', 'originalLoanTerm_woe', 'propertyType_woe',
       'upbPctChange_woe', 'maturityDate_woe', 'nonPmts_3m_woe',
       'loanPurpose_woe', 'remainingMonthsToLegalMaturity_woe',
       'originalLoanToValue_woe'],
      dtype='object')