In [1]:
# Import relevant libraries
from scipy import stats
import scipy as sp
import pandas as pd
import numpy as np
import random
import math

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Set seed for any random operations
random.seed(1)

In [2]:
# Read in test dataset (TODO: test data being used is temporary. replace with final test data.)
train_data = pd.read_csv("sorted_hazard_train0314.csv")
test_data = pd.read_csv("sorted_hazard_test0314.csv")
print "Number of test points:", test_data.shape[0]

Number of test points: 5481


In [None]:
# Do the usual filtering and matching to train data
c = train.columns.tolist()
c.remove('LoanStatus')
c.remove('ChargeOffDate')
c.remove('GrossChargeOffAmount')
c.remove('BorrZip')
c.remove('CDC_Zip')
c.remove('BorrCity')

c.remove('BorrState')
c.remove('CDC_State')
c.remove('ThirdPartyLender_State')
c.remove('ProjectState')

c.remove('CDC_City')
c.remove('ProjectCounty')
c.remove('ThirdPartyLender_City')
c.remove('ApprovalDate')

print c

x_train = train[c]
print x_train.columns.tolist()
x_train = pd.get_dummies(x_train).astype(float)
print x_train.shape

x_test = test[c]
print x_test.columns.tolist()
x_test = pd.get_dummies(x_test).astype(float)
print x_test.shape

In [None]:
# to get consistent feature dimensions for both train and test dataset
def add_missing_dummy_columns(d1, d2):
    missing_cols = set(d1.columns) - set(d2.columns)
    for c in missing_cols:
        d2[c] = 0
    return d2

print 'before fix columns: '
print x_train.shape
print x_test.shape

def fix_columns(x_train, x_test):  

    x_test = add_missing_dummy_columns(x_train, x_test)

    extra_cols = set(x_test.columns) - set(x_train.columns)
    x_test = x_test[x_train.columns]
    return x_test

x_test = fix_columns(x_train, x_test)
print 'after fix columns: '

In [None]:
# Filter out certain attributes from a loan covariate for loss model
def filter_loan(loan):  
    after_loss = loan.columns.tolist()
    after_loss.remove('GrossApproval')
    after_loss.remove('Log_GrossApproval_Norm')
    after_loss.remove('Default?')
    return loan[after_loss]

In [12]:
# Number of simulations we want
N = 1000

In [3]:
# Predicts if loan is defaulted by time t in years
def is_defaulted_by(loan, years):
    # GIVEN LAMBDA
    default_prob = # Hand-calculate or calculate from hazard model
    u = np.random.uniform(0, 1)
    return (u <= default_prob)

SyntaxError: invalid syntax (<ipython-input-3-aa393f9270e9>, line 4)

In [None]:
# Calculates loss of a loan using loss distribution model,
# assuming model is stored in loss_model
def calculate_loss(loan):
    gross_approval = loan['GrossApproval']
    filtered_loan = filter_loan(loan)
    pred = max(0, loss_model.predict(filtered_loan))
    return pred * gross_approval

In [None]:
# Simulates total loss of 500 loans for a t year horizon
def simulate_total_loss(N, years):
    total_losses = []
    total_loan_amounts = []
    for n in range(0, N):
        sampled_loans = test_data.sample(500)
        cur_loss = 0.0
        
        for i in range(0, 500):
            cur_loan = sampled_loans[i:i+1]
            if is_defaulted_by(cur_loan, years=1):
                loan_loss = calculate_loss(cur_loan)
                cur_loss += loan_loss
        
        total_losses.append(cur_loss)
        total_loan_amounts.append(sampled_loans['GrossApproval'].sum())
    
    return total_losses, total_loan_amounts

In [None]:
# QUESTION: Should we remove samples where no loans have defaulted? Are we focused on 
# only loan pools that have defaults or all loan pools in general?

In [None]:
# Estimate for 1 year and plot histogram
total_1_year_losses, total_1_year_loan_amounts = simulate_total_loss(N, 1)
sns.distplot(total_1_year_losses)

In [None]:
# Repeat for 5 year and plot histogram
total_5_year_losses, total_5_year_loan_amounts = simulate_total_loss(N, 5)
sns.distplot(total_5_year_losses)

In [71]:
# Calculate VaR of sorted list of values
def get_VAR(sorted_losses, level):
    index = int((1-level) * 500)
    return sorted_losses[index]

In [72]:
# Calculate Average VaR of sorted list of values
def get_avg_VAR(sorted_losses, level):
    index = int((1-level) * 500)
    return np.mean(sorted_losses[:index+1])

In [None]:
# Calculate confidence interval
def get_conf_interval(sample, level=0.95):
    n, min_max, mean, var, skew, kurt = stats.describe(sample)
    std = math.sqrt(var)
    return stats.norm.interval(level, loc=mean, scale=std)

In [None]:
# Bootstrap to calculate VaR and Average VaR with confidence intervals
def bootstrap_vars(N, total_losses, level=0.95):
    losses_df = pd.Series(total_losses)
    VaR_samples = []
    AVaR_samples = []
    
    for i in range(0, N):
        bootstrap_sample = sorted(losses_df.sample(500, replace=True).tolist())
        VaR = get_VAR(bootstrap_sample, level)
        AVaR = get_avg_VAR(bootstrap_sample, level)
        VaR_samples.append(VaR)
        AVaR_samples.append(AVaR)
    
    print "VaR estimate (mean):", np.mean(VaR_samples)
    print "Average VaR estimate (mean):", np.mean(AVaR_samples)
    print "VaR confidence interval:", get_conf_interval(VaR_samples, level)
    print "AVaR confidence interval:", get_conf_interval(AVaR_samples, level)

In [None]:
# Number of bootstrap simulations
B = 1000

# Print results
print "1-year loss at 95% level:"
bootstrap_vars(B, total_1_year_losses, 0.95)
print ""
print "1-year loss at 99% level:"
bootstrap_vars(B, total_1_year_losses, 0.99)
print ""
print "5-year loss at 95% level:"
bootstrap_vars(B, total_5_year_losses, 0.95)
print ""
print "5-year loss at 99% level:"
bootstrap_vars(B, total_5_year_losses, 0.99)

In [None]:
# Find junior [5%, 15%] and senior tranches [15%, 100%] for 1 year
junior_tranche = []
senior_tranche = []

for i in range(0, N):
    loan_amount = total_1_year_loan_amounts[i]
    loss_amount = total_1_year_losses[i]
    
    loss_percentage = float(loss_amount) / loan_amount
    junior_loss_percentage = 0.0
    senior_loss_percentage = 0.0
    
    # Loss for junior tranche
    if loss_percentage >= 0.05:
        if loss_percentage >= 0.15:
            junior_loss_percentage = 1.0  # Lost it all
        else:
            junior_loss_percentage = (loss_percentage - 0.05) / (0.15 - 0.05)
    
    # Loss for senior tranche
    if loss_percentage >= 0.15:
        if loss_percentage >= 1.0:
            senior_loss_percentage = 1.0  # Lost it all
        else:
            senior_loss_percentage = (loss_percentage - 0.15) / (1.0 - 0.15)
        
    junior_tranche.append(junior_loss_percentage)
    senior_tranche.append(senior_loss_percentage)

In [None]:
# Plot junior histogram
j = sns.distplot(junior_tranche)
sns.despine()
j.axes.set_title('Junior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Plot senior histogram
s = sns.distplot(senior_tranche)
sns.despine()
s.axes.set_title('Senior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Repeat for 5 year
junior_tranche = []
senior_tranche = []

for i in range(0, N):
    loan_amount = total_5_year_loan_amounts[i]
    loss_amount = total_5_year_losses[i]
    
    loss_percentage = float(loss_amount) / loan_amount
    junior_loss_percentage = 0.0
    senior_loss_percentage = 0.0
    
    # Loss for junior tranche
    if loss_percentage >= 0.05:
        if loss_percentage >= 0.15:
            junior_loss_percentage = 1.0  # Lost it all
        else:
            junior_loss_percentage = (loss_percentage - 0.05) / (0.15 - 0.05)
    
    # Loss for senior tranche
    if loss_percentage >= 0.15:
        if loss_percentage >= 1.0:
            senior_loss_percentage = 1.0  # Lost it all
        else:
            senior_loss_percentage = (loss_percentage - 0.15) / (1.0 - 0.15)
        
    junior_tranche.append(junior_loss_percentage)
    senior_tranche.append(senior_loss_percentage)

In [None]:
# Plot junior histogram
j = sns.distplot(junior_tranche)
sns.despine()
j.axes.set_title('Junior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Plot senior histogram
s = sns.distplot(senior_tranche)
sns.despine()
s.axes.set_title('Senior Tranche', fontsize=20, alpha=0.7)