In [2]:
# Import relevant libraries
from scipy import stats
import scipy as sp
import pandas as pd
import numpy as np
import random
import math

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Set seed for any random operations.
# The stdlib seed alone is not enough: np.random.rand() and pandas' .sample()
# (when no random_state is given) draw from NumPy's global generator, so it
# must be seeded too for the simulations to be reproducible.
random.seed(1)
np.random.seed(1)

In [4]:
# Read in the train, test and validation splits from CSV files in the working directory.
# (TODO: test data being used is temporary. replace with final test data.)

train = pd.read_csv("sorted_train0314.csv")
test = pd.read_csv("sorted_test0314.csv")
val = pd.read_csv("sorted_val0314.csv")

In [5]:
# Number of simulations we want.
# Passed to simulate_total_loss and reused as the loop bound when the
# per-simulation tranche losses are computed further down.
N = 1000

In [6]:
# event time simulation
def get_default_time(loan, hazard_model, thre):
    """Sample an event (default) time for ``loan`` from its survival curve.

    A uniform random number ``u`` is drawn and the curve returned by
    ``hazard_model(loan)`` is binary-searched for a point whose value lies
    within ``thre`` of ``u``.

    Parameters
    ----------
    loan : object passed straight through to ``hazard_model``.
    hazard_model : callable returning an object with ``.index`` (the times)
        and ``.values`` (the curve).  Assumes ``.values`` is 2-D with a single
        column and that the curve is non-increasing along the index, as a
        survival function is - TODO confirm against the model in use.
    thre : float, maximum absolute distance between the curve value and ``u``
        for a grid point to be accepted.

    Returns
    -------
    The index value (time) of the first matching grid point visited by the
    search, or ``None`` when no grid point is within ``thre`` of ``u``.
    """
    # sample from a uniform distribution
    u = np.random.rand()
    survival = hazard_model(loan)  # get the distribution
    times = np.array(survival.index.tolist())
    times = np.reshape(times, (len(times), 1))
    probs = np.array(survival.values.tolist())
    # column 0: time, column 1: curve value at that time
    dist = np.concatenate((times, probs), axis=1)

    def search(u, dist, thre):
        lo = 0
        hi = len(dist) - 1
        # `<=` so the last remaining candidate (including either end of the
        # grid) is still examined before giving up.
        while lo <= hi:
            # floor division keeps the index an int on Python 2 and Python 3
            mid = lo + (hi - lo) // 2
            gap = dist[mid][1] - u
            if abs(gap) < thre:
                return dist[mid][0]
            if gap > 0:
                # curve is still above u: the match lies at a later time
                lo = mid + 1
            else:
                hi = mid - 1
        return None

    return search(u, dist, thre)

# to use :
# t = get_default_time(x_train_death[15:16].drop(['DaysToDefault','Default?'], axis = 1),\
#                      cph.predict_survival_function, 0.001)
# returns number of days - can be saved in duration column for predicting loss ratio

In [None]:
# Calculates loss of a loan using loss distribution model
def calculate_loss(loan):
    """Return the loss for ``loan``.

    Placeholder: ``loan`` is currently ignored and a fixed amount is returned
    until the loss distribution model is wired in.
    """
    # PLUG IN LOSS DISTRIBUTION MODEL HERE
    placeholder_loss = 100000
    return placeholder_loss

In [None]:
# Simulates total loss of 500 loans for a t year horizon
def simulate_total_loss(N, years, pool_size=500):
    """Simulate the total loss of randomly drawn loan pools.

    Parameters
    ----------
    N : int, number of simulated pools.
    years : horizon forwarded to ``is_defaulted_by`` for every loan.
    pool_size : int, number of loans sampled per pool (default 500).

    Returns
    -------
    (total_losses, total_loan_amounts) : two lists of length ``N`` holding,
    per simulated pool, the summed loss of the loans flagged as defaulted and
    the summed ``'GrossApproval'`` of all sampled loans.

    Notes
    -----
    Loans are sampled from the module-level ``test`` frame.
    NOTE(review): ``is_defaulted_by`` is not defined in the visible cells and
    must be provided before this function is called.
    """
    total_losses = []
    total_loan_amounts = []
    for _ in range(N):
        # `test` is the frame read from sorted_test0314.csv; the previously
        # referenced `test_data` is not defined anywhere in the visible cells.
        sampled_loans = test.sample(pool_size)
        pool_loss = 0.0

        for i in range(pool_size):
            # one-row frame for the i-th sampled loan (positional slice)
            cur_loan = sampled_loans.iloc[i:i + 1]
            # honour the requested horizon instead of a fixed 1 year
            if is_defaulted_by(cur_loan, years=years):
                pool_loss += calculate_loss(cur_loan)

        total_losses.append(pool_loss)
        total_loan_amounts.append(sampled_loans['GrossApproval'].sum())

    return total_losses, total_loan_amounts

In [None]:
# QUESTION: Should we remove samples where no loans have defaulted? Are we focused on 
# only loan pools that have defaults or all loan pools in general?

In [None]:
# Estimate for 1 year and plot histogram.
# simulate_total_loss returns two lists with one entry per simulation:
# the total loss and the total 'GrossApproval' amount of the sampled loans.
total_1_year_losses, total_1_year_loan_amounts = simulate_total_loss(N, 1)
sns.distplot(total_1_year_losses)

In [None]:
# Repeat for 5 year and plot histogram.
# Same two per-simulation lists as the 1-year run: total losses and total
# 'GrossApproval' amounts.
total_5_year_losses, total_5_year_loan_amounts = simulate_total_loss(N, 5)
sns.distplot(total_5_year_losses)

In [71]:
# Calculate VaR of sorted list of values
def get_VAR(sorted_losses, level):
    """Return the VaR entry of an already sorted sequence of losses.

    The element at position ``int((1 - level) * n)`` counted from the front
    is returned, where ``n = len(sorted_losses)``.

    NOTE(review): for the result to be the ``level`` VaR of losses the
    sequence has to be ordered with the largest losses first - confirm that
    callers sort accordingly.

    Parameters
    ----------
    sorted_losses : sequence of numbers, pre-sorted by the caller.
    level : float confidence level, e.g. 0.95.
    """
    # use the actual sample size rather than a fixed 500 so the selected
    # position scales with the input
    n = len(sorted_losses)
    index = int((1 - level) * n)
    return sorted_losses[index]

In [72]:
# Calculate Average VaR of sorted list of values
def get_avg_VAR(sorted_losses, level):
    """Return the average of the leading tail of a sorted sequence of losses.

    Averages the elements from the front up to and including position
    ``int((1 - level) * n)``, where ``n = len(sorted_losses)``.

    NOTE(review): for the result to be the average VaR (expected shortfall)
    of losses the sequence has to be ordered with the largest losses first -
    confirm that callers sort accordingly.

    Parameters
    ----------
    sorted_losses : sequence of numbers, pre-sorted by the caller.
    level : float confidence level, e.g. 0.95.
    """
    # use the actual sample size rather than a fixed 500 so the tail length
    # scales with the input
    n = len(sorted_losses)
    index = int((1 - level) * n)
    return np.mean(sorted_losses[:index + 1])

In [None]:
# Calculate confidence interval
def get_conf_interval(sample, level=0.95):
    """Return a normal-distribution interval fitted to ``sample``.

    The interval is centred on the sample mean, scaled by the square root of
    the variance reported by ``stats.describe``, and covers ``level`` of the
    fitted normal distribution.  Returned as a ``(low, high)`` pair.
    """
    summary = stats.describe(sample)
    spread = math.sqrt(summary.variance)
    return stats.norm.interval(level, loc=summary.mean, scale=spread)

In [None]:
# Bootstrap to calculate VaR and Average VaR with confidence intervals
def bootstrap_vars(N, total_losses, level=0.95):
    """Bootstrap VaR and average VaR of ``total_losses`` and print a summary.

    Parameters
    ----------
    N : int, number of bootstrap resamples.
    total_losses : sequence of simulated total losses.
    level : float, used both as the VaR level and as the coverage of the
        printed confidence intervals.

    Each resample draws 500 losses with replacement.  Prints the mean of the
    resampled VaR / average VaR values and their confidence intervals;
    returns nothing.
    """
    losses_df = pd.Series(total_losses)
    VaR_samples = []
    AVaR_samples = []

    for _ in range(N):
        # Largest losses first: get_VAR / get_avg_VAR pick position
        # (1 - level) * n counted from the front, so with an ascending sort
        # they would report the smallest losses instead of the loss tail.
        bootstrap_sample = sorted(
            losses_df.sample(500, replace=True).tolist(), reverse=True
        )
        VaR_samples.append(get_VAR(bootstrap_sample, level))
        AVaR_samples.append(get_avg_VAR(bootstrap_sample, level))

    # Single-argument print(...) runs on both Python 2 and Python 3.
    print("VaR estimate (mean): {}".format(np.mean(VaR_samples)))
    print("Average VaR estimate (mean): {}".format(np.mean(AVaR_samples)))
    print("VaR confidence interval: {}".format(get_conf_interval(VaR_samples, level)))
    print("AVaR confidence interval: {}".format(get_conf_interval(AVaR_samples, level)))

In [None]:
# Number of bootstrap simulations
B = 1000

# Print results: one (header, simulated losses, VaR level) entry per report.
reports = [
    ("1-year loss at 95% level:", total_1_year_losses, 0.95),
    ("1-year loss at 99% level:", total_1_year_losses, 0.99),
    ("5-year loss at 95% level:", total_5_year_losses, 0.95),
    ("5-year loss at 99% level:", total_5_year_losses, 0.99),
]

for position, (header, losses, var_level) in enumerate(reports):
    if position:
        print("")  # blank line between consecutive reports
    # Single-argument print(...) runs on both Python 2 and Python 3.
    print(header)
    bootstrap_vars(B, losses, var_level)

In [None]:
# Find junior [5%, 15%] and senior tranches [15%, 100%] for 1 year
def tranche_loss_fraction(loss_percentage, attachment, detachment):
    """Return the fraction of the [attachment, detachment] tranche that is lost.

    ``loss_percentage`` is the pool loss as a fraction of the pool amount.
    Losses below ``attachment`` leave the tranche untouched (0.0), losses at
    or above ``detachment`` wipe it out (1.0), and losses in between are
    scaled linearly.
    """
    if loss_percentage >= detachment:
        return 1.0  # Lost it all
    if loss_percentage >= attachment:
        return (loss_percentage - attachment) / (detachment - attachment)
    return 0.0


junior_tranche = []
senior_tranche = []

# Walk the simulated results themselves instead of range(N), so this cell
# stays correct if N is changed after the simulation cell has been run.
for loss_amount, loan_amount in zip(total_1_year_losses, total_1_year_loan_amounts):
    loss_percentage = float(loss_amount) / loan_amount

    junior_tranche.append(tranche_loss_fraction(loss_percentage, 0.05, 0.15))
    senior_tranche.append(tranche_loss_fraction(loss_percentage, 0.15, 1.0))

In [None]:
# Plot junior histogram (1-year horizon): distribution of the junior tranche
# loss fractions, one value per simulation.
j = sns.distplot(junior_tranche)
sns.despine()
j.axes.set_title('Junior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Plot senior histogram (1-year horizon): distribution of the senior tranche
# loss fractions, one value per simulation.
s = sns.distplot(senior_tranche)
sns.despine()
s.axes.set_title('Senior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Repeat for 5 year: junior [5%, 15%] and senior [15%, 100%] tranche losses.
# NOTE: this rebinds junior_tranche / senior_tranche, replacing the 1-year
# values computed earlier.
junior_tranche = []
senior_tranche = []

# Walk the simulated results themselves instead of range(N), so this cell
# stays correct if N is changed after the simulation cell has been run.
for loss_amount, loan_amount in zip(total_5_year_losses, total_5_year_loan_amounts):
    loss_percentage = float(loss_amount) / loan_amount

    # Loss for junior tranche: nothing below 5%, everything at or above 15%,
    # linear in between.
    if loss_percentage >= 0.15:
        junior_loss_percentage = 1.0  # Lost it all
    elif loss_percentage >= 0.05:
        junior_loss_percentage = (loss_percentage - 0.05) / (0.15 - 0.05)
    else:
        junior_loss_percentage = 0.0

    # Loss for senior tranche: nothing below 15%, everything at or above
    # 100%, linear in between.
    if loss_percentage >= 1.0:
        senior_loss_percentage = 1.0  # Lost it all
    elif loss_percentage >= 0.15:
        senior_loss_percentage = (loss_percentage - 0.15) / (1.0 - 0.15)
    else:
        senior_loss_percentage = 0.0

    junior_tranche.append(junior_loss_percentage)
    senior_tranche.append(senior_loss_percentage)

In [None]:
# Plot junior histogram (5-year horizon): junior_tranche was rebuilt from the
# 5-year simulation in the preceding cell.
j = sns.distplot(junior_tranche)
sns.despine()
j.axes.set_title('Junior Tranche', fontsize=20, alpha=0.7)

In [None]:
# Plot senior histogram (5-year horizon): senior_tranche was rebuilt from the
# 5-year simulation in the preceding cell.
s = sns.distplot(senior_tranche)
sns.despine()
s.axes.set_title('Senior Tranche', fontsize=20, alpha=0.7)