# Confidence Intervals for Various Quantities

### Section 7.2


### 95% Confidence Interval for Default Rate of Borrowers With Annual Income Over $100,000


In [1]:
import pandas as pd
import numpy as np
from main import preprocess_df
from collections import defaultdict as dd


# Display every column when a frame is rendered instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Read the raw loan export. low_memory=False makes pandas parse each column in
# a single pass rather than in chunks; the file is read as latin1.
my_df = pd.read_csv(
    "./datasets/lc_data_2007_to_2018.csv",
    encoding="latin1",
    nrows=100000,  # only looking at 100k rows right now for performance
    low_memory=False,
)

# preprocess_df is imported from the project's `main` module; its cleaning
# steps are not visible in this notebook. Later cells read `annual_inc`,
# `did_default` and `int_rate` from the frame it returns.
cleaned_df = preprocess_df(my_df)

In [2]:
from scipy.stats import t
import math

# Significance level: alpha = 0.05 gives a two-sided 95% interval.
alpha = 0.05

# Sample: borrowers whose annual income exceeds $100,000.
high_inc_df = cleaned_df[cleaned_df["annual_inc"] > 100000]
n = high_inc_df.shape[0]

# Point estimate and sample standard deviation of the default indicator.
xbar = high_inc_df["did_default"].mean()
s = high_inc_df["did_default"].std()

# One-sample t interval: xbar +/- t_{1 - alpha/2, n - 1} * s / sqrt(n).
ci1_degrees_of_freedom = n - 1
critical_t_val = t.ppf(1 - alpha / 2, df=ci1_degrees_of_freedom)
standard_error = s / math.sqrt(n)
margin_of_error = critical_t_val * standard_error

lower, upper = xbar - margin_of_error, xbar + margin_of_error

print(f"95% CI: [{round(lower, 3)}, {round(upper, 3)}]")

95% CI: [0.148, 0.159]


### 95% Confidence Interval for the Difference Between the Mean Interest Rates of Defaulted and Non-Defaulted Loans


In [None]:
import scipy.stats as stats

# Split the loans by outcome; `did_default` is used directly as a boolean row mask.
# `alpha`, `t` and `math` are defined/imported in the previous cell.
defaulted_loans_df = cleaned_df.loc[cleaned_df["did_default"], :]
non_defaulted_loans_df = cleaned_df.loc[~cleaned_df["did_default"], :]

defaulted_xbar = defaulted_loans_df["int_rate"].mean()
non_defaulted_xbar = non_defaulted_loans_df["int_rate"].mean()

n = len(defaulted_loans_df)
m = len(non_defaulted_loans_df)

s_d = defaulted_loans_df["int_rate"].std()
s_n = non_defaulted_loans_df["int_rate"].std()

# Estimated variance of each group's sample mean (s^2 / sample size).
var_of_mean_d = s_d**2 / n
var_of_mean_n = s_n**2 / m

standard_error = math.sqrt(
    var_of_mean_d + var_of_mean_n
)  # using welch's expression assuming unequal variances

# Welch-Satterthwaite degrees of freedom. This pairs with the unequal-variance
# standard error above and with the equal_var=False t-test below; for samples
# this large the critical value is practically the same as with n + m - 2.
welch_df = (var_of_mean_d + var_of_mean_n) ** 2 / (
    var_of_mean_d**2 / (n - 1) + var_of_mean_n**2 / (m - 1)
)

t_crit = t.ppf(1 - alpha / 2, df=welch_df)
me = t_crit * standard_error

lower = (defaulted_xbar - non_defaulted_xbar) - me
upper = (defaulted_xbar - non_defaulted_xbar) + me

print(f"95% CI: [{round(lower, 3)}, {round(upper, 3)}]")

# Cross-check with scipy's Welch t-test on the same two samples. It reports the
# test statistic and p-value for H0: equal means rather than an interval.
t_stat, p_val = stats.ttest_ind(
    defaulted_loans_df["int_rate"], non_defaulted_loans_df["int_rate"], equal_var=False
)
print(t_stat, p_val)

95% CI: [3.105, 3.243]
90.28998542764488 0.0


### 95% Confidence Interval for the Difference Between the Default Probabilities of Borrowers With Zero vs. At Least One Delinquency
