# Confidence Intervals for Various Quantities

### Section 7.2


In [7]:
import pandas as pd
import numpy as np
from main import preprocess_df
from collections import defaultdict as dd


# Display setting first: never truncate the column list when a frame is shown.
pd.set_option("display.max_columns", None)

# Read options are kept in one place so the row cap is easy to find and change.
csv_path = "./datasets/lc_data_2007_to_2018.csv"
read_options = {
    "low_memory": False,
    "encoding": "latin1",
    "nrows": 100000,  # only looking at 100k rows right now for performance
}

# Raw rows as read from disk, then the frame produced by the project's
# `preprocess_df` (imported from `main`); the analysis cells use `cleaned_df`.
my_df = pd.read_csv(csv_path, **read_options)
cleaned_df = preprocess_df(my_df)

### 95% Confidence Interval for Default Rate of Borrowers With Annual Income Over $100,000


In [2]:
from scipy.stats import t
import math

# Two-sided significance level; 1 - alpha is the 95% confidence level.
alpha = 0.05

# Keep only the rows whose annual income exceeds 100,000.
earns_over_100k = cleaned_df["annual_inc"] > 100000
high_inc_df = cleaned_df.loc[earns_over_100k, :]
default_flags = high_inc_df["did_default"]

# Sample size, sample mean and sample standard deviation of the default flag.
n = len(high_inc_df)
xbar = default_flags.mean()
s = default_flags.std()

# One-sample t interval: xbar +/- t(1 - alpha/2, n - 1) * s / sqrt(n)
ci1_degrees_of_freedom = n - 1
critical_t_val = t.ppf(1 - alpha / 2, df=ci1_degrees_of_freedom)
standard_error = s / math.sqrt(n)
margin_of_error = critical_t_val * standard_error

lower, upper = xbar - margin_of_error, xbar + margin_of_error

print(f"95% CI: [{round(lower, 3)}, {round(upper, 3)}]")

95% CI: [0.148, 0.159]


#### [0.148, 0.159]

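This means that we are 95% confident that the true default rate of borrowers with annual income over $100,000 lies within [0.148, 0.159]. More precisely, if we repeated the sampling many times and built an interval from each sample in the same way, about 95% of those intervals would contain the true default rate.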


### 95% Confidence Interval for the Difference Between the Mean Interest Rates of Defaulted and Non-Defaulted Loans


In [None]:
import scipy.stats as stats

# Split the loans by outcome.
# NOTE(review): boolean-mask indexing and `~` assume `did_default` is a boolean
# column - confirm against preprocess_df.
defaulted_loans_df = cleaned_df.loc[cleaned_df["did_default"], :]
non_defaulted_loans_df = cleaned_df.loc[~cleaned_df["did_default"], :]

defaulted_xbar = defaulted_loans_df["int_rate"].mean()
non_defaulted_xbar = non_defaulted_loans_df["int_rate"].mean()
xbar_diff = defaulted_xbar - non_defaulted_xbar

n = len(defaulted_loans_df)
m = len(non_defaulted_loans_df)

s_d = defaulted_loans_df["int_rate"].std()
s_n = non_defaulted_loans_df["int_rate"].std()

# Welch's standard error: each group contributes its own variance of the mean,
# so no equal-variance assumption is made.
var_of_mean_d = s_d**2 / n
var_of_mean_n = s_n**2 / m
standard_error = math.sqrt(var_of_mean_d + var_of_mean_n)

# Welch-Satterthwaite degrees of freedom, the df that belongs with the Welch
# standard error. The pooled value n + m - 2 is only its upper bound, so it is
# a safe stand-in only while BOTH groups are large; computing the real value
# keeps the interval correct if the sample is cut down or one group is small.
# Degenerate samples (a group of size <= 1, or no spread at all) fall back to
# n + m - 2 instead of dividing by zero.
if n > 1 and m > 1 and standard_error > 0:
    welch_df = (var_of_mean_d + var_of_mean_n) ** 2 / (
        var_of_mean_d**2 / (n - 1) + var_of_mean_n**2 / (m - 1)
    )
else:
    welch_df = n + m - 2

t_crit = t.ppf(1 - alpha / 2, df=welch_df)
me = t_crit * standard_error

lower = xbar_diff - me
upper = xbar_diff + me

print(f"95% CI: [{round(lower, 3)}, {round(upper, 3)}]")

# the faster but less informative method: scipy builds the same interval from
# the confidence level, df, centre (loc) and standard error (scale).
lower, upper = stats.t.interval(
    confidence=1 - alpha,
    df=welch_df,
    loc=xbar_diff,
    scale=standard_error,
)

# float() works whether scipy hands back numpy scalars or plain Python floats.
print(f"scipy 95% CI: [{round(float(lower), 3)}, {round(float(upper), 3)}]")

95% CI: [3.105, 3.243]
scipy 95% CI: [3.105, 3.243]


#### [3.105, 3.243]

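This means that we are 95% confident that the true difference in mean interest rate between defaulted and non-defaulted loans lies within [3.105, 3.243]. More precisely, if we repeated the sampling many times and built an interval from each sample in the same way, about 95% of those intervals would contain the true difference. And more specifically, because the whole interval lies above zero, the mean interest rate of defaulted loans is estimated to be higher than that of non-defaulted loans by a value which lies in [3.105, 3.243].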


### 95% Confidence Interval for the Difference Between the Default Probabilities of Borrowers with Zero Vs At Least One Delinquency


In [None]:
# Build the two groups from explicit comparisons rather than `astype(bool)`:
# casting to bool turns NaN (and any other non-zero value) into True, which
# would count a missing delinquency count as "at least one delinquency".
# Comparisons against NaN are False, so rows with a missing count land in
# neither group.
# NOTE(review): whether `delinq_2yrs` can still be missing after preprocess_df
# is not visible from this cell - confirm.
delinq_counts = cleaned_df["delinq_2yrs"]
is_delinquent = delinq_counts > 0
is_not_delinquent = delinq_counts == 0

delinquent_borrowers = cleaned_df.loc[is_delinquent, :]
non_delinquent_borrowers = cleaned_df.loc[is_not_delinquent, :]

# The mean of the default flag within a group is that group's default rate.
delinquent_xbar = delinquent_borrowers["did_default"].mean()
non_delinquent_xbar = non_delinquent_borrowers["did_default"].mean()

xbars_diff = delinquent_xbar - non_delinquent_xbar

n = len(delinquent_borrowers)
m = len(non_delinquent_borrowers)

del_s = delinquent_borrowers["did_default"].std()
non_del_s = non_delinquent_borrowers["did_default"].std()

# Welch's standard error: each group contributes its own variance of the mean.
del_var_of_mean = del_s**2 / n
non_del_var_of_mean = non_del_s**2 / m
standard_error = math.sqrt(del_var_of_mean + non_del_var_of_mean)

# Welch-Satterthwaite degrees of freedom, the df that belongs with the Welch
# standard error. The pooled value n + m - 2 is only its upper bound and is a
# safe stand-in only while BOTH groups are large. Degenerate samples (a group
# of size <= 1, or no spread at all) fall back to n + m - 2 instead of
# dividing by zero.
if n > 1 and m > 1 and standard_error > 0:
    welch_df = (del_var_of_mean + non_del_var_of_mean) ** 2 / (
        del_var_of_mean**2 / (n - 1) + non_del_var_of_mean**2 / (m - 1)
    )
else:
    welch_df = n + m - 2

t_crit = t.ppf(1 - alpha / 2, df=welch_df)

me = t_crit * standard_error

lower = xbars_diff - me
upper = xbars_diff + me

print(f"95% CI: [{round(lower, 3)}, {round(upper, 3)}]")

18292 69599
95% CI: [0.01, 0.024]


#### [0.010, 0.024]

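This means that we are 95% confident that the true difference in default rate between people with at least one delinquency and people with zero delinquencies lies within [0.010, 0.024]. More precisely, if we repeated the sampling many times and built an interval from each sample in the same way, about 95% of those intervals would contain the true difference. And more specifically, because the whole interval lies above zero, the default rate of people with at least one delinquency is estimated to be higher than that of those with zero delinquencies by a value within [0.010, 0.024].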
