In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

os.chdir('Q:\\')
df = pd.read_csv('https://raw.githubusercontent.com/mkroberson0208/jupyter-test/main/synthetic_data.csv')
df = df.sort_values('Default_ind')
df.reset_index(inplace=True,drop=True)
df.index = df.index + 1
print(df)

    Face_amt  Default_ind   x1  x2
1         50            0  0.8   8
2         50            0  1.0   5
3         50            0  6.0   6
4         50            0  3.5   1
5         50            0  2.0   2
6         50            0  0.5   1
7         50            0  4.0   2
8         50            0  3.0   3
9         50            0  2.5   4
10       200            1  5.0   6
11        50            1  8.0   2


In [12]:
# Default rate = event incidence (0/1 indicator)
# Loss or charge-off rate = balance lost (0/1 multiplied by $ balance)
d = {}
d['Default Rate'] = df['Default_ind'].sum()/df['Default_ind'].count()
d['Charge-off Rate'] = (df['Face_amt']*df['Default_ind']).sum()/df['Face_amt'].sum()
print(d)

{'Default Rate': 0.18181818181818182, 'Charge-off Rate': 0.35714285714285715}


In [16]:
# weight = % of portfolio balance per row
# Similar to portfolio weights in caluclating risk/return for investments
df['weight'] = df['Face_amt']/df['Face_amt'].sum()

# Logistic model 
#   - Estimated on default event indicator to output default rate

iv = ['x1']
x = np.asarray(df[iv])
y = np.asarray(df['Default_ind'])
model = sm.GLM(endog=y, exog=sm.add_constant(x), family=sm.families.Binomial())
result = model.fit()
print(result.summary())
df['y_hat'] = result.predict(sm.add_constant(x))
print('Predicted charge-off rate: ', (df['Face_amt']*df['y_hat']).sum()/df['Face_amt'].sum())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                   11
Model:                            GLM   Df Residuals:                        9
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2.4147
Date:                Thu, 16 Sep 2021   Deviance:                       4.8294
Time:                        08:24:40   Pearson chi2:                     4.21
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.9947      4.397     -1.591      0.1

In [17]:
# Balance-weighted logistic model
#   - Estimated on default event indicator re-weighted by % balance to output loss rate
iv = ['x1']
x = np.asarray(df[iv])
y = np.asarray(df['Default_ind'])
model = sm.GLM(endog=y, exog=sm.add_constant(x), family=sm.families.Binomial(),freq_weights=np.asarray(df['weight']))
result = model.fit()
print(result.summary())
df['y_hat'] = result.predict(sm.add_constant(x))
print('Predicted charge-off rate: ', (df['Face_amt']*df['y_hat']).sum()/df['Face_amt'].sum())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                   11
Model:                            GLM   Df Residuals:                       -1
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:               -0.33182
Date:                Thu, 16 Sep 2021   Deviance:                      0.66364
Time:                        09:10:31   Pearson chi2:                    0.772
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.9411     15.563     -0.446      0.6

In [None]:

# Sklearn and statmodels both have options  
sk_model = LogisticRegression(penalty='none')
sk_model.fit(x,y)
print(sk_model.coef_,sk_model.intercept_)
sm_model = sm.GLM(endog=y, exog=sm.add_constant(x), family=sm.families.Binomial())
result = sm_model.fit()
print(result.summary())



sk_model = LogisticRegression(penalty='none')
sk_model.fit(x,y,sample_weight=df['weight'])
print(sk_model.coef_,sk_model.intercept_)
sm_model = sm.GLM(endog=y, exog=sm.add_constant(x), family=sm.families.Binomial(),freq_weights=np.asarray(df['weight']))
result = sm_model.fit()
print(result.summary())

# Sample weights are applied as geometric / ratio to one another
# so the following weights (sum replaced by average balance and multiply by N)
# produce the same relative weighting and results
df['weight'] = 100*(df['Face_amt']/df['Face_amt'].mean())