In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
from scipy.special import expit

from sklearn.preprocessing import PolynomialFeatures

# Data-generating process (DGP)

In [2]:
def sample_sparse_coefs(nx, p=0.8):
    """Draw ``nx`` coefficients, each zeroed out with probability ``1 - p``.

    Parameters
    ----------
    nx : int
        Number of coefficients to draw.
    p : float, default 0.8
        Probability that a given coefficient is kept (Bernoulli success rate).

    Returns
    -------
    numpy.ndarray
        Length-``nx`` array: Uniform(-5, 5) draws multiplied element-wise by a
        0/1 Bernoulli(p) mask.
    """
    # Draw order (uniform first, then the mask) is part of the seeded behavior.
    magnitudes = np.random.uniform(-5, 5, nx)
    keep_mask = np.random.binomial(1, p, nx)
    return magnitudes * keep_mask
    
    
def sample_from(dist, mu, var, n):
    """Draw ``n`` samples from a named distribution parameterised by ``mu`` and ``var``.

    Parameters
    ----------
    dist : str
        One of ``"gamma"``, ``"normal"`` or ``"uniform"``.
    mu : float
        Location parameter. For ``"normal"`` and ``"uniform"`` this is the mean.
        For ``"gamma"`` the scale is ``abs(var / mu)``, so the resulting mean is
        ``abs(mu)``; ``mu`` is used as a divisor there and must be non-zero.
    var : float
        Target variance of the distribution.
    n : int
        Number of samples.

    Returns
    -------
    numpy.ndarray
        Array of ``n`` samples.

    Raises
    ------
    ValueError
        If ``dist`` is not a supported distribution name. (Previously the
        function fell through and returned ``None``, which only failed later
        in the caller.)
    """
    if dist == "gamma":
        # shape = mu^2 / var and scale = |var / mu| give variance `var`.
        return np.random.gamma(mu**2/var, np.abs(var/mu), n)

    if dist == "normal":
        return np.random.normal(mu, np.sqrt(var), n)

    if dist == "uniform":
        # A uniform on [mu - h, mu + h] has variance h^2 / 3, hence h = sqrt(3 * var).
        a = mu-np.sqrt(3 * var)
        return np.random.uniform(a, 2*mu - a, n)

    raise ValueError(
        f"Unknown distribution {dist!r}; expected 'gamma', 'normal' or 'uniform'."
    )


def risk_fn(risk_latent, bias=0, temp=0.1):
    """Map a latent risk score to a probability with a logistic sigmoid.

    Parameters
    ----------
    risk_latent : float or numpy.ndarray
        Latent (unbounded) risk score(s).
    bias : float, default 0
        Additive offset applied after scaling.
    temp : float, default 0.1
        Multiplicative factor applied to ``risk_latent`` before the sigmoid.

    Returns
    -------
    float or numpy.ndarray
        ``expit(risk_latent * temp + bias)``, i.e. values in [0, 1].
    """
    logit = risk_latent*temp + bias
    return expit(logit)


def default_fn(risk):
    """Draw a 0/1 outcome for each probability in ``risk``.

    Parameters
    ----------
    risk : float or numpy.ndarray
        Success probability (or array of probabilities) of the Bernoulli draw.

    Returns
    -------
    int or numpy.ndarray
        One single-trial binomial draw per entry of ``risk``.
    """
    outcome = np.random.binomial(n=1, p=risk)
    return outcome

    
class LendingWorld(object):
    """Synthetic generator of loan records (features, price, default flag).

    On construction the world draws, for each of ``nx`` raw features, a mean,
    a variance and a distribution family, plus two sparse coefficient vectors
    over the polynomial expansion of those features. ``sample_loans`` then
    simulates borrowers whose default probability depends on the expanded
    features and on the offered price.

    Note: the constructor calls ``np.random.seed(seed)``, so it reseeds
    NumPy's *global* random state.
    """

    def __init__(self, nx=10, poly=2, bias=1, temp=0.1, seed=123):
        """Fix the random structure of the world.

        Parameters
        ----------
        nx : int
            Number of raw features.
        poly : int
            Degree passed to ``PolynomialFeatures``.
        bias : float
            Offset forwarded to ``risk_fn``.
        temp : float
            Scaling forwarded to ``risk_fn``.
        seed : int
            Seed for NumPy's global generator.
        """
        np.random.seed(seed)

        self.NX = nx
        self.POLY = poly
        self.POLY_GEN = PolynomialFeatures(self.POLY, include_bias=False)

        # Per-feature distribution parameters. The order of these draws is part
        # of the seeded behavior and must not be rearranged.
        self.MUS = np.random.uniform(-2, 2, self.NX)
        self.VARS = np.random.exponential(1, self.NX)
        self.DISTS = np.random.choice(["gamma", "normal", "uniform"], self.NX)

        # Expand a single probe row only to learn the width of the expansion.
        probe_row = self.MUS.reshape(1, -1)
        self.NX_FINAL = self.POLY_GEN.fit_transform(probe_row).shape[1]

        # One coefficient per expanded feature, for the baseline risk and for
        # the price sensitivity respectively.
        self.RISK_COEFS = sample_sparse_coefs(self.NX_FINAL)
        self.RISK_SENS_COEFS = sample_sparse_coefs(self.NX_FINAL)

        self.temp = temp
        self.bias = bias

        self.features = [f"col_{i+1}" for i in range(self.NX)]

    def sample_features(self, n):
        """Sample ``n`` rows of raw features and their polynomial expansion.

        Returns
        -------
        tuple
            ``(X, X_POLY)``: the raw feature matrix with one column per raw
            feature, and the output of ``PolynomialFeatures.fit_transform`` on it.
        """
        columns = []
        for dist, mu, var in zip(self.DISTS, self.MUS, self.VARS):
            draws = sample_from(dist, mu, var, n)
            columns.append(draws.reshape(-1, 1))
        raw = np.hstack(columns)
        expanded = self.POLY_GEN.fit_transform(raw)
        return raw, expanded

    def sample_risk_sens_fn(self, x):
        """Sample a per-row price sensitivity from expanded features ``x``.

        A unit-variance normal is drawn around ``x . RISK_SENS_COEFS`` and used
        as the exponent of 1.02, which keeps the sensitivity strictly positive.
        """
        exponent_mean = x.dot(self.RISK_SENS_COEFS)
        exponent = np.random.normal(exponent_mean)
        return 1.02**exponent

    def risk_latent_fn(self, x, price, risk_sens):
        """Latent risk: linear term in ``x`` plus ``risk_sens * price``."""
        baseline = x.dot(self.RISK_COEFS)
        price_effect = risk_sens*price
        return baseline + price_effect

    def sample_loans(self, price):
        """Simulate one loan per entry of ``price``.

        Parameters
        ----------
        price : array-like
            Price offered to each simulated borrower; its length sets the
            number of rows.

        Returns
        -------
        pandas.DataFrame
            The raw feature columns (``self.features``) plus ``price`` and the
            sampled 0/1 ``default`` column.
        """
        n_loans = len(price)

        features_raw, features_poly = self.sample_features(n_loans)

        sensitivity = self.sample_risk_sens_fn(features_poly)
        latent = self.risk_latent_fn(features_poly, price, sensitivity)
        default_prob = risk_fn(latent, temp=self.temp, bias=self.bias)

        defaulted = default_fn(default_prob)

        loans = pd.DataFrame(features_raw, columns=self.features)
        return loans.assign(price=price, default=defaulted)

        
        

# Build the world; LendingWorld.__init__ seeds NumPy's global RNG with `seed`,
# so the price draw below is reproducible as well.
world = LendingWorld(seed=123)

# Simulate 100 loans at prices drawn uniformly from [1, 10).
world.sample_loans(price=np.random.uniform(low=1, high=10, size=100))

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,price,default
0,1.776821,-1.798182,1.088089,-0.272398,0.507269,3.059987e-01,2.469058,0.372393,-0.698128,-0.304407,9.739415,0
1,0.536216,-1.148946,2.550642,0.167954,0.071114,7.514399e-06,2.591652,1.006679,-0.341900,-1.203612,8.845146,0
2,0.476573,-2.055554,1.038969,-0.543870,0.789111,1.709208e-03,1.580569,1.304355,0.543337,0.330968,7.391455,0
3,0.300209,-0.206512,1.121739,0.052226,0.466705,4.584388e-02,2.897849,1.428070,-0.080443,-1.810499,9.626588,1
4,0.250499,-0.269244,1.709581,-0.300204,0.058138,1.048450e-01,1.783522,0.562187,-0.423615,-0.560499,4.868320,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2.586428,-2.752909,0.835800,-0.061169,0.766349,1.186320e-15,1.692913,1.083594,1.662002,0.402438,9.583274,0
96,1.436564,-0.182737,1.778212,0.161746,1.731232,1.695310e-10,1.555653,0.810256,0.241925,0.053061,6.920336,1
97,1.404595,0.668027,0.485463,0.038237,1.466304,4.967896e-02,1.555503,1.037499,-0.324816,0.383174,7.955900,1
98,3.005673,2.069762,0.289005,0.211481,0.821055,8.201614e-04,1.661328,0.420324,-0.305403,-1.229489,7.195369,1
