# Description
The objective of this analysis is to understand the benefits of the use of survival analysis for the modelling of credit risk lifetime default curves.
<br> The standard approach used in the industry is to:
1. Train a classification model in a single target, usually default in 12 months
2. Segment the portfolio in homogeneous risk groups.
3. Calibrate the PD in 12 months over each group.
4. calibrate the lifetime pd curve over each group.

<br> The approach described above breaks down each modelling component in different challenges, the scorecard is only worried with discrimination of the population between clients with low and high risk of defaulting while the calibration of the risk curves deal with both level and shape of the risk curves. This allows for a modular approach, although it implements additional steps in the modelling.
<br> An alternative approach would be using survival analysis methods which allows the modelling of the full lifetime curve in one single model.

This analysis aims to explore the use of survival models to calibrate pd lifetime risk curves

This notebook will create a syntetic set of data upon which the analysis will be done

# TO DO
1. Include default rate plot inside class
2. Check if it is possible to do the default rate by groups inside class
3. check impact of changing variables
4. Include WO rate (LGD)
5. clean notebook

# Setup

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import scipy
from IPython.display import Image

In [2]:
pd.set_option('display.max_rows', 110)
output_path = "data/"

# Data Creation

In [3]:
def fit_power(x, shape, scale):
  return (scale * x ** (-shape))

def fit_exp(x, shape, scale):
  result = scale * np.exp(np.multiply(-shape, x))
  return result

def fit_weibull(x, shape, scale, d):
  result = (shape/scale)*np.power(((x+d)/scale), (shape-1)) * np.exp(-1 * np.power(((x+d)/scale), shape))
  return result

def fit_gamma(x, shape, scale, d=0):
  from scipy.special import gamma
  result = ((1 / (gamma(shape) * (scale ** shape))) * ((x+d) ** (shape - 1)) * np.exp(np.multiply(-1, (x+d)) / scale))
  return result

def logNormFunc(x, a, b, d):
  result = (1 / ((x+d) * b * np.power(2* np.pi, 1/2))) * np.exp(-1*(np.power(np.log(x+d) - a, 2) / (2*np.power(b, 2))))
  return result

def gamma_pdf(x, a, b):
  from scipy.special import gamma
  return (np.power(x, (a - 1)) * np.exp(np.divide(np.multiply(-1, x), b))) / (np.power(b, a) * gamma(a))

def exp_decay(x, a, b):
  return a * np.exp(np.multiply(-b, x))

def beta_pdf(x, a, b):
  from scipy.special import beta
  return np.divide((np.divide(x, np.power(60, (a - 1))) * np.power((1 - np.divide(x, 60)), (b - 1))), beta(a, b))

In [None]:
func_parameters =  dict(
    a = 1.5,
    b = 0.6,
    d = 0
)

risk_scores = {
    10 : 0.01,
     9 : 0.02,
     8 : 0.05,
     7 : 0.10,
     6 : 0.15,
     5 : 0.20,
     4 : 0.25,
     3 : 0.50,
     2 : 0.70,
     1 : 0.90
}

roll_rates = {
    30  :  0.75  , ## from 1-29    to 30-59
    60  :  0.90  , ## from 30-59   to 60-89
    90  :  0.99  , ## from 60-89   to 90-119
    120 :  0.99  , ## from 90-119  to 120-139
    150 :  0.999 , ## from 120-139 to 150-179
    180 :  0.999 , ## from 150-179 to 180-209
    210 :  0.999 , ## from 180-209 to 210-239
    240 :  0.999 , ## from 210-239 to 240-269
    270 :  0.999 , ## from 240-269 to 270-299
    300 :  0.999 , ## from 270-299 to 300-329
    330 :  0.999 , ## from 300-329 to 330-359
    360 :  0.9999, ## from 330-359 to 360-389
    390 :  1.0000, ## from 360-389 to 390-419
}



lifetime_target = 99

def generate_pd_curve(func, 
                      parameters: dict,
                      n_default_months: int,
                      cumulative_pd: float,
                      roll_rates: dict):
    df = pd.DataFrame(range(0, n_default_months+1), columns=["default_month"])
    # df['date'] = pd.date_range(start='2025-01-31', periods=n_default_months+1, freq='ME')
    df['default_month'] = df['default_month']
    df['p'] = func(df['default_month'], **parameters) 
    df['p'] = df['p'].fillna(0)
    df['p'] = (df['p'] / df['p'].sum()) * cumulative_pd
    df['cum_p'] = df['p'].cumsum() 
    multiplication_roll_rates = 1
    for days_late_i, roll_rate_i in roll_rates.items():
        if days_late_i <= 90:
            multiplication_roll_rates = multiplication_roll_rates * roll_rate_i
    df['pd_lifetime'] = (df['p'].sum() - df['cum_p']) * multiplication_roll_rates
    
    # Calculate forward-looking 12-month sum using rolling
    df['p_12m_forward'] = (df[::-1]['p'].rolling(window=12, min_periods=1, closed='left').sum().iloc[::-1]) * multiplication_roll_rates
    return df

def generate_pd_curve_by_risk_score(func,
                                    risk_scores: dict,
                                    parameters: dict,
                                    n_default_months: int,
                                    roll_rates: dict):
    result_dfs = list()
    for risk_band_i, lifetime_pd_i in risk_scores.items():
        parameters['a'] = risk_band_i / 2
        df_temp = generate_pd_curve(func, parameters, n_default_months, lifetime_pd_i, roll_rates)
        df_temp['risk_band'] = risk_band_i
        result_dfs.append(df_temp)
    result = pd.concat(result_dfs)
    return result



data = generate_pd_curve_by_risk_score(logNormFunc, risk_scores, func_parameters, lifetime_target, roll_rates)

data_chart = data.melt(id_vars=['default_month', 'risk_band'])

fig = px.line(
    data_chart,
    x='default_month',
    y='value',
    facet_col='variable',
    color='risk_band',
    template = 'none',
    width=1000
)
fig.update_yaxes(matches=None, showticklabels=True)
# Image(fig.to_image("png"))
fig

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
data_chart = data.melt(id_vars=['default_month', 'risk_band'])

fig = px.line(
    data_chart,
    x='default_month',
    y='value',
    facet_col='risk_band',
    facet_col_wrap=4,
    color='variable',
    template = 'none',
    width=1000
)
fig.update_yaxes(matches=None, showticklabels=True)
# Image(fig.to_image("png"))
fig

In [678]:
func_parameters =  dict(
    a = 1.5,
    b = 0.6,
    d = 0
)

risk_scores = {
    10 : 0.01,
     9 : 0.02,
     8 : 0.05,
     7 : 0.10,
     6 : 0.15,
     5 : 0.20,
     4 : 0.25,
     3 : 0.50,
     2 : 0.70,
     1 : 0.90
}

roll_rates = {
    30  :  0.75  , ## from 1-29    to 30-59
    60  :  0.90  , ## from 30-59   to 60-89
    90  :  0.99  , ## from 60-89   to 90-119
    120 :  0.99  , ## from 90-119  to 120-139
    150 :  0.999 , ## from 120-139 to 150-179
    180 :  0.999 , ## from 150-179 to 180-209
    210 :  0.999 , ## from 180-209 to 210-239
    240 :  0.999 , ## from 210-239 to 240-269
    270 :  0.999 , ## from 240-269 to 270-299
    300 :  0.999 , ## from 270-299 to 300-329
    330 :  0.999 , ## from 300-329 to 330-359
    360 :  0.9999, ## from 330-359 to 360-389
    390 :  1.0000, ## from 360-389 to 390-419
}



lifetime_target = 60
start_date = "2025-01-31"
n_months = 36
n_sample = 10000
risk_scores_dist = {
    10 : 0.1  , 
     9 : 0.15 , 
     8 : 0.2  ,  
     7 : 0.15 ,
     6 : 0.1  ,
     5 : 0.1  ,
     4 : 0.07 ,
     3 : 0.055, 
     2 : 0.05 ,
     1 : 0.025,
}


risk_scores_limits_dist = {
    10 : [15000, 30000],
     9 : [10000, 15000],
     8 : [7000 , 10000],
     7 : [5000 , 7000 ],
     6 : [3000 , 5000 ],
     5 : [1000 , 3000 ],
     4 : [700  , 2000 ],
     3 : [700  , 1000 ],
     2 : [500  , 700  ],
     1 : [100  , 500  ],
}

class generate_sample:
    def __init__(self,
                 func,
                 func_parameters: dict,
                 risk_scores: dict,
                 risk_scores_dist: dict,
                 risk_scores_limits_dist: dict,
                 roll_rates: dict,
                 lifetime_target: int,
                 start_date: str = "2025-01-31",
                 n_months: int = 24,
                 n_sample: int = 10000,
                 wo_target: int = 360,
                 ccf: float = 0.5
                 ):
        self.func = func
        self.func_parameters = func_parameters
        self.risk_scores = risk_scores
        self.risk_scores_dist = risk_scores_dist
        self.roll_rates = roll_rates
        self.lifetime_target = lifetime_target
        self.start_date = start_date
        self.n_months = n_months
        self.n_sample = n_sample
        self.risk_scores_limits_dist = risk_scores_limits_dist
        self.wo_target = wo_target
        self.ccf = ccf
        self.risk_curves = None
    
    def generate_pd_curve(self,
                          func, 
                          parameters: dict,
                          n_default_months: int,
                          cumulative_pd: float,
                          roll_rates: dict):
        """
        Generates a probability distribution curve for default lifetime risk.
        
        Parameters:
            func (callable): Function to calculate PD for given risk-band
            parameters (dict): Dictionary of parameters specific for the function.
            n_default_months (int): Number of months to simulate default progression
            cumulative_pd (float): Cumulative probability of default for the entire curve
            roll_rates (dict): Roll rates between credit states
            
        Returns:
            pd.DataFrame: DataFrame containing PD curves with additional columns
        """
        
        df = pd.DataFrame(range(0, n_default_months+1), columns=["default_month"])
        df['p'] = func(df['default_month'], **parameters) 
        df['p'] = df['p'].fillna(0)
        df['p'] = (df['p'] / df['p'].sum()) * cumulative_pd
        df['cum_p'] = df['p'].cumsum() 
        
        multiplication_roll_rates = 1
        for days_late_i, roll_rate_i in roll_rates.items():
            if days_late_i <= 90:
                multiplication_roll_rates = multiplication_roll_rates * roll_rate_i
                
        df['pd_lifetime'] = (df['p'].sum() - df['cum_p']) * multiplication_roll_rates
        
        # Calculate forward-looking 12-month sum using rolling
        df['pd_12m'] = (df[::-1]['p'].rolling(window=12, min_periods=1, closed='left').sum().iloc[::-1]) * multiplication_roll_rates
        
        return df

    def generate_pd_curve_by_risk_score(self):
        result_dfs = list()
        for risk_band_i, lifetime_pd_i in self.risk_scores.items():
            self.func_parameters['a'] = risk_band_i / 2
            df_temp = self.generate_pd_curve(self.func, 
                                        self.func_parameters, 
                                        self.lifetime_target, 
                                        lifetime_pd_i, 
                                        self.roll_rates)
            df_temp['risk_band'] = risk_band_i
            result_dfs.append(df_temp)
        result = pd.concat(result_dfs)
        self.risk_curves = result
        return self.risk_curves
        
    def generate_origination_sample(self,
                                    start_date: str,
                                    n_months: int, 
                                    n_sample: int,
                                    risk_scores_dist: dict,):
        id_list = range(1, n_sample+1)
        df_sample = pd.DataFrame(id_list, columns=['id'])
        df_sample['create_date_aux'] = np.random.randint(0,n_months, size=n_sample)
        df_sample['created_date'] = pd.to_datetime(pd.to_datetime(start_date) + df_sample['create_date_aux'].apply(lambda x : pd.offsets.MonthEnd(x)))
        df_sample['risk_band'] = np.random.choice(list(risk_scores_dist.keys()), size=n_sample, p=list(risk_scores_dist.values()))
        df_sample = df_sample.drop("create_date_aux", axis="columns")
        return df_sample

    def withRiskCurves(self, df: pd.DataFrame, risk_curves: pd.DataFrame):
        result= df.merge(risk_curves, on=['risk_band'], how='left')
        result['date'] = pd.to_datetime(result['created_date'] + result['default_month'].apply(lambda x: pd.offsets.MonthEnd(int(x))))
        result = result.rename({'default_month':'tenure'}, axis='columns')
        return result

    def withDefaultsFromRiskCurves(self, df: pd.DataFrame, roll_rates: dict):
        result = df.copy()
        result = result.sort_values(['id', 'date'])
        result['is_late'] = np.random.binomial(1, p=result['p'])
        result['late_start_date'] = pd.to_datetime(np.where(result['is_late'] == 1, result['date'], None))
        result['late_start_date'] = result.groupby('id')['late_start_date'].transform('min')
        result['days_late'] = np.where(result['late_start_date'] == result['date'], 1, None)
        for days_late_i, roll_rate_i in roll_rates.items():
            result['days_late_last_month'] = result.groupby('id')['days_late'].transform('shift', 1)
            result['prob_rr'] = roll_rate_i
            result['rgn'] = np.random.binomial(1, p=result['prob_rr'], size=result.shape[0])
            result['days_late'] = np.where(result['days_late_last_month']==np.max([1, days_late_i-30]), 
                                        result['rgn'] *days_late_i, 
                                        result['days_late'])
            if days_late_i != list(roll_rates.keys())[-1]:
                result = result.drop(['days_late_last_month', 'rgn', 'prob_rr'], axis='columns')
        result['days_late'] = result.groupby('id')['days_late'].transform('ffill')
        result['days_late'] = result['days_late'].fillna(0)
        result['is_default'] = np.where(result['days_late'] == 90, 1, 0)
        result['n_late_status'] = result.groupby('id')['is_late'].transform('sum')
        return result
    
    def withMaxCreditLimit(self, df: pd.DataFrame, risk_scores_limits_dist: dict):
        result = df.copy()
        result['credit_card_max_limit'] = result.groupby('id')['risk_band']\
                                                .transform(lambda x: 
                                                    np.repeat(
                                                        np.random.randint(
                                                            risk_scores_limits_dist[x.max()][0] /100,
                                                            risk_scores_limits_dist[x.max()][1] /100
                                                            ) * 100
                                                        ,x.shape))
        return result
    
    def withBalance(self, df: pd.DataFrame):
        def _expit(x):
            return 1 / (1+np.exp(-1 * x))

        def min_max_normalization(X, min_: float = None, max_: float = None):
            if min_ is None:
                min_ = X.min(axis=0)
            if max_ is None:
                max_ = X.max(axis=0)
            X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
            X_scaled = X_std * (max_ - min_) + min_
            return X_scaled
        
        result = df.copy()
        
        result['utilisation'] = np.where(result['default_flag']==1, 
                                         np.random.normal(0.01, 0.5, size=result.shape[0]),
                                         np.random.normal(0.00, 0.5, size=result.shape[0]))
        result['utilisation'] = result.groupby('id')['utilisation'].transform('cumsum')
        result['utilisation'] = result.groupby('id')['utilisation'].transform(min_max_normalization, -4, 5)
        result['utilisation'] = _expit(result['utilisation'])
        result['balance'] = result['utilisation'] * result['credit_card_max_limit']
        result['balance'] = np.where(result['days_late']>30, np.nan, result['balance'])
        result['balance'] = result.groupby('id')['balance'].transform('ffill')
        result['utilisation'] = result['balance'] / result['credit_card_max_limit']
        return result


    def withDefaultFlag(self, df: pd.DataFrame):
        result = df.copy()
        result['default_date_aux'] = pd.to_datetime(np.where(result['is_default']==1, result['date'], None))
        result = result.sort_values(['id', 'date'])
        result['default_date'] = result.groupby('id')['default_date_aux'].transform('bfill')
        result = result.drop('default_date_aux', axis='columns')
        result['default_month'] = pd.to_numeric(np.where(result['default_date'].isnull(), None,
                                        (result['default_date'].dt.to_period('M').astype(int) 
                                            - result['date'].dt.to_period('M').astype(int))))
        result['default_flag'] = np.where(result['default_month'].isnull(),0,1)
        return result
    
    def withDefaultBalance(self, df: pd.DataFrame):
        result = df.copy()
        result['default_balance'] = np.where(result['date'] == result['default_date'], result['balance'], np.nan)
        result['default_balance'] = result['default_balance'].bfill()
        
        result['default_limit'] = np.where(result['date'] == result['default_date'], result['credit_card_max_limit'], np.nan)
        result['default_limit'] = result['default_limit'].bfill()
        return result
    
    def withLGD(self, df: pd.DataFrame, roll_rates: dict, wo_target: int):
        rr_df = pd.DataFrame(roll_rates.values(), index=roll_rates.keys(), columns=['roll_rates']).reset_index(names='days_late')
        rr_df['cum_rr'] = np.where(rr_df['days_late'].between(90, wo_target), rr_df['roll_rates'][::-1].cumprod()[::-1], np.nan)
        rr_df['cum_rr'] = rr_df['cum_rr'].bfill()
        lgd_dict = rr_df.set_index('days_late')[['cum_rr']].to_dict()['cum_rr']

        result = df.copy()
        result['lgd'] = result['days_late'].apply(lambda x: lgd_dict[np.max([90, x])]).fillna(1.0)
        return result
    
    def withEAD(self, df: pd.DataFrame, ccf: float):
        result = df.copy()
        result['ead'] = result['balance'] + ccf * (result['credit_card_max_limit'] - result['balance'])
        return result
    
    def withStages(self, df: pd.DataFrame):
        df['stage'] = np.where(df['days_late'] >= 90, 3,
                               np.where(df['days_late'] >= 30, 2, 1))
        return df
    
    def withECL(self, df: pd.DataFrame):
        result = df.copy()
        result['ecl_12m'] = result['pd_12m'] * result['ead'] * result['lgd']
        result['ecl_lifetime'] = result['pd_lifetime'] * result['ead'] * result['lgd']
        result['ecl'] = np.where(result['stage']==1, result['ecl_12m'], result['ecl_lifetime'])
        result['pd'] = np.where(result['stage']==1, result['pd_12m'], result['pd_lifetime'])
        return result
    
    def filterFutureDates(self, df: pd.DataFrame):
        result = df[df['date'] <= df['created_date'].max(axis='rows')]
        return result

    def filterAlreadyDefaulted(self, df: pd.DataFrame):
        filter_df = (df['default_month']>0) | (df['default_month'].isnull())
        result = df[filter_df]
        return result

    def withCompleteDefaultMonths(self, df: pd.DataFrame, lifetime: int):
        default_months = pd.DataFrame(range(1,lifetime+1), columns=['complete_default_months'])
        id_df = df[['date', 'id']].drop_duplicates()\
                    .merge(default_months, how='cross')
        
        result = id_df.merge(df, on=['date', 'id'], how='outer')
        result['default_flag'] = np.where(result['default_month'] == result['complete_default_months'], 1, 0)
        result = result\
                    .drop('default_month', axis='columns')\
                    .rename({'complete_default_months':'default_month'}, axis='columns')
        result['cum_defaults'] = result.sort_values(['date', 'id']).groupby(['date', 'id'])['default_flag'].transform('cumsum')
        return result
    
    def plot_default_rates(self):
        data_chart = self.df_sample\
            .groupby(['date','default_month'])\
            .agg({
                'id' : 'count',
                'cum_defaults': 'mean',
                'default_flag': 'mean'
            }) \
            .reset_index()\
            .melt(id_vars=['date', 'default_month'])


        fig = px.line(
            data_chart,
            x='default_month',
            y='value',
            facet_col='variable',
            color='date',
            template='none',
            width=1000
        )

        fig.update_yaxes(matches=None, showticklabels=True)

    def plot_risk_curves(self, plot_as_png: bool = False):
        if self.risk_curves is None:
            self.generate_pd_curve_by_risk_score()
        data_chart = data.melt(id_vars=['default_month', 'risk_band'])
        fig = px.line(
            data_chart,
            x='default_month',
            y='value',
            facet_col='variable',
            color='risk_band',
            template = 'none',
            width=1000
        )
        fig.update_yaxes(matches=None, showticklabels=True)

        if plot_as_png:
            return Image(fig.to_image('png'))
        else:
            return fig
        
    def plot_risk_curves2(self, plot_as_png: bool = False):
        if self.risk_curves is None:
            self.generate_pd_curve_by_risk_score()
        data_chart = data.melt(id_vars=['default_month', 'risk_band'])
        fig = px.line(
            data_chart,
            x='default_month',
            y='value',
            facet_col='risk_band',
            facet_col_wrap=4,
            color='variable',
            template = 'none',
            width=1000
        )
        fig.update_yaxes(matches=None, showticklabels=True)

        if plot_as_png:
            return Image(fig.to_image('png'))
        else:
            return fig

    def withSelectedDF(self):
        return self.df_sample[self.selected_cols]
    
    def run(self):
        self.generate_pd_curve_by_risk_score()

        self.df_sample = self.generate_origination_sample(start_date, n_months, n_sample, risk_scores_dist)\
                            .pipe(self.withRiskCurves, self.risk_curves)\
                            .pipe(self.filterFutureDates)\
                            .pipe(self.withDefaultsFromRiskCurves, self.roll_rates)\
                            .pipe(self.withDefaultFlag)\
                            .pipe(self.withMaxCreditLimit, self.risk_scores_limits_dist)\
                            .pipe(self.withBalance)\
                            .pipe(self.withDefaultBalance)\
                            .pipe(self.withLGD, self.roll_rates, self.wo_target)\
                            .pipe(self.withStages)\
                            .pipe(self.withEAD, self.ccf)\
                            .pipe(self.withECL)
        
        return self.df_sample
    
    



In [679]:
sample_generator = \
    generate_sample(func = logNormFunc,
                    func_parameters = func_parameters,
                    risk_scores = risk_scores,
                    risk_scores_dist = risk_scores_dist,
                    risk_scores_limits_dist = risk_scores_limits_dist,
                    roll_rates = roll_rates,
                    lifetime_target = lifetime_target,
                    start_date = start_date,
                    n_months = n_months,
                    n_sample = n_sample)

df_sample = sample_generator.run()
df_sample
                


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


Adding/subtracting object-dtype array to DatetimeArray not vectorized.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Unnamed: 0,id,created_date,risk_band,tenure,p,cum_p,pd_lifetime,pd_12m,date,is_late,...,balance,default_balance,default_limit,lgd,stage,ead,ecl_12m,ecl_lifetime,ecl,pd
0,1,2027-01-31,6,0,0.000000e+00,0.000000e+00,0.100238,0.022245,2027-01-31,0,...,4767.874316,468.124165,500.0,0.973163,1,4783.937158,103.563326,466.660528,103.563326,0.022245
1,1,2027-01-31,6,1,3.843816e-07,3.843816e-07,0.100237,0.026321,2027-02-28,0,...,4753.603485,468.124165,500.0,0.973163,1,4776.801742,122.357142,465.963292,122.357142,0.026321
2,1,2027-01-31,6,2,3.180629e-05,3.219067e-05,0.100216,0.030408,2027-03-31,0,...,4677.875509,468.124165,500.0,0.973163,1,4738.937755,140.236134,462.171748,140.236134,0.030408
3,1,2027-01-31,6,3,2.267964e-04,2.589871e-04,0.100064,0.034339,2027-04-30,0,...,3986.035193,468.124165,500.0,0.973163,1,4393.017597,146.802911,427.787462,146.802911,0.034339
4,1,2027-01-31,6,4,6.928793e-04,9.518664e-04,0.099601,0.037885,2027-05-31,0,...,3595.943363,468.124165,500.0,0.973163,1,4197.971682,154.772651,406.902529,154.772651,0.037885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609962,10000,2025-09-30,9,23,1.715288e-04,9.916054e-04,0.012702,0.002520,2027-08-31,0,...,1843.631079,,,0.973163,1,7621.815539,18.693356,94.216771,18.693356,0.002520
609963,10000,2025-09-30,9,24,1.926720e-04,1.184277e-03,0.012574,0.002694,2027-09-30,0,...,2403.951156,,,0.973163,1,7901.975578,20.718698,96.689859,20.718698,0.002694
609964,10000,2025-09-30,9,25,2.143808e-04,1.398658e-03,0.012430,0.002867,2027-10-31,0,...,1848.444752,,,0.973163,1,7624.222376,21.268729,92.228293,21.268729,0.002867
609965,10000,2025-09-30,9,26,2.365053e-04,1.635163e-03,0.012272,0.003036,2027-11-30,0,...,632.163220,,,0.973163,1,7016.081610,20.731531,83.792672,20.731531,0.003036


In [681]:
filter_df = (df_sample['days_late']<90) & (df_sample['default_month']<0)
df_sample[filter_df][['id', 'date', 'default_date', 'default_month', 'days_late', 'is_default', 'default_flag', 'stage']]

Unnamed: 0,id,date,default_date,default_month,days_late,is_default,default_flag,stage


In [674]:
df_sample[df_sample['id']==3901][['id', 'date', 'default_date', 'default_month', 'days_late', 'is_default', 'default_flag', 'stage']]

Unnamed: 0,id,date,default_date,default_month,days_late,is_default,default_flag,stage
237900,3901,2026-02-28,2026-09-30,7.0,0,0,1,1
237901,3901,2026-03-31,2026-09-30,6.0,0,0,1,1
237902,3901,2026-04-30,2026-09-30,5.0,0,0,1,1
237903,3901,2026-05-31,2026-09-30,4.0,0,0,1,1
237904,3901,2026-06-30,2026-09-30,3.0,1,0,1,1
237905,3901,2026-07-31,2026-09-30,2.0,30,0,1,2
237906,3901,2026-08-31,2026-09-30,1.0,60,0,1,2
237907,3901,2026-09-30,2026-09-30,0.0,90,1,1,3
237908,3901,2026-10-31,2026-09-30,-1.0,0,0,1,1
237909,3901,2026-11-30,2026-09-30,-2.0,0,0,1,1


In [698]:
from utils import colorRampPaletteFromDfColumn

data_chart = df_sample[df_sample['days_late']<90]\
.groupby(['date', 'default_month'], dropna=False)\
.agg({'id':'count',
      'default_balance':'sum',
      'default_flag':'sum',
      'balance': 'sum'})\
.reset_index()

data_chart['id'] = data_chart.groupby('date')['id'].transform('sum')
data_chart['default_rate'] = data_chart['default_flag'] / data_chart['id']
data_chart['cum_default'] = data_chart.groupby('date')['default_rate'].transform('cumsum')

data_chart['balance'] = data_chart.groupby('date')['balance'].transform('sum')
data_chart['default_rate_bal'] = data_chart['default_balance'] / data_chart['balance']
data_chart['cum_default_rate_bal'] = data_chart.groupby('date')['default_rate_bal'].transform('cumsum')
data_chart = data_chart[['date', 
                         'default_month', 
                         'id', 
                         'balance', 
                         'default_rate',
                         'cum_default',
                         'default_rate_bal',
                         'cum_default_rate_bal']]\
                  .melt(id_vars=['date', 'default_month'])
data_chart = data_chart[~data_chart['default_month'].isnull()]

data_chart
fig = px.line(data_chart,
              y='value',
              x='default_month',
              facet_col='variable',
              facet_col_wrap=2,
              color=data_chart['date'].dt.strftime('%Y-%b'),
              template='none',
              width=1000,
              height=1000,
              color_discrete_sequence=colorRampPaletteFromDfColumn(data_chart, 'date', ['#bfd1e2', 
                                                                                        '#284369']))
fig.update_yaxes(matches=None, showticklabels=True)

In [658]:
df_sample.columns

Index(['id', 'created_date', 'risk_band', 'tenure', 'p', 'cum_p',
       'pd_lifetime', 'p_12m_forward', 'date', 'is_late', 'late_start_date',
       'days_late', 'days_late_last_month', 'prob_rr', 'rgn', 'is_default',
       'n_late_status', 'default_date', 'default_month', 'default_flag',
       'credit_card_max_limit', 'utilisation', 'balance', 'lgd'],
      dtype='object')

In [73]:
df_sample[df_sample['default_flag']==1]

Unnamed: 0,id,created_date,risk_band,tenure,p,cum_p,pd_lifetime,p_12m_forward,date,is_late,...,prob_rr,rgn,is_default,n_late_status,default_date,default_month,default_flag,credit_card_max_limit,utilisation,balance
800,9,2026-07-31,3,0,0.000000,0.000000,3.341250e-01,0.319587,2026-07-31,0,...,1.0,1,0,1,2027-02-28,7.0,1,700,0.992608,694.825921
801,9,2026-07-31,3,1,0.014605,0.014605,3.243655e-01,0.313364,2026-08-31,0,...,1.0,1,0,1,2027-02-28,6.0,1,700,0.997762,698.433506
802,9,2026-07-31,3,2,0.067289,0.081894,2.793995e-01,0.271016,2026-09-30,0,...,1.0,1,0,1,2027-02-28,5.0,1,700,0.999447,699.613055
803,9,2026-07-31,3,3,0.088584,0.170478,2.202032e-01,0.213770,2026-10-31,0,...,1.0,1,0,1,2027-02-28,4.0,1,700,0.999775,699.842628
804,9,2026-07-31,3,4,0.081620,0.252098,1.656603e-01,0.160692,2026-11-30,1,...,1.0,1,0,1,2027-02-28,3.0,1,700,0.999816,699.87115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998195,9982,2025-05-31,5,95,0.000004,0.199987,8.996072e-06,0.000009,2033-04-30,0,...,1.0,1,0,1,2026-09-30,-79.0,1,1700,0.032295,1017.769022
998196,9982,2025-05-31,5,96,0.000004,0.199990,6.506885e-06,0.000007,2033-05-31,0,...,1.0,1,0,1,2026-09-30,-80.0,1,1700,0.130108,1017.769022
998197,9982,2025-05-31,5,97,0.000003,0.199994,4.185833e-06,0.000004,2033-06-30,0,...,1.0,1,0,1,2026-09-30,-81.0,1,1700,0.574443,1017.769022
998198,9982,2025-05-31,5,98,0.000003,0.199997,2.020642e-06,0.000002,2033-07-31,0,...,1.0,1,0,1,2026-09-30,-82.0,1,1700,0.214165,1017.769022


In [76]:
df_sample[df_sample['id']==1]

Unnamed: 0,id,created_date,risk_band,tenure,p,cum_p,pd_lifetime,p_12m_forward,date,is_late,...,prob_rr,rgn,is_default,n_late_status,default_date,default_month,default_flag,credit_card_max_limit,utilisation,balance
0,1,2025-03-31,8,0,0.0,0.0,0.0334125,0.000277,2025-03-31,0,...,1.0,1,0,0,NaT,,0,8100,0.450166,3646.344622
1,1,2025-03-31,8,1,8.825335e-12,8.825335e-12,0.0334125,0.000393,2025-04-30,0,...,1.0,1,0,0,NaT,,0,8100,0.8320184,6739.34892
2,1,2025-03-31,8,2,5.008135e-09,5.01696e-09,0.0334125,0.000537,2025-05-31,0,...,1.0,1,0,0,NaT,,0,8100,0.5744425,4652.984386
3,1,2025-03-31,8,3,1.101391e-07,1.151561e-07,0.03341242,0.000711,2025-06-30,0,...,1.0,1,0,0,NaT,,0,8100,0.8021839,6497.689497
4,1,2025-03-31,8,4,7.481954e-07,8.633515e-07,0.03341192,0.000914,2025-07-31,0,...,1.0,1,0,0,NaT,,0,8100,0.9568927,7750.831235
5,1,2025-03-31,8,5,2.822706e-06,3.686058e-06,0.03341004,0.001146,2025-08-31,0,...,1.0,1,0,0,NaT,,0,8100,0.7109495,5758.690971
6,1,2025-03-31,8,6,7.537597e-06,1.122365e-05,0.033405,0.001406,2025-09-30,0,...,1.0,1,0,0,NaT,,0,8100,0.5986877,4849.370047
7,1,2025-03-31,8,7,1.60918e-05,2.731545e-05,0.03339425,0.001691,2025-10-31,0,...,1.0,1,0,0,NaT,,0,8100,0.2689414,2178.425513
8,1,2025-03-31,8,8,2.942715e-05,5.674261e-05,0.03337458,0.001997,2025-11-30,0,...,1.0,1,0,0,NaT,,0,8100,0.4013123,3250.629953
9,1,2025-03-31,8,9,4.809729e-05,0.0001048399,0.03334244,0.002319,2025-12-31,0,...,1.0,1,0,0,NaT,,0,8100,0.05732418,464.325825


In [70]:
data_chart = df_sample\
    .groupby(['date','default_month'])\
    .agg({
        'id' : 'count',
        'cum_defaults': 'mean',
        'default_flag': 'mean'
    }) \
    .reset_index()\
    .melt(id_vars=['date', 'default_month'])


fig = px.line(
    data_chart,
    x='default_month',
    y='value',
    facet_col='variable',
    color='date',
    template='none',
    width=1000
)

fig.update_yaxes(matches=None, showticklabels=True)
# Image(fig.to_image("png", width=1000))


In [8]:
df_sample.to_parquet(f"{output_path}syntetic_sample.parquet")