In [None]:
def univ(df, remlist=None):
    """Build a univariate summary of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only columns with a numeric dtype are summarised.
    remlist : iterable of column names, optional
        Numeric columns to leave out of the summary. ``None`` (the default)
        excludes nothing.

    Returns
    -------
    pandas.DataFrame
        One row per retained column, in the same order as the columns appear
        in ``df``. Columns are the ``describe`` statistics (mean, std, min,
        the requested percentiles, max) without ``count``, plus ``missing``,
        the percentage of null values in that column.
    """
    import pandas as pd
    import numpy as np

    excluded = set() if remlist is None else set(remlist)
    numeric_cols = df.select_dtypes(include=np.number).columns
    # Keep the DataFrame's own column order (a set difference would shuffle
    # it between runs); dict.fromkeys also drops repeated names.
    univlist = list(dict.fromkeys(c for c in numeric_cols if c not in excluded))
    df_numeric = df[univlist]

    percentiles = [0.0, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
                   0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 1]
    stats = df_numeric.describe(percentiles=percentiles, include='all').T
    missing = (df_numeric.isnull().sum() * 100 / df_numeric.shape[0]).to_frame(name='missing')

    summary = pd.concat([stats, missing], axis=1)
    return summary.drop('count', axis=1)

In [None]:
def KS_table(actual, pred, return_ks_table=True, num_deciles=10):
    """Compute a KS (Kolmogorov-Smirnov) table from targets and scores.

    Scores are bucketed on their quantiles. Tied quantile edges are merged,
    so the table can have fewer than ``num_deciles`` rows; if every score is
    identical a single bucket is used.

    Parameters
    ----------
    actual : array-like
        Target values; ``1 - actual`` is used as the non-event indicator.
    pred : array-like
        Predicted probabilities / scores, aligned with ``actual``.
    return_ks_table : bool, default True
        If True, print the maximum KS and the decile(s) where it occurs and
        return the table. If False, return only the maximum KS.
    num_deciles : int, default 10
        Number of quantile buckets requested.

    Returns
    -------
    pandas.DataFrame or float
        The table (one row per bucket, ``Decile`` 1 = lowest scores) or the
        maximum KS in percent rounded to 2 decimals.

    Notes
    -----
    Cumulative rates are accumulated from the highest-score bucket downwards,
    while the returned table is ordered by ascending ``Decile``.
    """
    import pandas as pd
    import numpy as np

    data = pd.DataFrame({'target': actual, 'prob': pred})
    data['target0'] = 1 - data['target']

    # Unique quantile edges: ties in the scores collapse neighbouring buckets.
    edges = np.unique(data['prob'].quantile(np.linspace(0, 1, num_deciles + 1)))
    if len(edges) < 2:
        # Constant scores leave no interval to cut on; use one bucket.
        data['bucket'] = 1
    else:
        data['bucket'] = pd.cut(data['prob'], edges, include_lowest=True,
                                labels=range(1, len(edges)))

    # Single aggregation pass; observed=False keeps every bucket label.
    kstable = (
        data.groupby('bucket', observed=False)
        .agg(min_prob=('prob', 'min'),
             max_prob=('prob', 'max'),
             avg_prob=('prob', 'mean'),
             events=('target', 'sum'),
             nonevents=('target0', 'sum'))
        .sort_values(by="min_prob", ascending=False)
        .reset_index(drop=True)
    )

    event_share = kstable['events'] / data['target'].sum()
    nonevent_share = kstable['nonevents'] / data['target0'].sum()
    cum_event = event_share.cumsum()
    cum_nonevent = nonevent_share.cumsum()

    pct = '{0:.2%}'.format
    kstable['event_rate'] = event_share.apply(pct)
    kstable['nonevent_rate'] = nonevent_share.apply(pct)
    kstable['cum_eventrate'] = cum_event.apply(pct)
    kstable['cum_noneventrate'] = cum_nonevent.apply(pct)
    kstable['bad_rate'] = kstable['events'] * 100 / (kstable['events'] + kstable['nonevents'])
    kstable['KS'] = np.round(cum_event - cum_nonevent, 5) * 100

    # Number the buckets from the actual row count: the highest-score bucket
    # gets the largest decile number. Using num_deciles here would fail
    # whenever ties reduced the number of buckets.
    kstable.insert(0, 'Decile', list(range(len(kstable), 0, -1)))
    kstable = kstable.sort_values('Decile', ascending=True).reset_index(drop=True)

    ks_max = kstable['KS'].max()
    ks_value = round(ks_max, 2)
    if not return_ks_table:
        return ks_value

    best_deciles = kstable.loc[kstable['KS'] == ks_max, 'Decile'].values
    message = "KS is " + str(ks_value) + "%" + " at decile " + str(best_deciles)
    try:
        # Colour is cosmetic; fall back to plain text without colorama.
        from colorama import Fore, Style
        message = Fore.RED + message + Style.RESET_ALL
    except ImportError:
        pass
    print(message)
    return kstable
        

In [None]:
def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence and Information Value per feature.

    Every column of ``data`` except ``target`` is processed. Numeric columns
    (dtype kind in ``'bifc'``) with more than 10 distinct values are
    quantile-binned with ``pd.qcut``; all other columns are grouped on their
    raw values. Levels are cast to ``str`` before grouping.

    For each level::

        WoE = ln(% of Non-Events / % of Events)
        IV  = WoE * (% of Non-Events - % of Events)

    Event and non-event counts are floored at 0.5 so that empty cells do not
    produce ``log(0)`` or a division by zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column.
    target : str
        Name of the target column; it is summed to count events.
    bins : int, default 10
        Number of quantile bins requested for high-cardinality numeric columns.
    show_woe : bool, default False
        If True, print the per-level table of each variable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``newDF`` with columns ``Variable`` and ``IV`` (one row per feature)
        and ``woeDF`` with the per-level tables of all features stacked.
    """
    import pandas as pd
    import numpy as np

    iv_rows = []
    woe_frames = []

    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        # After the cast every level, including a missing one, is a string.
        d0 = d0.astype({"x": str})
        d = d0.groupby("x", as_index=False, dropna=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Non-Events'] / d['% of Events'])
        d['IV'] = d['WoE'] * (d['% of Non-Events'] - d['% of Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        # Collect the results instead of discarding them at the end of the
        # iteration.
        iv_rows.append({'Variable': ivars, 'IV': d['IV'].sum()})
        woe_frames.append(d)
        if show_woe:
            print(d)

    newDF = pd.DataFrame(iv_rows, columns=['Variable', 'IV'])
    if woe_frames:
        woeDF = pd.concat(woe_frames, axis=0, ignore_index=True)
    else:
        woeDF = pd.DataFrame()
    return newDF, woeDF

In [None]:
def calculate_psi_continuous(dataframe1, dataframe2, features, bins=10):
    """Compute the Population Stability Index of continuous features.

    For each feature the range of the expected sample is split into ``bins``
    equal-width intervals. The two outer intervals are open-ended, so actual
    values below the expected minimum or above the expected maximum are
    counted in the first / last bin instead of being dropped. Missing values
    are ignored and proportions are taken over the non-missing values.

    PSI = sum((actual% - expected%) * ln(actual% / expected%)), with empty
    bins replaced by 1e-10 to keep the logarithm finite.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Expected (reference) sample.
    dataframe2 : pandas.DataFrame
        Actual (comparison) sample.
    features : iterable of str
        Numeric columns present in both frames.
    bins : int, default 10
        Number of equal-width bins.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Value``; rows are listed with the last
        entry of ``features`` first.

    Raises
    ------
    ValueError
        If a feature has no non-missing values in ``dataframe1``.
    """
    import pandas as pd
    import numpy as np

    epsilon = 1e-10
    rows = []
    for feature in features:
        expected = np.asarray(dataframe1[feature], dtype=float)
        actual = np.asarray(dataframe2[feature], dtype=float)
        expected = expected[~np.isnan(expected)]
        actual = actual[~np.isnan(actual)]
        if expected.size == 0:
            raise ValueError(
                "no non-missing expected values for feature {!r}".format(feature)
            )

        # Equal-width edges over the expected range; widening the outer edges
        # leaves the expected counts unchanged but keeps out-of-range actual
        # values in the histogram.
        breakpoints = np.linspace(expected.min(), expected.max(), bins + 1)
        breakpoints[0] = -np.inf
        breakpoints[-1] = np.inf

        expected_hist, _ = np.histogram(expected, bins=breakpoints)
        actual_hist, _ = np.histogram(actual, bins=breakpoints)

        expected_prop = expected_hist / len(expected)
        actual_prop = actual_hist / len(actual)

        # Avoid zero numerator or denominator inside the logarithm.
        actual_prop[actual_prop == 0] = epsilon
        expected_prop[expected_prop == 0] = epsilon

        psi = np.sum((actual_prop - expected_prop) * np.log(actual_prop / expected_prop))
        rows.append({'Feature': feature, 'Value': psi})

    # Build the frame once; reversed so the last feature comes first.
    return pd.DataFrame(rows[::-1], columns=['Feature', 'Value'])

In [None]:
def calculate_psi_categorical(dataframe1, dataframe2, features):
    """Compute the Population Stability Index of categorical features.

    For each feature the relative frequency of every category is taken in
    both samples and
    PSI = sum((actual% - expected%) * ln(actual% / expected%)).
    A category absent from one sample gets a small placeholder share on that
    side only (1e-6 for actual, 1e-9 for expected), so it still contributes
    to the index.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Expected (reference) sample.
    dataframe2 : pandas.DataFrame
        Actual (comparison) sample.
    features : iterable of str
        Columns present in both frames.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Value``; rows are listed with the last
        entry of ``features`` first.
    """
    import pandas as pd
    import numpy as np

    rows = []
    for feature in features:
        # Frequency tables for expected and actual values.
        expected_freq = pd.Series(dataframe1[feature]).value_counts(normalize=True)
        actual_freq = pd.Series(dataframe2[feature]).value_counts(normalize=True)

        # Align on category; a category missing on one side becomes 0.
        freq_table = pd.concat(
            [expected_freq, actual_freq], axis=1, keys=['Expected', 'Actual']
        ).fillna(0)

        # Replace only the zero cell. Assigning through a boolean row mask
        # would overwrite both columns and cancel the category's contribution.
        freq_table.loc[freq_table['Actual'] == 0, 'Actual'] = 0.000001
        freq_table.loc[freq_table['Expected'] == 0, 'Expected'] = 0.000000001

        psi = np.sum(
            (freq_table['Actual'] - freq_table['Expected'])
            * np.log(freq_table['Actual'] / freq_table['Expected'])
        )
        rows.append({'Feature': feature, 'Value': psi})

    # Build the frame once; reversed so the last feature comes first.
    return pd.DataFrame(rows[::-1], columns=['Feature', 'Value'])

In [None]:
def bivariate(data, cols, target_var, bins=10):
    """Summarise the target by quantile bin for each feature in ``cols``.

    Each feature is cut on its own quantiles (NaN ignored when computing
    them), with the outermost edges widened to -inf / +inf. Rows whose value
    is missing are placed in a bin labelled ``"missing"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Features and target.
    cols : iterable of str
        Numeric feature columns to analyse.
    target_var : str
        Target column; it is summed to count events.
    bins : int, default 10
        Number of quantile bins requested; duplicate edges are merged.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, bin) with columns ``variable``, ``bin``,
        ``min_value``, ``max_value``, ``total``, ``events``, ``events_perc``,
        ``non_events``, ``non_events_perc``, ``pop_perc`` and ``event_rate``.
        The ``*_perc`` columns are fractions of all rows in ``data``. An
        empty frame with these columns is returned when ``cols`` is empty.
    """
    import pandas as pd
    import numpy as np

    out_columns = ["variable", "bin", "min_value", "max_value", "total", "events",
                   "events_perc", "non_events", "non_events_perc", "pop_perc",
                   "event_rate"]

    tar_data = data[target_var].values  # same for every feature
    frames = []
    for col in cols:
        values = data[col].values
        edges = list(np.unique(np.nanquantile(values, np.linspace(0, 1, bins + 1))))
        # Open-ended outer bins so the extremes are always covered.
        cut_points = [-np.inf] + edges[1:-1] + [np.inf]
        data_cut = pd.cut(values, cut_points, include_lowest=True,
                          labels=range(1, len(cut_points)))
        # np.where puts the numeric labels and "missing" into one string array.
        # NOTE(review): with NaNs present the labels appear to be rendered
        # from floats (e.g. "1.0") rather than ints - confirm before relying
        # on the label text.
        bin_labels = np.where(data_cut.isna(), "missing", data_cut)

        df = pd.DataFrame({'bin': bin_labels, 'value': values, 'target': tar_data})
        records = df.shape[0]
        df_grp = df.groupby('bin').agg(
            total=('target', 'count'),
            events=('target', 'sum'),
            min_value=('value', 'min'),
            max_value=('value', 'max'),
        ).reset_index()
        df_grp['variable'] = col
        df_grp['event_rate'] = df_grp['events'] / df_grp['total']
        df_grp['non_events'] = df_grp['total'] - df_grp['events']
        df_grp['non_events_perc'] = df_grp['non_events'] / records
        df_grp['events_perc'] = df_grp['events'] / records
        df_grp['pop_perc'] = df_grp['total'] / records
        frames.append(df_grp[out_columns])

    if not frames:
        # Nothing to analyse: return an empty table instead of failing on an
        # unassigned result variable.
        return pd.DataFrame(columns=out_columns)
    return pd.concat(frames, axis=0)

In [None]:
def woe_bivariate(train, test, target_var, cols, bins=10):
    """Build per-bin WoE tables for train and test using train-based bins.

    For each feature the quantile edges are computed on ``train`` (NaN
    ignored), the outermost edges are widened to -inf / +inf, and the same
    edges are applied to ``test``. Rows with a missing value go to a bin
    labelled ``"missing"``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Samples containing ``cols`` and ``target_var``.
    target_var : str
        Target column; it is summed to count events.
    cols : iterable of str
        Numeric feature columns to analyse.
    bins : int, default 10
        Number of quantile bins requested; duplicate edges are merged.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test tables, one row per (variable, bin), with columns
        ``variable``, ``dec``, ``min_value``, ``max_value``, ``total``,
        ``pop_perc``, ``events``, ``events_perc``, ``non_events``,
        ``non_events_perc``, ``event_rate`` and ``woe``. The ``*_perc``
        columns are percentages of all rows of the sample. Two empty frames
        with these columns are returned when ``cols`` is empty.
    """
    # Imported locally, like the other helpers in this file, so the function
    # does not depend on notebook-level globals.
    import pandas as pd
    import numpy as np

    out_columns = ["variable", "dec", "min_value", "max_value", "total", "pop_perc",
                   "events", "events_perc", "non_events", "non_events_perc",
                   "event_rate", "woe"]

    def cal_woe(df, var1):
        """Aggregate one binned feature and attach event shares and WoE."""
        records = df.shape[0]
        df_grp = df.groupby('dec').agg(
            total=('target', 'count'),
            events=('target', 'sum'),
            min_value=('value', 'min'),
            max_value=('value', 'max'),
        ).reset_index()
        df_grp['variable'] = var1
        df_grp['event_rate'] = df_grp['events'] / df_grp['total']
        df_grp['non_events'] = df_grp['total'] - df_grp['events']
        df_grp['non_events_perc'] = (df_grp['non_events'] * 100) / records
        df_grp['events_perc'] = (df_grp['events'] * 100) / records
        df_grp['pop_perc'] = (df_grp['total'] * 100) / records
        df_grp['neg_per'] = df_grp['non_events'] / df_grp['non_events'].sum()
        df_grp['pos_per'] = df_grp['events'] / df_grp['events'].sum()
        # WoE here is ln(event share / non-event share); the 1e-08 offset
        # keeps the ratio finite when a bin has no events or no non-events.
        df_grp['woe'] = np.log((df_grp['pos_per'] + 1e-08) / (df_grp['neg_per'] + 1e-08))
        return df_grp[out_columns]

    tar_train = train[target_var].values
    tar_test = test[target_var].values

    train_frames = []
    test_frames = []
    for col in cols:
        train_values = train[col].values
        test_values = test[col].values
        edges = list(np.unique(np.nanquantile(train_values, np.linspace(0, 1, bins + 1))))
        # Open-ended outer bins so test values outside the train range still
        # fall into the first / last bin.
        cut_points = [-np.inf] + edges[1:-1] + [np.inf]
        labels = range(1, len(cut_points))

        train_cut = pd.cut(train_values, cut_points, include_lowest=True, labels=labels)
        test_cut = pd.cut(test_values, cut_points, include_lowest=True, labels=labels)
        # np.where puts the numeric labels and "missing" into one string array.
        train_bins = np.where(train_cut.isna(), "missing", train_cut)
        test_bins = np.where(test_cut.isna(), "missing", test_cut)

        df_tr = pd.DataFrame({'dec': train_bins, 'value': train_values, 'target': tar_train})
        df_te = pd.DataFrame({'dec': test_bins, 'value': test_values, 'target': tar_test})
        train_frames.append(cal_woe(df_tr, col))
        test_frames.append(cal_woe(df_te, col))

    if not train_frames:
        # Nothing to analyse: return empty tables instead of failing on
        # unassigned result variables.
        return pd.DataFrame(columns=out_columns), pd.DataFrame(columns=out_columns)
    return pd.concat(train_frames, axis=0), pd.concat(test_frames, axis=0)

In [None]:
def validation_perf(train_ks_df, scored_df, target_col, num_deciles=10):
    """Score a validation sample against the decile bounds of a training KS table.

    Every validation row with a 0/1 target is assigned to the training decile
    whose ``[min_prob, max_prob]`` range contains its ``pred_prob``; rows
    falling in none of the ranges are left out of the per-decile counts.
    ``train_ks_df`` is not modified.

    Parameters
    ----------
    train_ks_df : pandas.DataFrame
        Training KS table with at least ``Decile``, ``min_prob`` and
        ``max_prob`` columns.
    scored_df : pandas.DataFrame
        Validation data with ``target_col`` and a ``pred_prob`` column.
    target_col : str
        Name of the target column; only rows where it is 0 or 1 are used.
    num_deciles : int, default 10
        Used only for the printed share of customers in the lower half of
        the deciles.

    Returns
    -------
    pandas.DataFrame
        One row per training decile that received validation rows, sorted by
        ``Decile_train``, with counts, event rates, cumulative rates (from
        decile 1 upwards), ``bad_rate``, ``KS`` (absolute, in percent) and
        ``population_percent``.
    """
    import pandas as pd
    import numpy as np

    labelled = scored_df[scored_df[target_col].isin([0, 1])]
    val = pd.DataFrame({'target_val': labelled[target_col],
                        'prob_val': labelled['pred_prob']})
    val['target0_val'] = 1 - val['target_val']

    # Suffix a copy: renaming the caller's columns in place would break any
    # later use of the training table, including a second call here.
    train_bounds = train_ks_df.add_suffix('_train')

    # Pair every decile with every validation row, then keep the pairs where
    # the score lies inside the decile's probability range.
    crossed = train_bounds.merge(val, how='cross')
    inside = (crossed['prob_val'] >= crossed['min_prob_train']) & \
             (crossed['prob_val'] <= crossed['max_prob_train'])
    matched = crossed[inside]

    summary = matched.groupby(['Decile_train', 'min_prob_train', 'max_prob_train']).agg(
        avg_prob=('prob_val', 'mean'),
        num_cust=('prob_val', 'count'),
        events=('target_val', 'sum'),
        nonevents=('target0_val', 'sum'),
    ).reset_index()
    summary = summary.sort_values('Decile_train', ascending=True)

    # Shares are relative to all labelled validation rows, matched or not.
    event_share = summary['events'] / val['target_val'].sum()
    nonevent_share = summary['nonevents'] / val['target0_val'].sum()
    pct = '{0:.2%}'.format
    summary['event_rate'] = event_share.apply(pct)
    summary['nonevent_rate'] = nonevent_share.apply(pct)
    summary['cum_eventrate'] = event_share.cumsum()
    summary['cum_noneventrate'] = nonevent_share.cumsum()
    summary['bad_rate'] = summary['events'] * 100 / (summary['events'] + summary['nonevents'])
    summary['KS'] = abs(np.round(summary['cum_eventrate'] - summary['cum_noneventrate'], 5) * 100)

    total_cust = summary['num_cust'].sum()
    summary['population_percent'] = np.round(summary['num_cust'] / total_cust, 5) * 100

    low_risk_cust = summary[summary['Decile_train'] < (num_deciles / 2 + 1)]['num_cust'].sum()
    print("% cust in low risk ", int(num_deciles / 2), " Decile :", low_risk_cust / total_cust)

    out_columns = ['Decile_train', 'min_prob_train', 'max_prob_train', 'avg_prob',
                   'num_cust', 'event_rate', 'events', 'nonevents', 'cum_eventrate',
                   'cum_noneventrate', 'bad_rate', 'KS', 'population_percent']
    return summary[out_columns].sort_values('Decile_train', ascending=True)