In [155]:
import sklearn as sk
import numpy as np
import pandas as pd
import warnings
import time
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import model_selection
from sklearn import cross_validation
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
warnings.simplefilter('ignore', DeprecationWarning)


class Scorecard():
    def __init__(self, max_bins=16, minimum_leaf=0.025, corr_threshold=0.9, odds_X_to_one = 100, odds_score=700, double_odds=25):
        """Create an unfitted scorecard and initialise all working state.

        Parameters
        ----------
        max_bins
            Bound on the number of bins of a numeric variable; the binning
            tree depth is derived from it as ``log2(max_bins)``.
        minimum_leaf
            ``min_samples_leaf`` given to the binning decision trees.
        corr_threshold
            Absolute correlation above which the later of two columns is
            dropped by ``exclude_corr_factors``.
        odds_X_to_one, odds_score, double_odds
            Score scaling: ``factor = double_odds / ln(2)`` and
            ``offset = odds_score - factor * ln(odds_X_to_one)``.
        """
        # Hyper-parameters.
        self.max_bins = max_bins
        self.minimum_leaf = minimum_leaf
        self.corr_threshold = corr_threshold
        self.odds_X_to_one = odds_X_to_one
        self.odds_score = odds_score
        self.double_odds = double_odds

        # Model objects.
        self.regressor=LogisticRegression() #Regression build method
        self.logit_model = [] #model object for LogisticRegression
        # Was `int` (the type object itself); None marks "not computed yet".
        self.gini = None #Gini of model

        # Samples in their different representations.
        self.x = pd.DataFrame() #Input sample
        self.y = pd.DataFrame() #Targets
        self.x_one_hot = pd.DataFrame() #Input sample in one-hot view
        self.x_binned = pd.DataFrame()
        self.x_corr_matrix = []

        # Variable bookkeeping filled in by fit().
        self.vars = []
        self.vars_after_iv_cut = []
        self.vars_after_corr_cut = []
        self.vars_after_corr_cut_one_hot = []
        self.var_list_types = {} #Types of variables
        self.var_list_bins = {} #Binning of scorecard variables dictionary
        self.iv_table = {} #Dictionary which contains iv table for each variable
        self.column = '' #Column currently processed by forward_cat

        self.scorecard = pd.DataFrame() #Final scorecard representation
        
   
  
    #Learn model on sample
    def fit(self,x,y,iv_threshold):
        """Bin the variables, filter them and train the logistic regression.

        Steps, in order:
          1. store positionally re-indexed copies of ``x`` and ``y``;
          2. detect variable types (``fill_vars_cats``);
          3. drop raw columns whose absolute correlation with an earlier
             column exceeds ``corr_threshold``;
          4. for every remaining column learn the bins, build its IV table
             (stored in ``iv_table[col]``) and keep the column when its
             information value is >= ``iv_threshold``;
          5. one-hot encode the kept columns and drop correlated dummies;
          6. fit ``self.regressor`` and build ``self.scorecard``.

        Parameters
        ----------
        x : pandas.DataFrame
            Input sample, one column per candidate variable.
        y : pandas.Series or pandas.DataFrame
            Targets aligned row by row with ``x``; the code sums them per bin
            to count "bads", so 0/1 values are assumed.
        iv_threshold : float
            Minimum information value for a variable to be kept.
        """
        # drop=True discards the old index instead of materialising it as a
        # column that then has to be deleted (which failed for named indexes
        # or when a column called "index" already existed).
        self.x = x.reset_index(drop=True)
        y = y.reset_index(drop=True)
        # Targets stay a one-column frame, as the rest of the class expects.
        self.y = y.to_frame() if isinstance(y, pd.Series) else y
        target = self.y.iloc[:, 0]

        # Results of a previous fit must not leak into this one.
        self.vars_after_iv_cut = []
        self.iv_table = {}

        self.fill_vars_cats()
        print("Start excluding correlations on main sample")
        # Correlation is only defined for numeric/bool columns; selecting them
        # explicitly keeps this working on pandas versions where corr() no
        # longer silently skips string columns.
        self.x_corr_matrix = self.x.select_dtypes(include=['number', 'bool']).corr()
        self.exclude_corr_factors(mode='normal')
        print("Finish excluding correlations on main sample")
        print('Start binning columns...')
        #fill all values of var_list_bins
        for col in self.x.columns:
            print('Binning: ',col)
            self.binning(mode_forward='binning',mode_output='normal',column_name=col)
            #Filling IV table on current variable
            df_t = pd.DataFrame(self.binning(mode_forward='forward',mode_output='normal',column_name=col))
            df_t["y"] = target
            # One groupby, reused for every aggregate.
            by_bin = df_t.groupby(col)['y']
            n_obs = by_bin.count()
            n_bad = by_bin.sum()
            df_iv = pd.DataFrame({'count': n_obs,
                                  'bad_rate': by_bin.mean(),
                                  'total_goods': n_obs - n_bad,
                                  'total_bads': n_bad
                                  }).reset_index()
            df_iv["cumm_bads"] = df_iv['total_bads'].cumsum()
            df_iv["cumm_goods"] = df_iv['total_goods'].cumsum()
            df_iv["cumm_total"] = df_iv['count'].cumsum()
            df_iv["per_bad"] = df_iv["total_bads"]/df_iv["cumm_bads"].max()
            df_iv["per_good"] = df_iv["total_goods"]/df_iv["cumm_goods"].max()
            # NOTE(review): only the denominator is smoothed, so a bin with
            # zero goods still yields WoE = -inf and IV = inf.
            woe = np.log((df_iv["per_good"])/(df_iv["per_bad"]+0.000000001))
            df_iv["woe"] = woe
            iv_total = ((df_iv["per_good"] - df_iv["per_bad"])*woe).sum()
            df_iv["iv"] = iv_total
            self.iv_table[col] = df_iv
            if iv_total>=iv_threshold: self.vars_after_iv_cut.append(col)
            print('       IV = ', iv_total)
        #creating sample in one-hot view
        # Seed frame carries the row index; its single column is removed below.
        self.x_one_hot = pd.DataFrame(self.x.index.values)
        for col in self.vars_after_iv_cut:
            self.x_one_hot = pd.merge(self.x_one_hot, pd.DataFrame(self.binning(mode_forward='forward',mode_output='one-hot',column_name=col)),left_index=True,right_index=True)
        del self.x_one_hot[self.x_one_hot.columns[0]]
        self.x = self.x[self.vars_after_iv_cut]
        print('Exclude correlations on one-hot...')
        self.exclude_corr_factors(mode='one-hot')
        print('Building regression...')
        # A 1-D target avoids the column-vector conversion done by scikit-learn.
        self.regressor.fit(self.x_one_hot,target)
        self.scorecard_view()
        
    def predict_proba(self,x):
        self.x = x        
        self.x = x.reset_index()
        del self.x["index"]
        self.x_binned = pd.DataFrame(self.x.index.values)
        cols_to_delete = set(self.x.columns) - set(self.vars_after_iv_cut)
        for c in cols_to_delete:
            del self.x[c]
        for col in self.vars_after_iv_cut:
            self.x_binned = pd.merge(self.x_binned,pd.DataFrame(self.binning(mode_forward='forward',mode_output='one-hot',column_name = col)),left_index=True,right_index=True)
            #del x_binned[x_binned.columns[0]]
        cols_to_delete = set(self.x_binned.columns) - set(self.scorecard["Variable"])
        for c in cols_to_delete:
            del self.x_binned[c]
        self.x_binned = self.x_binned.reindex_axis(sorted(self.x_binned), axis=1) 
        return self.regressor.predict_proba(self.x_binned)[:,1]
        
    def predict_score(self,x):
        y_pred = self.predict_proba(x)
        bias = self.odds_score - self.double_odds*np.log(self.odds_X_to_one)/np.log(2)   
        odds = self.double_odds/np.log(2)         
        return bias+odds*np.log(1/y_pred-1)  
      
    
    def scorecard_view(self):
      #  print('Printing scorecard...')
        self.scorecard=[]
        cols = np.array('Intercept')
        cols = np.append(cols,np.array(self.vars_after_corr_cut_one_hot))
        vals = np.array(self.regressor.intercept_)
        vals = np.append(vals,np.array(self.regressor.coef_))
        self.scorecard = pd.DataFrame(cols)
        self.scorecard.rename(columns={0: 'Variable'},inplace=True)
        self.scorecard["Regression_coef"] = pd.DataFrame(vals)
        b = self.double_odds/np.log(2)
        a = self.odds_score - b*np.log(self.odds_X_to_one)    
        self.scorecard["Score"] = -self.scorecard["Regression_coef"]*b
        self.scorecard["Score"][0] = self.scorecard["Score"][0]+a
        self.scorecard["Score"] = round(self.scorecard["Score"],2)
        
    
    
    #Exclude correlations. Fill vars_after_corr_cut. Exclude correlated columns from x_one_hot
    def exclude_corr_factors(self,mode):
        if mode=='normal': x_corr = self.x_corr_matrix
        if mode=='one-hot': x_corr = self.x_one_hot.corr()    
        cols_drop=[]
        for i in range(0,len(x_corr.columns)):
            if x_corr.columns[i] not in cols_drop:
                for j in range(i+1,len(x_corr.columns)):        
                    if abs(x_corr.iloc[i][j])>self.corr_threshold: cols_drop.append(x_corr.iloc[j].name)
        if mode=='normal': 
            self.vars_after_corr_cut = list(set(self.x.columns) - set(cols_drop))
            self.vars_after_corr_cut.sort()
        if mode=='one-hot': 
            self.vars_after_corr_cut_one_hot = list(set(self.x_one_hot.columns) - set(cols_drop))
            self.vars_after_corr_cut_one_hot.sort()            
        print('Dropped columns:', cols_drop)
        if mode=='normal': self.x = self.x[self.vars_after_corr_cut]
        if mode=='one-hot': self.x_one_hot = self.x_one_hot[self.vars_after_corr_cut_one_hot]
            
        
    
    #Input - name of one numeric variable
    #Output - sorted bin edges built from a decision tree. Maximum number of bins = max_bins
    def split_numeric(self,column_name):
        """Find bin edges for a numeric column with a decision tree.

        Trees of depth 1 .. ``int(log2(max_bins))`` are compared by 3-fold
        cross-validated ROC AUC on the non-null rows; the best depth is
        refitted on all non-null rows and its split thresholds become the
        inner bin edges.

        Parameters
        ----------
        column_name : str
            Column of ``self.x`` to bin.

        Returns
        -------
        numpy.ndarray
            Sorted edges: ``min - 1``, the tree thresholds, ``max + 1``.
        """
        present = self.x[column_name].notnull() #Exclude nulls
        # DecisionTreeClassifier needs a 2-D feature matrix.
        x_train_t = np.array(self.x[column_name][present]).reshape(-1, 1)
        y_train_t = np.array(self.y[present])
        y_train_t = y_train_t.reshape(len(y_train_t),)

        m_depth = int(np.log2(self.max_bins)) + 1 #Maximum tree depth (exclusive)
        start = 1
        cv = 3
        cv_scores = []
        for depth in range(start,m_depth): #CV score for every candidate depth
            d_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=depth, min_samples_leaf=self.minimum_leaf)
            scores = cross_val_score(d_tree, x_train_t, y_train_t, cv=cv,scoring='roc_auc')
            cv_scores.append(scores.mean())
        best = int(np.argmax(cv_scores)) + start #Depth with the highest mean AUC

        # The final tree uses the configured leaf size, like the CV trees did
        # (it was hard-coded to 0.025, ignoring self.minimum_leaf).
        final_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=best, min_samples_leaf=self.minimum_leaf)
        final_tree.fit(x_train_t, y_train_t)

        # Internal nodes have feature >= 0; leaves carry a negative marker.
        opt_bins = final_tree.tree_.threshold[final_tree.tree_.feature >= 0]
        # Outer borders placed just outside the observed range.
        opt_bins = np.append(opt_bins,[x_train_t.max()+1, x_train_t.min()-1])
        return np.sort(opt_bins) #Return optimal binning
    
    #Split categorial variable. Grouping variable for regularization.
    #Input = column name
    #Output : add to var_list_bins binned variable as dictionary
    def split_categorial(self,column_name):
        """Group the categories of a column with a decision tree.

        Nulls in ``self.x[column_name]`` are replaced in place by the string
        ``'MISSING'``. The column is one-hot encoded, tree depths are compared
        by 3-fold cross-validated ROC AUC, and the best depth is refitted on
        the whole sample. Categories ending in the same leaf form one group.

        Parameters
        ----------
        column_name : str
            Column of ``self.x`` to group.

        Returns
        -------
        dict
            Leaf id -> ``str()`` of the array of unique category values that
            fall into that leaf.
        """
        #One-hot encoding
        self.x[column_name] = self.x[column_name].fillna('MISSING')
        x_cat = pd.get_dummies(self.x[column_name],prefix = self.x[column_name].name)
        y_t = np.array(self.y)
        y_t = y_t.reshape(len(y_t),)

        # NOTE(review): max() makes the search run at least 20 depths;
        # a cap via min() may have been intended - confirm before changing.
        max_bins = max(self.x[column_name].nunique(),20)
        m_depth = max_bins+1
        start = 1
        cv = 3
        cv_scores = []
        for depth in range(start,m_depth):
            d_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=depth, min_samples_leaf=self.minimum_leaf)
            scores = cross_val_score(d_tree, x_cat, y_t, cv=cv,scoring='roc_auc')
            cv_scores.append(scores.mean())
        best = int(np.argmax(cv_scores)) + start #Depth with the highest mean AUC

        # Same leaf size as during CV (was hard-coded to 0.025) and the 1-D
        # target instead of the one-column frame.
        final_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=best, min_samples_leaf=self.minimum_leaf)
        final_tree.fit(x_cat, y_t)

        #Leaf reached by every row
        leaves = pd.Series(final_tree.apply(x_cat), index=self.x.index)

        #Make dictionary with optimal binning, in order of first appearance
        d = {}
        for leaf in leaves.unique():
            d[leaf]=str(self.x[column_name][leaves==leaf].unique())
        return d
   
    #Define variable category - numeric or categorial
    #Input - column name
    #Output - numeric or cat
    def check_type(self,column_name):
        from pandas.api.types import is_string_dtype
        from pandas.api.types import is_numeric_dtype   
        #delete nulls
        tmp_var = self.x[column_name][self.x[column_name].notnull()]
        #If number of uniques<=4 then type = categorial
        if tmp_var.nunique()<=4: return 'cat'
        elif is_numeric_dtype(tmp_var): return 'numeric'
        else: return 'cat'
    
    #Fill var_list_types with the detected type ('numeric' or 'cat') of every column of x
    def fill_vars_cats(self):
        from pandas.api.types import is_string_dtype
        from pandas.api.types import is_numeric_dtype 
        for col in self.x[self.x.columns]:
            if self.check_type(col)=='numeric': self.var_list_types[col]='numeric'
            if self.check_type(col)=='cat': 
                self.var_list_types[col]='cat'
                if (self.x[col].nunique()<=4)&(is_numeric_dtype(self.x[col])): self.x[col] = self.x[col].apply(lambda x: 'cat_'+str(x))
                
    
    #Add leading zeros to names
    def zero_pad(self,x):
        if str(x)=='MISSING': return '000'
        if len(str(x))==3: return str('00'+str(x))[:-2]+': '
        if len(str(x))==4: return str('0'+str(x))[:-2]+': '
        if len(str(x))==5: str(x)[:-2]+': '
    
    #Naming for categories by rank
    def make_dict(x):        
        x_dict = x.groupby(0)["val"].min().fillna(0).sort_values().reset_index().rename(index=str, columns={0: "x"})
        x_dict['rownum'] = x_dict['val'].rank(method='first', na_option='top')
        x_dict['rownum'] = x_dict['rownum'].apply(zero_pad)
        x_dict['x_num'] = x_dict["rownum"].map(str)+x_dict["x"].map(str)
        del x_dict['val']
        del x_dict['rownum']
        return x_dict   
    
    #Binning procedure
    #Returns the binned sample. Has two output modes - 'one-hot' and 'normal'
    #Inputs
    #      mode_output - 'normal' or 'one-hot': form of the result in feed-forward mode
    #      mode_forward - 'binning': calculate the optimal bins of the column
    #                     'forward': calculate outputs using the stored optimal bins
    #      column_name - name of the column of x to process
    #
    
    #Needed to feed forward categorical variables
    #Looks the value up in the groups stored in var_list_bins for the current column
    #If a group contains the value - returns that group
    
    
    def forward_cat(self,x):
        for i in self.var_list_bins[self.column].keys():
            if str(x) in self.var_list_bins[self.column][i]:
                return str(self.var_list_bins[self.column][i]) 
    
    def binning(self,mode_output,mode_forward,column_name):
        """Learn the bins of one column or apply the stored bins to it.

        The behaviour is selected by ``self.var_list_types[column_name]``
        together with ``mode_forward``:

        * ``mode_forward='binning'`` - compute the bins with ``split_numeric``
          or ``split_categorial`` and store them in
          ``self.var_list_bins[column_name]``. Nothing is returned.
        * ``mode_forward='forward'`` - apply the stored bins to
          ``self.x[column_name]`` and return the result in the form chosen by
          ``mode_output``:
            - ``'normal'``  : one label per row;
            - ``'one-hot'`` : dummy columns with the first level dropped
              (numeric columns get an extra ``<name>_ISNULL`` dummy when the
              column contains nulls).

        Any other combination falls through and returns ``None``.
        """
        variable_type = self.var_list_types[column_name]
        if (variable_type=='numeric')&(mode_forward=='forward'):         
            #Helper: bin assignment of the non-null values
            x_bin_t = pd.cut(self.x[column_name][self.x[column_name].notnull()],bins=self.var_list_bins[column_name])    
            #Helper: one-hot encoding of the non-null values
            x_bin = pd.get_dummies(x_bin_t,prefix=self.x[column_name].name,drop_first=True)
            #Add the indicator column for null values
            x_bin[self.x[column_name].name+'_ISNULL']=0
            x_null = pd.DataFrame(self.x[column_name][self.x[column_name].isnull()])
            for i in x_bin.columns:
                x_null[i]=0
            x_null[self.x[column_name].name+'_ISNULL']=1
            del x_null[self.x[column_name].name]
            #If there are no NULLs, drop the is-null dummy column
            if len(self.x[column_name][self.x[column_name].isnull()])==0:
                del x_null[self.x[column_name].name+'_ISNULL']
                del x_bin[self.x[column_name].name+'_ISNULL']
            #Helper holding the narrow and the wide view, including the null values
            x_pivot = pd.concat([x_bin_t,pd.DataFrame(self.x[column_name][self.x[column_name].isnull()])]).sort_index(axis=0)        
            del x_pivot[self.x[column_name].name]
            #Fill the null values with MISSING
            x_pivot = x_pivot.fillna('MISSING')
            x_pivot['val'] = self.x[column_name]        
            #Add an index to the categories (this builds a lookup table)
            x_dict = x_pivot.groupby(0)["val"].min().fillna(0).sort_values().reset_index().rename(index=str, columns={0: "x"})
            x_dict['rownum'] = x_dict['val'].rank(method='first', na_option='top')
            x_dict['rownum'] = x_dict['rownum'].apply(self.zero_pad)
            x_dict[column_name] = x_dict["rownum"].map(str)+x_dict["x"].map(str)
            del x_dict['val']
            del x_dict['rownum']
            x_d =  x_dict   
            # Remember the original row position so the order can be restored after the merge
            x_pivot["rownum"] = x_pivot.index.values
            x_pivot = pd.merge(x_pivot,x_d,left_on=0,right_on="x").sort_values(by='rownum').reset_index()[column_name]
            #Values are now joined with the lookup table; the originals are dropped
            if mode_output=='one-hot': return pd.concat([x_bin,x_null]).sort_index(axis=0) #Return in one-hot form
            if mode_output=='normal': return x_pivot #Return in "long and narrow" form
        if (variable_type=='cat')&(mode_forward=='forward'): 
            # forward_cat reads the column to use from self.column, so it is set before apply()
            if mode_output=='normal': 
                self.column = column_name
                return self.x[column_name].apply(self.forward_cat)
            if mode_output=='one-hot': 
                self.column = column_name
                return pd.get_dummies(self.x[column_name].apply(self.forward_cat), drop_first=True,prefix=self.x[column_name].name)
        if (variable_type=='numeric')&(mode_forward=='binning'):
            self.var_list_bins[column_name] = self.split_numeric(column_name)
        if (variable_type=='cat')&(mode_forward=='binning'):                
            self.var_list_bins[column_name] = self.split_categorial(column_name)
            #x_bin = self.split_categorial(column_name)          
            #if mode_output=='one-hot': return pd.get_dummies(x_bin,prefix=self.x[column_name].name,drop_first=True)
            #if mode_output=='normal': return pd.DataFrame(x_bin)
    
     


In [156]:
# Load the sample: ';'-separated text with ',' as the decimal mark.
df = pd.read_csv(
    './data/cc_sample.txt',
    sep=';',
    decimal=',',
)

In [157]:
# Keep only the rows with a known target, split the target off as y, and
# remove the target plus two other columns from the feature frame.
x = df[df['BAD_12_FLAG90'].notnull()].copy()
y = x['BAD_12_FLAG90']
x = x.drop(
    ['BAD_12_FLAG90', 'SCORE_FINAL', 'CONTRACT_SRC_CODE'],
    axis=1,
)
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# Build a scorecard with the default settings and train it on the training
# split, keeping variables whose information value is at least 0.01.
s = Scorecard()
s.fit(x_train,y_train,iv_threshold=0.01)
# Probabilities for the hold-out split; as the last expression of the cell
# the result is what the notebook displays.
s.predict_proba(x_test)

Start excluding correlations on main sample
Dropped columns: ['CNT_TR_PUBL_UTIL_6M', 'CRD_DC_POS_HEALTHCARE_RUB001', 'LBT_ACCT_DEP_MNTH_LST_CLSR_QTY', 'LBT_ACCT_DEP_TOT_BAL_RUB_AMT', 'LBT_ACCT_TOT_BAL_PREV_RUB001', 'RATE_TR_ALL_7D_6M', 'RATE_TR_PAY_L3_6M']
Finish excluding correlations on main sample
Start binning columns...
Binning:  AVG_TERM_FACT
       IV =  0.07753316474188539
Binning:  CMPN_DM_AVAIL_NFLAG
       IV =  0.060266096687580026
Binning:  CMPN_EMAIL_AVAIL_NFLAG
       IV =  0.06827384747065326
Binning:  CMPN_TM_AVAIL_NFLAG
       IV =  0.11146630802930632
Binning:  CNT_AGR_OPEN
       IV =  0.08236774536959401
Binning:  CNT_AGR_WO_ARREAR_TO_CNT
       IV =  0.11051896074491833
Binning:  CNT_OPENED_6M
       IV =  0.10529126096673581
Binning:  CNT_OPENED_6M1Y
       IV =  0.07662821690326752
Binning:  CNT_TR_CARD_TRANS_1M
       IV =  0.08608407449679625
Binning:  CNT_TR_CASH_1M
       IV =  0.16509297779983234
Binning:  CNT_TR_CASH_3M
       IV =  0.19120537887823397
Bin