In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.stats import ks_2samp
import pandas as pd
import numpy as np

class SingleModel(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.1):
        self.threshold = threshold

    def model(self, X, y, ft):
        lr = LogisticRegression()
        lr.fit(X, y)
        yhat = lr.predict(X)
        prob_hat = lr.predict_proba(X)[:, 1]
        return lr, yhat, prob_hat

    def fit(self, X, y):
        self.feats = X.columns.tolist()
        self.cat_feats = X.select_dtypes(include=['object']).columns.tolist()
        self.num_feats = list(set(self.feats) - set(self.cat_feats))        

        t_values = []
        for ft in self.feats:
            _, _, prob_hat = self.model(X[[ft]], y, ft)
            values = {'variable': [ft]}
            values.update(self.calculate_values(y, prob_hat))
            t_values.append(pd.DataFrame(values))

        self.t_values = pd.concat(t_values, axis=0)
        self.selected_features = self.t_values[self.t_values['value'] >= self.threshold].variable
        return self

    def transform(self, X):
       return X[self.selected_features]
        
    def get_feature_names_in(self):
        return self.feats

    def get_feature_names_out(self):
        return self.selected_features.tolist()

class GiniSelector(SingleModel):
    def calculate_values(self, y, prob_hat):        
        gini = roc_auc_score(y, prob_hat) * 2 - 1
        return {'gini': [gini], 'value': [gini]}

select t1.*, t2.default_flag 
into ##abc
from ews..ews_khcn_portfolio_score_store t1
left join IFRS9_PROVISIONAL..FIRST_TO_DEFAULT t2
on t1.process_date = t2.PROCESS_DATE
and t1.customer_id = t2.CUSTOMER_ID
where t1.model = 'B-score'
and t1.process_date between '2020-01-31' and '2024-02-29'
and t1.portfolio = 'secured'

In [13]:
from sqlalchemy import create_engine, text
from urllib.parse import quote
def get_data_from_sql_server2():
    server = 'VM-DC-JUMPSRV77\IFRS9'
    database = 'EWS'
    user = 'rdm_admin'
    pw = '2024#tpb'
    driver = '{SQL Server}'
    cnxn_str = f"DRIVER={driver};SERVER={server};DATABASE={database};UID={user};PWD={pw}"
    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={quote(cnxn_str)}")
    # query
    query = "select * from ##abc"
    df = pd.read_sql(query, engine)
    return df
df = get_data_from_sql_server2()
target_column = 'default_flag'
quantitative_columns = [
      'dpd_l6m'
      ,'dpd_l3m'
      ,'min_ca_bal_l6m'
      ,'avg_total_num_txn_l6m'
      ,'avg_total_num_txn'
      ,'avg_total_num_txn_l3m'
      ,'avg_cr_txn_amt_vs_os_l6m'
      ,'avg_txn_amt_os_onbs_l3m'
      ,'vol_os_l6m'
      ,'remain_term_autoloan'
      ,'max_os_onbs_l6m'
      ,'avg_bal_onbs_l6m'
      ,'remain_term'
      ,'total_bal_avg'
      ,'ca_bal_os_onbs_avg'
      ,'total_num_txn_l6m'
      ,'avg_bal_l3m'
      ,'avg_bal_l6m'
      ,'ca_bal_os_onbs_l3m_avg'
      ,'bal_l6m'
      ,'os_amt_long_curr'
      ,'total_num_txn_l3m'
      ,'max_net_flow_txn_amt_l3m'
      ,'ca_bal_l3m'
      ,'max_dpd_loan'
      ,'total_num_txn'
      ,'os_revl_l3m'
      ,'os_revl'
      ,'ca_bal'
      ,'min_day_to_matdt'
      ,'num_cr_product_l3m'
      ,'ca_td_bal_legacy'
     ]

In [14]:
def f_impute_values(missing_df):
    # Dictionary with feature names as keys and impute values as values
    missing_df = missing_df.copy()
    
    impute_dict = {
        "avg_bal_l3m": -652, 
        "avg_bal_onbs_l6m": -494, 
        "avg_cr_txn_amt_vs_os_l6m": -1243033176, 
        "avg_total_num_txn_l3m": -4089467647,
        "avg_total_num_txn_l6m": -17148608116, 
        "dpd_l3m": -8174489344, 
        "dpd_l6m": -1533.934551, 
        "max_dpd_loan": -766.8302552, 
        "max_os_onbs_l6m": -1.999999823,
        "min_day_to_matdt": -116.5666667, 
        "num_cr_product_l3m": -2.58409E+15, 
        "os_amt_long_curr": -56.90701563, 
        "os_revl": -305, 
        "os_revl_l3m": -75873315018, 
        "remain_term": -16.95155584, 
        "remain_term_autoloan": -3376, 
        "total_num_txn_l6m": -57175100683, 
        "vol_os_l6m": -41778658078, 
        "avg_bal_l6m": -11.55096707,
        "avg_total_num_txn": -2.19432E+11, 
        "avg_txn_amt_os_onbs_l3m": -24722594949, 
        "bal_l6m": -2551, 
        "ca_bal": -18447803048, 
        "ca_bal_l3m": -5924312015, 
        "ca_bal_os_onbs_avg": -88, 
        "ca_bal_os_onbs_l3m_avg": -664, 
        "ca_td_bal_legacy": -1.63506E+15,
        "max_net_flow_txn_amt_l3m": -7.08974E+14, 
        "min_ca_bal_l6m": -17770563461, 
        "total_bal_avg": -3468, 
        "total_num_txn": -9, 
        "total_num_txn_l3m": -10171618444
    }

    # Impute missing values in missing_df based on the impute_dict
    missing_df.fillna(value=impute_dict, inplace=True)

    # Select the desired columns in the specified order
    missing_df = missing_df[list(impute_dict.keys())]

    return missing_df

In [17]:
def calculate_gini_by_process_date(df, target_column, quantitative_columns, threshold=0.01):
    result = []

    for process_date, group_df in df.groupby('process_date'):
        # Drop groups with insufficient data
        if len(group_df[target_column].unique()) < 2:
            continue

        group_df = group_df.dropna(subset=[target_column])
        X = group_df[quantitative_columns]
        y = group_df[target_column]
        X = f_impute_values(X)

        obj = GiniSelector(threshold)
        try:
            selected_features = obj.fit_transform(X, y)
            t_values = obj.t_values
            t_values['process_date'] = process_date
            result.append(t_values[['process_date', 'variable', 'gini']])
        except Exception as e:
            print(f"Error for process_date {process_date}: {str(e)}")

    result_df = pd.concat(result, ignore_index=True)
    return result_df
# Calculate Gini coefficients for each variable by process_date
gini_results = calculate_gini_by_process_date(df, target_column, quantitative_columns)
print(gini_results)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stab

Error for process_date 2023-03-31: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-04-30: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-05-31: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-06-30: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-07-31: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-08-31: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-09-30: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0
Error for process_date 2023-10-31:

In [18]:
gini_results

Unnamed: 0,process_date,variable,gini
0,2020-12-31,avg_bal_l3m,0.564353
1,2020-12-31,avg_bal_onbs_l6m,0.553746
2,2020-12-31,avg_cr_txn_amt_vs_os_l6m,0.177014
3,2020-12-31,avg_total_num_txn_l3m,0.345571
4,2020-12-31,avg_total_num_txn_l6m,0.333022
...,...,...,...
859,2023-02-28,max_net_flow_txn_amt_l3m,0.000000
860,2023-02-28,min_ca_bal_l6m,-0.261859
861,2023-02-28,total_bal_avg,0.444414
862,2023-02-28,total_num_txn,-0.109286


In [19]:
gini_results.to_excel('Gini biến oot_v2.xlsx', index= False)