In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.feature_selection import mutual_info_classif
from category_encoders import MEstimateEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


# 在全局范围内禁止输出 SettingWithCopyWarning 警告
warnings.filterwarnings('ignore')

In [52]:
def set_table_dtypes(df):
    for col in df.columns:
        if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
            df[col] = df[col].astype('int32')  # 使用更小的整数类型
        elif col in ["date_decision"]:
            df[col] = pd.to_datetime(df[col])  # 转换为datetime类型
        elif col[-1] in ("P", "A"):
            df[col] = df[col].astype('float32')  # 使用更小的浮点数类型
        elif col[-1] in ("M",):
            df[col] = df[col].astype('category')  # 使用压缩编码的category类型
        elif df[col].dtype == bool:
            df[col] = df[col].astype('int32')  # 使用更小的整数类型
        elif col[-1] in ("D",):
            df[col] = pd.to_datetime(df[col])  # 转换为datetime类型
       
        # 将所有的float64类型转换为float32类型
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        # 将所有的int64类型转换为int32类型
        elif df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
        gc.collect()

def handle_dates(df):
    df = pd.merge(df, base[['case_id', 'date_decision']], on='case_id', how='left')
    df = pd.merge(df, person1[['case_id', 'birth_259']], on='case_id', how='left')
    
    for col in df.columns:
        if col in ['date_decision', 'birth_259']:
            df[col] = pd.to_datetime(df[col])
           
        if col.endswith("D"):
            # Calculate the difference in days for "date_decision" - col
            new_col_name_1 = col + '_date_diff'
            df[new_col_name_1] = (df["date_decision"] - df[col]).dt.days
            
            # Calculate the difference in days for col - "birth_259"
            new_col_name_2 = col + '_birth_diff'
            df[new_col_name_2] = (df[col] - df["birth_259"]).dt.days

    # Drop the original date columns and original columns ending with "D"
    cols_to_drop = [col for col in df.columns if col.endswith("D")]
    cols_to_drop += ['date_decision', 'birth_259']
    df = df.drop(columns=cols_to_drop)
    
    return df
#outliers
def outliers(df, q=0.0001):
    for col in df.columns:
         if df[col].dtype == 'float32' or df[col].dtype == 'float64':
            lower_bound = df[col].quantile(q)
            upper_bound = df[col].quantile(1 - q)
            df[col]=df[col].clip(lower_bound, upper_bound)
    return df
    

In [54]:
def transforms(df, df_name):
    # 记录要删除的原始列
    cols_to_drop = []
    print('coming...')
    for col in df.columns:
        grouped_series = df.groupby('case_id')[col]
        print('got it！')
        if col not in ["target", "case_id", "WEEK_NUM", 'num_group1', 'num_group2']:
            if col.endswith(("P", "A")):
                df[f"{col}_max"] = grouped_series.transform('max')
                df[f"{col}_median"] = grouped_series.transform('median')
                df[f"{col}_mean"] = grouped_series.transform('mean')
                df[f"{col}_first"] = grouped_series.transform('first')
                df[f"{col}_last"] = grouped_series.transform('last')
                cols_to_drop.append(col)  # 添加要删除的原始列
            elif col.endswith("D"):
                df[f"{col}_max"] = grouped_series.transform('max')
                df[f"{col}_mean"] = grouped_series.transform('mean')
                df[f"{col}_last"] = grouped_series.transform('last')
                cols_to_drop.append(col)  # 添加要删除的原始列
            elif col.endswith("M"):
                df[f"{col}_n"] = grouped_series.transform('nunique')
                df[f"{col}_last"] = grouped_series.transform('last')
                cols_to_drop.append(col)  # 添加要删除的原始列
            elif col.endswith(("T", "L")) and df[col].dtype != 'object' and df[col].dtype != 'category':
                df[f"{col}_max"] = grouped_series.transform('max')
                df[f"{col}_mean"] = grouped_series.transform('mean')
                df[f"{col}_first"] = grouped_series.transform('first')
                df[f"{col}_last"] = grouped_series.transform('last')
                cols_to_drop.append(col)
            elif col.endswith(("T", "L")) and (df[col].dtype == 'object' or df[col].dtype != 'category'):
                df[f"{col}_n"] = grouped_series.transform('nunique')
                df[f"{col}_first"] = grouped_series.transform('first')
                df[f"{col}_last"] = grouped_series.transform('last')
                cols_to_drop.append(col)
        elif 'num_group' in col:
            grouped_series = df.groupby('case_id')[col]
            df[f"{col}_{df_name}_max"] = grouped_series.transform('max')
            df[f"{col}_{df_name}_count"] = grouped_series.transform('count')
            
            cols_to_drop.append(col)  # 添加要删除的原始列

    # 删除要删除的原始列
    df=df.drop(columns=cols_to_drop)
    gc.collect()
    return df

In [109]:
def missing(df):
    for col in df.columns:
        if col not in ["target", "case_id", "WEEK_NUM"]:
            isnull = df[col].isnull().mean()
            if isnull >= 0.7:
                df=df.drop(col,axis=1)
    return df

def sparse(df):
    for col in df.columns:
        if (col not in ["target", "case_id", "WEEK_NUM"]) and (df[col].dtype == 'object' or df[col].dtype == 'category'):
            freq = df[col].nunique()
            if (freq == 1) or (freq >= 500):
                df = df.drop(col, axis=1)
            else:  # 处理低频值
                infrequent_mask = df[col].isin(df[col].value_counts()[df[col].value_counts() < 0.001 * len(df)].index)
                df.loc[infrequent_mask, col] = np.nan
                
                # 计算最常见值的比例
                most_common_value_ratio = df[col].value_counts(normalize=True).max()
                
                # 如果最常见值的比例超过98%，则删除该列
                if most_common_value_ratio >= 0.98:
                    df = df.drop(col, axis=1)
    return df

In [140]:
def read_file(paths, name):
    if isinstance(paths, str):# If only one file path is provided
        data = pd.read_parquet(paths)
        set_table_dtypes(data)
        data=sparse(data)
        data=outliers(data)
        data = handle_dates(data)
        if any('num_group' in col for col in data.columns): # Check if 'num_group' column exists
            print('action! select one file num_group')
            data=transforms(data,name)
            data=data.drop_duplicates(subset=['case_id'])
            print('missing')
            data=sparse(data)
            data=missing(data)
            print('MI Scores...')
            data=miscores_index_all(data,base)
            #data=reduce_usage(data)
            print('finished')
            gc.collect()
        else:
            print('action! select one file non-num_group')
            data=data.drop_duplicates(subset=['case_id']) 
            print('missing')
            data=sparse(data)
            data=missing(data)
            print('MI Scores...')
            data=miscores_index_all(data,base)
            #data=reduce_usage(data)
            print('finished')
            gc.collect()
        return data

    elif isinstance(paths, list):  # If multiple file paths are provided
        data_frames = []
        for path in paths:
            data = pd.read_parquet(path)
            set_table_dtypes(data)
            data=sparse(data)
            data=outliers(data)
            data = handle_dates(data)
            if any('num_group' in col for col in data.columns):  # Check if 'num_group' column exists
                print('action!select multi files num_group')
                data=transforms(data,name)
                data=data.drop_duplicates(subset=['case_id'])
                print('action!append')
                data_frames.append(data)
                del data
                print('finished')
                gc.collect()
            else:
                print('action! select multi files non-num_group')
                data=data.drop_duplicates(subset=['case_id'])
                data_frames.append(data)
                del data
                print('finished')
                gc.collect()
        together = pd.concat(data_frames)  # Concatenate all data frames at once
        del data_frames
        print('missing')
        together=sparse(together)
        together=missing(together)
        print('MI Scores...')
        together=miscores_index_all(together,base)
        return together

In [111]:
def reduce_usage(df):

    for col in df.columns:
        
        col_type = df[col].dtype
        
        # Skip columns of type 'category'
        if col_type == "category":
            continue
        
        # Skip columns of type 'datetime'
        if col_type == "datetime64[ns]":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
            
    return df

In [112]:
def remove_corr(df, threshold=0.8):
    # 筛选出数值列
    nums = df.select_dtypes(exclude=['category', 'object']).columns
    cols_to_skip = ["target", "case_id", "WEEK_NUM"]
    
    # 如果没有数值列，则直接返回原始 DataFrame
    if len(nums) == 0:
        return df
    
    # 选取数值列构建 DataFrame
    df2 = df[nums]
    
    # 计算相关性矩阵
    corr_matrix = df2.corr().abs()
    
    cols_to_remove = set(corr_matrix.columns[corr_matrix.isnull().all()])
    
    # 删除相关系数矩阵中全是 NaN 值的行和列
    corr_matrix = corr_matrix.dropna(axis=0, how='all').dropna(axis=1, how='all')
    

    # 生成上三角矩阵
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    for col in upper_triangle.columns:
        if col in cols_to_skip:
            continue
        # 计算相关列
        correlated_cols = upper_triangle.index[upper_triangle[col] > threshold].tolist()
        for correlated_col in correlated_cols:
            # 对比缺失值数量
            if df[correlated_col].isnull().sum() <= df[col].isnull().sum():
                cols_to_remove.add(col)
            else:
                cols_to_remove.add(correlated_col)
    
    # 构建保留列的列表
    cols_to_keep = [col for col in df.columns if col not in cols_to_remove]
    
    # 返回删除相关列后的 DataFrame
    df = df[cols_to_keep]
    
    return df

In [113]:
#MI Scores
def mi_scores(X,y):
    for col in X.columns:
        if X[col].dtype in ['category','object']:
            X[col],_=X[col].factorize()
    discrete_features=X.dtypes==int
    mi_scores=mutual_info_classif(X,y,discrete_features=discrete_features)
    mi_scores=pd.DataFrame({'MI Scores':mi_scores},index=X.columns)
    mi_scores=mi_scores.sort_values(by='MI Scores',ascending=False)
    return mi_scores
def miscores_index_all(df,base):
    test=df.dropna()
    test=pd.merge(test,base[['case_id','target']],on='case_id',how='left')
    X=test.sample(frac=0.5)
    y=X.pop('target')
    keep_cols=mi_scores(X,y).query('`MI Scores`>0.0001').index.tolist()
    if 'case_id' not in keep_cols:
        keep_cols.append('case_id')
    df=df[keep_cols]
    return df

In [114]:
cols=[]

In [115]:
#选它
base=pd.read_parquet('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_base.parquet')

In [116]:
set_table_dtypes(base)

In [117]:
base['month_decision']=base['date_decision'].dt.month
base['weekday_decision']=base['date_decision'].dt.weekday
base['day_decision']=base['date_decision'].dt.day
base['day_of_year'] = base['date_decision'].dt.dayofyear

In [118]:
cols.append(base.columns)

In [119]:
#选它
person1=pd.read_parquet('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_person_1.parquet')
set_table_dtypes(person1)

In [120]:
person1=person1[person1['num_group1']==0]

In [121]:
person1=missing(person1)
person1=sparse(person1)

In [122]:
person1=outliers(person1)

In [79]:
#没有用
def person1_date(df):
    df = pd.merge(df, base[['case_id', 'date_decision']], on='case_id', how='left') 
    for col in df.columns:
        if col in ['date_decision']:
            df[col] = pd.to_datetime(df[col])

        if col.endswith("D"):
            # Calculate the difference in days for "date_decision" - col
            new_col_name_1 = col + '_date_diff'
            df[new_col_name_1] = (df["date_decision"] - df[col]).dt.days

    # Drop the original date columns and original columns ending with "D"
    cols_to_drop = [col for col in df.columns if col.endswith("D")]
    cols_to_drop += ['date_decision',]
    df = df.drop(columns=cols_to_drop)
    return df

In [123]:
person1['incometype_1044T']=person1['incometype_1044T'].replace(['HANDICAPPED_2','HANDICAPPED_3'],'HANDICAPPED')

In [126]:
person1['safeguarantyflag_411L']=person1['safeguarantyflag_411L'].astype('float32')

In [127]:
person1.head()

Unnamed: 0,case_id,birth_259D,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,familystate_447L,incometype_1044T,language1_981M,mainoccupationinc_384A,num_group1,personindex_1023L,persontype_1072L,persontype_792L,role_1084L,safeguarantyflag_411L,sex_738L
0,0,1986-07-01,P97_36_170,2017-09-15,MORE_FIVE,OTHER,P142_57_166,MARRIED,SALARIED_GOVT,P10_39_147,10800.0,0,0.0,1.0,1.0,CL,1.0,F
4,1,1957-08-01,P97_36_170,2008-10-29,MORE_FIVE,OTHER,,DIVORCED,SALARIED_GOVT,P10_39_147,10000.0,0,0.0,1.0,1.0,CL,1.0,M
9,2,1974-12-01,P97_36_170,2010-02-15,MORE_FIVE,OTHER,,MARRIED,EMPLOYED,P10_39_147,14000.0,0,0.0,1.0,1.0,EM,1.0,F
14,3,1993-08-01,P33_146_175,2018-05-15,MORE_FIVE,OTHER,P131_33_167,MARRIED,EMPLOYED,P10_39_147,10000.0,0,0.0,1.0,1.0,CL,1.0,F
17,4,1994-01-01,P33_146_175,2014-12-15,MORE_FIVE,OTHER,P62_144_102,MARRIED,EMPLOYED,P10_39_147,24000.0,0,0.0,1.0,1.0,CL,1.0,F


In [None]:
person1=miscores_index_all(person1,base)

In [128]:
person1=person1.rename(columns={'birth_259D':'birth_259'})

In [129]:
person1.head()

Unnamed: 0,case_id,birth_259,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,familystate_447L,incometype_1044T,language1_981M,mainoccupationinc_384A,num_group1,personindex_1023L,persontype_1072L,persontype_792L,role_1084L,safeguarantyflag_411L,sex_738L
0,0,1986-07-01,P97_36_170,2017-09-15,MORE_FIVE,OTHER,P142_57_166,MARRIED,SALARIED_GOVT,P10_39_147,10800.0,0,0.0,1.0,1.0,CL,1.0,F
4,1,1957-08-01,P97_36_170,2008-10-29,MORE_FIVE,OTHER,,DIVORCED,SALARIED_GOVT,P10_39_147,10000.0,0,0.0,1.0,1.0,CL,1.0,M
9,2,1974-12-01,P97_36_170,2010-02-15,MORE_FIVE,OTHER,,MARRIED,EMPLOYED,P10_39_147,14000.0,0,0.0,1.0,1.0,EM,1.0,F
14,3,1993-08-01,P33_146_175,2018-05-15,MORE_FIVE,OTHER,P131_33_167,MARRIED,EMPLOYED,P10_39_147,10000.0,0,0.0,1.0,1.0,CL,1.0,F
17,4,1994-01-01,P33_146_175,2014-12-15,MORE_FIVE,OTHER,P62_144_102,MARRIED,EMPLOYED,P10_39_147,24000.0,0,0.0,1.0,1.0,CL,1.0,F


In [130]:
cols.append(person1.columns)

In [None]:
#person1第二种方式

In [None]:
person1=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_person_1.parquet','person1')

In [94]:
person1.shape

(1526659, 20)

In [None]:
cols.append(person1.columns)

In [131]:
bureaub21=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_b_1.parquet','bureaub21')

action! select one file num_group
coming...
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
missing
finished


In [132]:
bureaub21.shape

(36500, 163)

In [133]:
cols.append(bureaub21.columns)

In [134]:
bureaub22=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_b_2.parquet','bureaub22')

action! select one file num_group
coming...
got it！
got it！
got it！
got it！
got it！
got it！
got it！
missing
finished


In [135]:
bureaub22.shape

(36447, 17)

In [136]:
bureaub22.head()

Unnamed: 0,case_id,pmts_date_1107D_date_diff,pmts_date_1107D_birth_diff,num_group1_bureaub22_max,num_group1_bureaub22_count,num_group2_bureaub22_max,num_group2_bureaub22_count,pmts_dpdvalue_108P_max,pmts_dpdvalue_108P_median,pmts_dpdvalue_108P_mean,pmts_dpdvalue_108P_first,pmts_dpdvalue_108P_last,pmts_pmtsoverdue_635A_max,pmts_pmtsoverdue_635A_median,pmts_pmtsoverdue_635A_mean,pmts_pmtsoverdue_635A_first,pmts_pmtsoverdue_635A_last
0,467,54,23878,2,30,26,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,1445,1097,15994,4,83,35,83,200418.0,0.0,10363.111328,0.0,0.0,7.2,0.0,0.898765,0.0,0.0
113,1934,128,23452,2,79,36,79,342432.0,262886.0,260334.859375,249912.0,275859.0,526.799988,312.299988,312.421631,94.800003,526.799988
192,3159,76,23025,0,3,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195,3208,564,22294,0,15,14,15,1341.0,0.0,89.400002,0.0,0.0,5.2,0.0,0.346667,0.0,0.0


In [137]:
cols.append(bureaub22.columns)

In [138]:
person2=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_person_2.parquet','person2')

action! select one file num_group
coming...
got it！
got it！
got it！
got it！
got it！
got it！
got it！
got it！
missing
finished


In [139]:
person2.shape

(1435105, 8)

In [108]:
person2.head()

Unnamed: 0,case_id,addres_role_871L_n,conts_role_79M_n,conts_role_79M_last,empls_economicalst_849M_n,empls_economicalst_849M_last,num_group1_person2_max,num_group1_person2_count,num_group2_person2_max,num_group2_person2_count,relatedpersons_role_762T_n
0,5,0,1,a55475b1,1,a55475b1,0,1,0,1,0
1,6,2,3,a55475b1,2,a55475b1,1,8,5,8,1
9,7,0,1,a55475b1,1,a55475b1,0,1,0,1,0
10,8,0,1,a55475b1,1,a55475b1,0,1,0,1,0
11,9,0,1,a55475b1,1,a55475b1,0,1,0,1,0


In [None]:
cols.append(person2.columns)

In [None]:
applprev2=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_2.parquet','applprev2')

In [None]:
applprev2.shape

In [None]:
cols.append(applprev2.columns)

In [None]:
debitcard=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_debitcard_1.parquet','debitcard')

In [None]:
debitcard.shape

In [None]:
debitcard.head()

In [None]:
cols.append(debitcard.columns)

In [None]:
deposit=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_deposit_1.parquet','deposit')

In [None]:
deposit.shape

In [None]:
cols.append(deposit.columns)

In [None]:
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_static_0_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_static_0_1.parquet',
             ]
static=read_file(file_paths,'static')

In [None]:
static.shape

In [None]:
cols.append(static.columns)

In [None]:
stativcv0=read_file('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_static_cb_0.parquet','stativcv0')

In [None]:
stativcv0.shape

In [None]:
stativcv0.columns

In [None]:
cols.append(stativcv0.columns)

In [None]:
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_1_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_1_1.parquet',
             ]
applprev1=read_file(file_paths,'applprev1')

In [None]:
applprev1.shape


In [None]:
cols.append(applprev1.columns)

In [None]:
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_1.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_2.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_3.parquet',
             ]
bureau=read_file(file_paths,'bureau')

In [None]:
# bureau定制
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_1.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_2.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_3.parquet',
             ]

columns = ['case_id','refreshdate_3813885D',]
# 定义空 DataFrame 存储最终结果
bureau = pd.DataFrame()

# 逐个处理数据集并合并
for file_path in file_paths:
    # 读取数据集
    data = pd.read_parquet(file_path,columns=columns)
    # 对加载的数据进行一系列处理
    set_table_dtypes(data)
    data=handle_dates(data)
    data=outliers(data)
    data=transforms(data,'bureau')
    data.drop_duplicates(subset=['case_id'],inplace=True)
    gc.collect()
    #miss_sparse(data)
    
    # 将处理后的数据追加到最终结果中
    bureau = pd.concat([bureau, data])
    del data
    gc.collect()

In [None]:
bureau.shape

In [None]:
bureau.head()

In [None]:
cols.append(bureau.columns)

In [None]:
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_1.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_10.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_2.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_3.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_4.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_5.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_6.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_7.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_8.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_9.parquet'
              
             ]
bureau2=read_file(file_paths,'bureau2')

In [None]:
bureau2.shape

In [None]:
cols.append(bureau2.columns)

In [None]:
# 定义文件路径列表
file_paths = ['/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_0.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_1.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_10.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_2.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_3.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_4.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_5.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_6.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_7.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_8.parquet',
              '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_2_9.parquet'
              
             ]
columns = ['case_id','pmts_month_158T',]
# 定义空 DataFrame 存储最终结果
bureau2 = pd.DataFrame()

# 逐个处理数据集并合并
for file_path in file_paths:
    # 读取数据集
    data = pd.read_parquet(file_path,columns=columns)
    # 对加载的数据进行一系列处理
    set_table_dtypes(data)
    data=handle_dates(data)
    data=outliers(data)
    data=transforms(data,'bureau2')  # 为所有数据集设置相同的标识符
    data.drop_duplicates(subset=['case_id'],inplace=True)
    
    # 将处理后的数据追加到最终结果中
    bureau2 = pd.concat([bureau2, data])
    del data
    gc.collect()

In [None]:
bureau2.shape

In [None]:
bureau2.head()

In [None]:
cols.append(bureau2.columns)

In [None]:
#选它
taxa=pd.read_parquet('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_tax_registry_a_1.parquet')
taxb=pd.read_parquet('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_tax_registry_b_1.parquet')
taxc=pd.read_parquet('/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_tax_registry_c_1.parquet')

In [None]:
taxa.columns=['case_id','tax_amount','tax_name','num_group1','tax_dateD']
taxb.columns=['case_id','tax_amount','tax_dateD','tax_name','num_group1']
taxc.columns=['case_id','tax_name','num_group1','tax_amount','tax_dateD']

In [None]:
#选它
tax=pd.concat([taxc,taxa,taxb],axis=0)
del taxa,taxb,taxc

In [None]:
set_table_dtypes(tax)

In [None]:
tax=outliers(tax)

In [None]:

tax=handle_dates(tax)

In [None]:
def transform_cols(df, group,df_name):
    num_cols = df.select_dtypes(include=['int', 'float']).columns
    obj_cols = df.select_dtypes(include=['object', 'category']).columns

    # 处理数值型特征
    for col in num_cols:
        if col not in ["target", "case_id", "WEEK_NUM", 'num_group1', 'num_group2']:
            grouped_series = df.groupby(group)[col]
            df[f"{col}_max"] = grouped_series.transform('max')
            df[f"{col}_median"] = grouped_series.transform('median')
            df[f"{col}_mean"] = grouped_series.transform('mean')
            df[f"{col}_first"] = grouped_series.transform('first')
            df[f"{col}_last"] = grouped_series.transform('last')
            df.drop(columns=[col], inplace=True)  # 删除原始列

    # 处理非数值型特征
    for col in obj_cols:
        if col not in ["target", "case_id", "WEEK_NUM", 'num_group1', 'num_group2']:
            grouped_series = df.groupby(group)[col]
            df[f"{col}_count"] = grouped_series.transform('count')
            df[f"{col}_n"] = grouped_series.transform('nunique')
            df[f"{col}_first"] = grouped_series.transform('first')
            df[f"{col}_last"] = grouped_series.transform('last')
            df.drop(columns=[col], inplace=True)  # 删除原始列

    # 处理包含'num_group'的特征列
    for col in df.columns:
        if 'num_group' in col:
            grouped_series = df.groupby(group)[col]
            df[f"{col}_{df_name}_max"] = grouped_series.transform('max')
            df[f"{col}_{df_name}_count"] = grouped_series.transform('count')
            df.drop(columns=[col],inplace=True)

In [None]:
transform_cols(tax,'case_id','tax')

In [None]:
tax.drop_duplicates(subset=['case_id'],keep='first',inplace=True)

In [None]:
tax.shape

In [None]:
cols.append(tax.columns)

In [None]:
merged_list = [item for sublist in cols for item in sublist]

In [None]:
cols=list(set(merged_list))

In [None]:
len(cols)

In [None]:
del all1

In [None]:
all1=pd.merge(base,person1,on='case_id',how='left')

In [None]:

all1=pd.merge(all1,applprev1,on='case_id',how='left')
all1=pd.merge(all1,static,on='case_id',how='left')

In [None]:
all1=pd.merge(all1,stativcv0,on='case_id',how='left')
all1=pd.merge(all1,tax,on='case_id',how='left')

In [None]:
all1=pd.merge(all1,bureau,on='case_id',how='left')

In [None]:
all1=pd.merge(all1,bureau2,on='case_id',how='left')

In [None]:
all1=pd.merge(all1,bureaub21,on='case_id',how='left')
all1=pd.merge(all1,bureaub22,on='case_id',how='left')


In [None]:
all1=pd.merge(all1,deposit,on='case_id',how='left')
all1=pd.merge(all1,applprev2,on='case_id',how='left')


In [None]:
all1=pd.merge(all1,person2,on='case_id',how='left')
all1=pd.merge(all1,debitcard,on='case_id',how='left')

In [None]:
all1.shape

In [None]:
#目标编码
col=['empladdr_district_926M','incometype_1044T','education_927M','lastst_736L','lastrejectreason_759M','lastrejectreasonclient_4145040M','credtype_322L']
X_encode=all1.sample(frac=0.2,random_state=0)
y_encode=X_encode.pop('target')
X_pretrain=all1.drop(X_encode.index)
y_pretrain=X_pretrain.pop('target')
encoder=MEstimateEncoder(cols=col,m=500)
encoder.fit(X_encode[col],y_encode)
X_pretrain_encoded=encoder.transform(X_pretrain[col])
X_pretrain_encoded.columns = [f'{col}_encoded' for col in X_pretrain_encoded.columns] 
all1 = pd.concat([X_pretrain, X_pretrain_encoded,pd.DataFrame(y_pretrain, columns=['target'])], axis=1)

In [None]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip columns of type 'category'
        if col_type == "category":
            continue
        
        # Skip columns of type 'datetime'
        if col_type == "datetime64[ns]":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
 all1=reduce_mem_usage(all1)

In [3]:
all1.shape

(1526659, 274)

In [2]:
all1=pd.read_csv('/kaggle/input/credic/274-less -newnew.csv')

In [None]:
all1=remove_corr(all1)

In [6]:
all1.shape

(1526659, 368)

In [None]:
all1.select_dtypes(include=['object', 'category']).columns

In [4]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder


onehot_cols = []
ordinal_cols = []
target_cols = []

onehot_encoders = {}
#ordinal_encoders = {}
target_encoders = {}

for col in all1.select_dtypes(include=['object', 'category']).columns:
    unique_count = all1[col].nunique()

    if unique_count < 5:
        onehot_cols.append(col)  # 记录列名
        encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore', dtype=np.int8)
        transformed_data = encoder.fit_transform(all1[[col]])
        all1 = all1.drop(columns=[col])  # 删除原特征列
        all1 = pd.concat([all1, pd.DataFrame(transformed_data, columns=encoder.get_feature_names_out([col]))], axis=1)
        onehot_encoders[col] = encoder

    elif 5 <= unique_count < 50:
        ordinal_cols.append(col)  # 记录列名
        encoder = OrdinalEncoder(handle_unknown='ignore', dtype=np.float32)
        all1[col] = encoder.fit_transform(all1[[col]])

    else:
        target_cols.append(col)  # 记录列名
        encoder = TargetEncoder(smoothing=100)
        all1[col] = encoder.fit_transform(all1[col], all1['target'])
        target_encoders[col] = encoder
print("\nOneHot Encoded Columns:")
print(onehot_cols)
print("\nOrdinal Encoded Columns:")
print(ordinal_cols)
print("\nTarget Encoded Columns:")
print(target_cols)


OneHot Encoded Columns:
['education_927M_first', 'education_927M_last', 'familystate_447L_first', 'familystate_447L_last', 'incometype_1044T_first', 'incometype_1044T_last', 'language1_981M_first', 'language1_981M_last', 'role_1084L_first', 'role_1084L_last', 'sex_738L_first', 'sex_738L_last', 'type_25L_first', 'type_25L_last', 'cancelreason_3545846M_first', 'cancelreason_3545846M_last', 'credtype_587L_first', 'credtype_587L_last', 'education_1138M_first', 'education_1138M_last', 'familystate_726L_first', 'familystate_726L_last', 'inittransactioncode_279L_first', 'inittransactioncode_279L_last', 'isbidproduct_390L_first', 'isbidproduct_390L_last', 'postype_4733339M_first', 'postype_4733339M_last', 'rejectreason_755M_first', 'rejectreason_755M_last', 'rejectreasonclient_4145042M_first', 'rejectreasonclient_4145042M_last', 'status_219L_first', 'status_219L_last', 'credtype_322L', 'disbursementtype_67L', 'inittransactioncode_186L', 'lastapprcommoditycat_1041M', 'lastcancelreason_561M', '

In [5]:
import joblib
joblib.dump(onehot_encoders, 'onehot_encoders.pkl')
#joblib.dump(ordinal_encoders, 'ordinal_encoders.pkl')
joblib.dump(target_encoders, 'target_encoders.pkl')

['target_encoders.pkl']

In [None]:
(all1.isnull().sum(axis=1)>20).sum()

In [None]:
all1.to_csv('274-less.csv',index=False)

In [None]:
all1.columns.tolist()

In [9]:
all1.select_dtypes(['object','category']).columns

Index([], dtype='object')

In [None]:
for col in all1.columns:
    if all1[col].dtype in ['category','object']:
        all1[col],_=all1[col].factorize()
#all1=change32(all1)

In [10]:
all1.shape

(1526659, 368)

In [None]:
#diy一个评分标准
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
def stability_metric(y_true,y_pred):
    gini_in_time=[]
    unique_weeks=np.unique(X_test['WEEK_NUM'])
    for week_num in unique_weeks:
        week_indices=np.where(X_test['WEEK_NUM']==week_num)[0]
        y_true_week=y_true.iloc[week_indices]
        y_pred_week=y_pred.iloc[week_indices]
        if (y_true_week == 0).all():
            continue
        gini = 2 * roc_auc_score(y_true_week, y_pred_week) - 1
        gini_in_time.append(gini)
   
    #fit a linear regression
    x=np.arange(len(gini_in_time))
    y=gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a * x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    #final
    stability=avg_gini+88*min(0,a)-0.5*res_std
    return stability
stability_scorer=make_scorer(stability_metric,greater_is_better=True)

In [None]:
import wandb
wandb.init(project='credit',
           name='',
           #config=params,\
           notes='')

In [None]:
category_cols=['incometype_1044T_last', 'incometype_1044T_first', 'sex_738L_last',
       'sex_738L_first', 'empladdr_district_926M_first',
       'education_927M_first', 'language1_981M_first', 'language1_981M_last',
       'role_1084L_first', 'type_25L_last', 'role_1084L_last',
       'empladdr_zipcode_114M_last', 'education_927M_last',
       'empladdr_district_926M_last', 'type_25L_first', 'status_219L_last',
       'cancelreason_3545846M_last', 'rejectreason_755M_last',
       'rejectreasonclient_4145042M_last', 'status_219L_first',
       'cancelreason_3545846M_first', 'rejectreason_755M_first',
       'rejectreasonclient_4145042M_first', 'education_1138M_first',
       'education_1138M_last', 'inittransactioncode_279L_last',
       'credtype_587L_last', 'familystate_726L_first', 'familystate_726L_last',
       'postype_4733339M_last', 'inittransactioncode_279L_first',
       'credtype_587L_first', 'postype_4733339M_first',
       'lastrejectreason_759M', 'lastst_736L', 'lastcancelreason_561M',
       'lastrejectreasonclient_4145040M', 'previouscontdistrict_112M',
       'lastapprcommoditycat_1041M', 'lastrejectcommoditycat_161M',
       'lastapprcommoditytypec_5251766M', 'opencred_647L',
       'lastrejectcommodtypec_5251769M', 'credtype_322L',
       'inittransactioncode_186L', 'disbursementtype_67L',
       'twobodfilling_608L', 'education_1103M', 'description_5085714M',
       'maritalst_385M', 'maritalst_893M', 'education_88M', 'tax_name_first',
       'tax_name_last', 'contractst_516M_first', 'credor_3940957M_first',
       'contractst_516M_last', 'periodicityofpmts_997M_first',
       'purposeofcred_722M_first', 'credor_3940957M_last',
       'classificationofcontr_1114M_first', 'contracttype_653M_first',
       'subjectrole_326M_last', 'pmtmethod_731M_first', 'pmtmethod_731M_last',
       'subjectrole_43M_first', 'subjectrole_326M_first',
       'classificationofcontr_1114M_last', 'purposeofcred_722M_last',
       'contracttype_653M_last', 'subjectrole_43M_last',
       'conts_type_509L_last', 'conts_type_509L_first',
       'cacccardblochreas_147M_first', 'addres_district_368M_first',
       'addres_district_368M_last', 'empls_economicalst_849M_first',
       'conts_role_79M_last']

In [None]:
df_train=all1.iloc[:100000]
df_valid=all1.iloc[700000:1000000]
#df_test=all1.iloc[800000:900000]

y_train=df_train['target']
X_train=df_train.drop(columns=['target','case_id','date_decision'])
y_valid=df_valid['target']
X_valid=df_valid.drop(columns=['target','case_id','date_decision'])
#y_test=df_test['target']
#X_test=df_test.drop(columns=['target','case_id','date_decision'])

X_train[category_cols] = X_train[category_cols].astype('category')
X_valid[category_cols] = X_valid[category_cols].astype('category')
#X_test[category_cols] = X_test[category_cols].astype('category')

In [6]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,  
    "learning_rate": 0.05,
    'subsample_freq':9,
    'subsample':0.65,
    #0.8144028756010205
    "num_boost_round":3000, 
    "colsample_bytree": 0.625,
    "colsample_bynode": 0.875,
    "random_state": 42,
    "reg_alpha": 7.35,
    "reg_lambda": 6.55,
    "extra_trees":True,
    'num_leaves':55, 
    "verbose": -1,
    'early_stopping_round':100,
    'min_child_samples':4928,
    #'is_unbalalce':True,
    'scale_pos_weight':30,
    'device':'cpu',
    #'min_child_weight':0.06703326435611615,
    #'min_split_gain':0.21805530921304175,
    #后面是dart的参数
    #'drop_rate':0.1,
    #'skip_drop':0.5,
    #'uniform_drop':False,
    #'xgboost_dart_mode':False,
   
}



In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import make_scorer,roc_auc_score

In [None]:
#optuna超参数调优
from optuna.samplers import RandomSampler
import optuna
def objective(trial):
    params={
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": trial.suggest_int("max_depth", 5,10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01,0.05,step=0.005),
        "subsample_freq": trial.suggest_int("subsample_freq", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5,0.8,step=0.01),
        "num_boost_round": 3500,
        #'num_boost_round': trial.suggest_int('num_boost_roung',2000,4500),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9,step=0.005),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 0.9,step=0.005),
        "random_state": 42,
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 20,step=0.01),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 35,step=0.01),
        "extra_trees": True,
        "num_leaves": trial.suggest_int("num_leaves", 30, 60),
        "verbose": -1,
        "min_child_samples": trial.suggest_int("min_child_samples", 1000, 8000),
        'scale_pos_weight':30,
        #'is_unbalalce':True,
        #"min_child_weight": trial.suggest_float("min_child_weight", 0.1, 0.15),
        #'min_split_gain':trial.suggest_float('min_split_gain', 0.1, 0.5),
        'device':'cpu'
    }
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=category_cols)
    model=lgb.LGBMClassifier(**params)
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)],
              )
    y_pred = model.predict_proba(X_test)[:, 1]
    y_pred=pd.Series(y_pred,index=y_test.index)
    score=stability_metric(y_test,y_pred)
    wandb.log({'score': score,'auc':model.evals_result_['valid_0']['auc'][-1]})
    return score
pruner=optuna.pruners.MedianPruner(
    n_startup_trials=5, n_warmup_steps=500, interval_steps=1)
study=optuna.create_study(direction='maximize',pruner=pruner)
study.optimize(objective,n_trials=400,)
print('bset trial:')
best_trial=study.best_trial
print('Value:{}'.format(best_trial.value))
print('Params:')
for key,value in best_trial.params.items():
    print('{}:{}'.format(key,value))

In [None]:
df_train=all1.iloc[900000:]
#df_train[category_cols] = df_train[category_cols].astype('category')

In [None]:
for col in category_cols:
    df_train[col] = df_train[col].cat.add_categories('unknown')
df_train[category_cols] = df_train[category_cols].fillna('unknown')

In [11]:
#分层分组抽样
y= all1["target"]
weeks = all1["WEEK_NUM"]
df_train= all1.drop(columns=["target", "case_id", "WEEK_NUM",
                             #'date_decision'
                            ])
cv = StratifiedGroupKFold(n_splits=10, shuffle=False)

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

stability_results_cat = []
stability_results_lgb = []

feature_importance_lgb = []

fold = 0
for idx_train, idx_valid in cv.split(df_train, y,groups=weeks):
    
    df_res_cat = pd.DataFrame()
    df_res_lgb = pd.DataFrame()
    
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid= df_train.iloc[idx_valid], y.iloc[idx_valid]
    
    #df_res_cat['WEEK_NUM'] = list(week_valid)
    df_res_cat['target'] = list(y_valid)
    df_res_cat['fold'] = fold
    
    
    #df_res_lgb['WEEK_NUM'] = list(week_valid)
    df_res_lgb['target'] = list(y_valid)
    df_res_lgb['fold'] = fold
    
    fold += 1
    
    
    train_pool = Pool(X_train, y_train,
                      #cat_features=category_cols
                     )
    val_pool = Pool(X_valid, y_valid,
                    #cat_features=category_cols
                   )
    
    clf = CatBoostClassifier(eval_metric='AUC', task_type='CPU', learning_rate=0.03, iterations=1000,
                             class_weights=[1,30]
                            )
    
    random_seed=3107
    
    clf.fit(train_pool, eval_set=val_pool,verbose=100)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)
    
    df_res_cat['score'] = list(y_pred_valid)
    
    
    train_data = lgb.Dataset(X_train, label=y_train,)
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,y_train,
        #categorical_feature=category_cols,
        eval_set = [(X_valid, y_valid)],
        #callbacks=[wandb_callback()]
        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)] 
    )
    
    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    #auc指标
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    #特征重要性
    fold_importance = model.feature_importances_
    feature_importance_lgb.append(fold_importance)
    
    
    df_res_lgb['score'] = list(y_pred_valid)
    
    #stability_results_cat.append(df_res_cat)
    stability_results_lgb.append(df_res_lgb)

#计算特征重要性
avg_feature_importance_lgb = np.mean(feature_importance_lgb, axis=0)
sorted_idx = np.argsort(avg_feature_importance_lgb)[::-1]
print('Feature importance ranking')
for i, idx in enumerate(sorted_idx):
    print(f"{i + 1}. Feature '{df_train.columns[idx]}' - Importance: {avg_feature_importance_lgb[idx]}")

    
    
print("Catboost CV AUC scores: ", cv_scores_cat)
print("Maximum Catboost CV AUC score: ", max(cv_scores_cat))


print("Lightgbm CV AUC scores: ", cv_scores_lgb)
print("Maximum Lightgbm CV AUC score: ", max(cv_scores_lgb))

0:	test: 0.8259318	best: 0.8259318 (0)	total: 1.07s	remaining: 17m 45s
100:	test: 0.8825763	best: 0.8825763 (100)	total: 1m 39s	remaining: 14m 43s
200:	test: 0.8918033	best: 0.8918033 (200)	total: 3m 20s	remaining: 13m 15s
300:	test: 0.8958618	best: 0.8958618 (300)	total: 4m 55s	remaining: 11m 25s
400:	test: 0.8984672	best: 0.8984672 (400)	total: 6m 32s	remaining: 9m 46s
500:	test: 0.9007896	best: 0.9007896 (500)	total: 8m 10s	remaining: 8m 8s
600:	test: 0.9026180	best: 0.9026180 (600)	total: 9m 47s	remaining: 6m 29s
700:	test: 0.9039527	best: 0.9039527 (700)	total: 11m 22s	remaining: 4m 51s
800:	test: 0.9047105	best: 0.9047105 (800)	total: 12m 55s	remaining: 3m 12s
900:	test: 0.9054245	best: 0.9054245 (900)	total: 14m 26s	remaining: 1m 35s
999:	test: 0.9059423	best: 0.9059423 (997)	total: 15m 58s	remaining: 0us

bestTest = 0.9059423304
bestIteration = 997

Shrink model to first 998 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.893763


In [12]:
import joblib
joblib.dump(fitted_models_lgb,'model62-lgb.pkl')
joblib.dump(fitted_models_cat,'model62-cat.pkl')

['model62-cat.pkl']

In [None]:
all1.shape

In [None]:
all1.dtypes