## Feature LGB

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import gc
import lightgbm as lgb
import os
import warnings
import torch

warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("Lynn/p14_train_fe_hsp.csv")
train['istrain'] = 1
test = pd.read_csv("Lynn/p14_test_fe_hsp.csv")
test['istrain'] = 0
df = pd.concat([train, test]).reset_index(drop=True)

X_name = df.columns.tolist()[:-7]
Y_name = [f'property {i}' for i in range(1,7)]

poolx = df[X_name].copy(deep=True)
pooly = df[Y_name].copy(deep=True)

X = poolx.iloc[df[df['istrain']==1].index, :]
Y = pooly.iloc[df[df['istrain']==1].index, :]

X_test = poolx.iloc[df[df['istrain']==0].index, :].reset_index(drop=True)
Y_test = pooly.iloc[df[df['istrain']==0].index, :].reset_index(drop=True)

pred = torch.zeros((X_test.shape[0], 6))
print(X.shape, X_test.shape)


(1372, 145) (588, 145)


In [3]:
X.shape, X_test.shape

((1372, 145), (588, 145))

In [92]:
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, r2_score

tar_fea = Y_name
use_fea = X.columns.tolist()

best_models = {}
for tar in tar_fea:
    best_models[tar] = []


params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'regression',  # 目标函数
    'metric': {'l1', 'l2',},  # 评估函数
    'num_leaves': 15,  # 叶子节点数
    'learning_rate': 0.04,  # 学习速率
    'max_depth': 4,
    'feature_fraction': 0.8,  # 建树的特征选择比例
    'bagging_fraction': 0.8,  # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': -1,
    'num_boost_round': 110,
    'early_stopping_rounds': 20,
      # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}

imps = []
train_results = []
val_results = []
true_labels = []
test_results = []

for target in tar_fea:
    imp = pd.DataFrame()
    imp['feat'] = use_fea
    kf = KFold(n_splits=3, random_state=2022, shuffle=True)
    oof_real_train = np.zeros(X.shape[0])
    oof_train = np.zeros(X.shape[0])
    pred = np.zeros(X_test.shape[0])
    temp_Y = Y[target]
    for now_fold, (trn_idx, val_idx) in enumerate(kf.split(X, temp_Y)):
        print(f"target: {target}, Fold: {now_fold+1}")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = temp_Y.iloc[trn_idx], temp_Y.iloc[val_idx]
    
        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train) 
    
        gbm = lgb.train(params, lgb_train,  valid_sets=(lgb_train, lgb_val), 
                         verbose_eval=200)

        oof_train[val_idx] = gbm.predict(X_val)
        oof_real_train[trn_idx] += gbm.predict(X_train) / 2
        pred += gbm.predict(X_test) / 3
        imp[target + '_gain_' + str(now_fold + 1)] = gbm.feature_importance(importance_type='gain')
        imp[target + '_split_' + str(now_fold + 1)] = gbm.feature_importance(importance_type='split')
        fea = target.replace(' ', '_')
        #gbm.save_model(f'final_lgb_models_3fold/{fea}_fold{now_fold+1}_model.txt')
        del gbm, lgb_train, lgb_val; gc.collect()
        imps.append(imp)
    val_results.append(oof_train)
    true_labels.append(temp_Y)
    test_results.append(pred)
    train_results.append(oof_real_train)
    break
    

target: property 6, Fold: 1
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[110]	training's l1: 0.368274	training's l2: 0.256819	valid_1's l1: 0.494793	valid_1's l2: 0.445679
target: property 6, Fold: 2
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[110]	training's l1: 0.373584	training's l2: 0.264347	valid_1's l1: 0.476368	valid_1's l2: 0.426989
target: property 6, Fold: 3
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[110]	training's l1: 0.377509	training's l2: 0.271243	valid_1's l1: 0.452231	valid_1's l2: 0.384498


In [11]:
print("train")


for i in range(len(train_results)):
    train_result = train_results[i]
    true_label = true_labels[i]
    mae = mean_absolute_error(true_label, train_result)
    r2 = r2_score(true_label, train_result)
    rmse = np.sqrt(((true_label - train_result) ** 2).mean())
    pccs = pearsonr(true_label, train_result)[0]
    mae, r2, rmse, pccs = np.around([mae, r2, rmse, pccs], decimals=4)
    print(f'property {i+1}: mae:{mae}, rmse:{rmse}, r2:{r2}, pccs:{pccs}')


print("valid")

for i in range(len(val_results)):
    
    val_result = val_results[i]
    true_label = true_labels[i]
    mae = mean_absolute_error(true_label, val_result)
    r2 = r2_score(true_label, val_result)
    rmse = np.sqrt(((true_label - val_result) ** 2).mean())
    pccs = pearsonr(true_label, val_result)[0]
    mae, r2, rmse, pccs = np.around([mae, r2, rmse, pccs], decimals=4)
    print(f'property {i+1}: mae:{mae}, rmse:{rmse}, r2:{r2}, pccs:{pccs}')

print("test")
for i in range(len(test_results)):
    
    test_result = test_results[i]
    true_label = Y_test.iloc[:,i]
    mae = mean_absolute_error(true_label, test_result)
    r2 = r2_score(true_label, test_result)
    rmse = np.sqrt(((true_label - test_result) ** 2).mean())
    pccs = pearsonr(true_label, test_result)[0]
    mae, r2, rmse, pccs = np.around([mae, r2, rmse, pccs], decimals=4)
    print(f'property {i+1}: mae:{mae}, rmse:{rmse}, r2:{r2}, pccs:{pccs}')

train
property 1: mae:0.166, rmse:1.5502, r2:0.2332, pccs:0.557
property 2: mae:0.004, rmse:0.0109, r2:0.9998, pccs:0.9999
property 3: mae:0.0044, rmse:0.0166, r2:0.9996, pccs:0.9998
property 4: mae:0.0042, rmse:0.0091, r2:0.9999, pccs:1.0
property 5: mae:0.1725, rmse:1.5196, r2:0.1366, pccs:0.4836
property 6: mae:0.2929, rmse:0.4147, r2:0.8165, pccs:0.9107
valid
property 1: mae:0.1774, rmse:1.544, r2:0.2394, pccs:0.5587
property 2: mae:0.0099, rmse:0.0238, r2:0.9993, pccs:0.9996
property 3: mae:0.0084, rmse:0.0307, r2:0.9986, pccs:0.9993
property 4: mae:0.0103, rmse:0.0226, r2:0.9994, pccs:0.9997
property 5: mae:0.1851, rmse:1.5779, r2:0.0691, pccs:0.2913
property 6: mae:0.4539, rmse:0.6312, r2:0.5749, pccs:0.7583
test
property 1: mae:0.0888, rmse:0.173, r2:0.7433, pccs:0.9351
property 2: mae:0.0082, rmse:0.0193, r2:0.9996, pccs:0.9998
property 3: mae:0.0083, rmse:0.0357, r2:0.9978, pccs:0.9989
property 4: mae:0.0081, rmse:0.0171, r2:0.9997, pccs:0.9998
property 5: mae:0.0844, rmse:0.