## Feature LGB

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import gc
import lightgbm as lgb
import os
import warnings
import torch
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, r2_score


warnings.filterwarnings("ignore")

In [3]:
def get_feature(df):
    """Build the engineered feature frame used by the LightGBM models.

    The last six columns of ``df`` are treated as targets; every column
    before them is treated as an input feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input feature columns followed by six target columns.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation, sharing the index of ``df``, of:
        the input features with NaN replaced by -1, a ``'mixing entropy'``
        column (row-wise sum of ``x * ln(x)``), every pairwise product of
        two distinct input features, the same products scaled by the
        entropy column (prefixed ``'phs_'``), and the six untouched targets.
    """
    X, Y = df.iloc[:, :-6], df.iloc[:, -6:]
    # Missing inputs are encoded with the sentinel value -1.
    X = X.fillna(-1)

    # x * ln(x) is NaN for the -1 sentinel (log of a negative number) and for
    # exact zeros (0 * -inf); both are zeroed so they add nothing to the sum.
    S = (X * np.log(X)).fillna(0)
    s = S.sum(axis=1).to_frame(name='mixing entropy')

    # With degree 2, interaction_only=True and no bias column, the transform
    # emits the original columns first and the pairwise products after them,
    # so everything past the first `n_base` columns is an interaction term.
    n_base = X.shape[1]
    poly = PolynomialFeatures(include_bias=False, interaction_only=True)
    H = poly.fit_transform(X)

    input_names = X.columns.tolist()
    if hasattr(poly, 'get_feature_names_out'):
        # scikit-learn >= 1.0; `get_feature_names` was removed in 1.2.
        all_names = list(poly.get_feature_names_out(input_names))
    else:
        all_names = list(poly.get_feature_names(input_names))
    new_col_names = all_names[n_base:]

    h = pd.DataFrame(data=H[:, n_base:], columns=new_col_names, index=X.index)
    # Scale every interaction term by the row's entropy value.
    phs = h.multiply(s.values, axis=0).add_prefix('phs_')
    final_df = pd.concat([X, s, h, phs, Y], axis=1)

    return final_df


In [4]:
# Read the raw train/test splits, then derive the engineered frames from them.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('missing_data/mis_train.csv', 'missing_data/mis_test.csv')
)

df_train, df_test = (get_feature(raw_frame) for raw_frame in (train, test))

In [5]:

# Display the engineered training frame returned by get_feature.
df_train

Unnamed: 0,element 1,element 2,element 3,element 4,element 5,element 6,element 7,element 8,element 9,element 10,...,phs_element 9 element 12,phs_element 10 element 11,phs_element 10 element 12,phs_element 11 element 12,property 1,property 2,property 3,property 4,property 5,property 6
0,0.673956,0.174380,0.018852,0.122537,0.009426,0.000000,0.000424,0.000283,0.000141,0.000000,...,-0.000000e+00,-0.000000,-0.000000,-0.000000e+00,0.317895,0.589088,0.752391,0.328302,-0.206129,0.442830
1,0.010422,0.000000,0.009474,0.312648,0.001421,0.661298,0.000000,0.000000,0.000000,0.000000,...,-0.000000e+00,-0.000000,-0.000000,-0.000000e+00,-0.108256,-0.574822,0.009474,0.009474,0.022768,-0.682320
2,-1.000000,0.004798,0.016314,0.009596,0.004798,0.002879,0.000192,-1.000000,0.000048,0.000000,...,8.729698e-06,-0.000000,0.000000,0.000000e+00,-0.092087,1.220121,0.834930,1.319043,-0.018376,0.442830
3,0.002357,0.000000,-1.000000,0.009428,-1.000000,0.867352,0.000283,0.000000,0.000754,0.113133,...,-1.669355e-08,-0.000351,-0.000003,-1.460686e-07,,,-0.568540,-0.930921,0.023050,
4,0.004180,0.000000,-1.000000,0.010450,0.000052,0.856934,0.015676,-1.000000,-1.000000,0.104504,...,2.827052e-05,-0.000414,-0.000003,-2.068070e-07,,-0.833978,,,0.242608,0.217800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367,0.009466,-1.000000,-1.000000,0.482772,-1.000000,-1.000000,0.000189,0.000947,0.000189,0.000000,...,7.677186e-05,-0.000000,0.000000,0.000000e+00,0.386099,-0.501252,-0.243823,-0.904183,0.123940,-0.524799
1368,0.443867,0.250337,-1.000000,0.259965,0.006740,0.019257,0.000289,0.000193,0.000096,0.000000,...,1.128542e-04,0.000000,0.000000,-1.172104e+00,0.542143,0.166928,0.654849,-0.361795,-1.115771,1.943030
1369,-1.000000,0.000000,0.007925,0.000000,0.019812,0.000000,0.000248,-1.000000,0.000248,0.000000,...,-0.000000e+00,-0.000000,-0.000000,-0.000000e+00,-0.108256,1.237477,0.836668,1.384191,0.026685,
1370,0.023177,0.000000,-1.000000,0.000000,0.000000,0.845002,0.003380,0.000000,0.000000,0.123612,...,-0.000000e+00,0.062698,-0.000000,0.000000e+00,-0.108256,,-0.034687,-0.929547,,0.442830


In [None]:

def cal_score(true, pred):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, positionally matched with ``true``.

    Returns
    -------
    tuple
        ``(mae, r2, rmse, pccs)`` — mean absolute error, R^2, root mean
        squared error and Pearson correlation coefficient, each rounded to
        four decimals.
    """
    # Coerce to plain arrays so the residual below is always positional:
    # a raw `true - pred` fails on Python lists and would align on index
    # labels if both inputs were pandas Series with different indexes.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    mae = mean_absolute_error(true, pred)
    r2 = r2_score(true, pred)
    rmse = np.sqrt(np.mean((true - pred) ** 2))
    pccs = pearsonr(true, pred)[0]
    mae, r2, rmse, pccs = np.around([mae, r2, rmse, pccs], decimals=4)

    return mae, r2, rmse, pccs

def eva_all(true, train, val, true_test, test):
    """Print the cal_score metrics for train, validation and test predictions.

    ``train`` and ``val`` are both scored against ``true``; ``test`` is
    scored against ``true_test``. Each section prints its label followed by
    one line with MAE, RMSE, R^2 and the Pearson coefficient.
    """
    sections = (
        ('train', true, train),
        ('valid', true, val),
        ('test', true_test, test),
    )
    for label, reference, predicted in sections:
        print(label)
        mae, r2, rmse, pccs = cal_score(reference, predicted)
        print(f'mae:{mae}, rmse:{rmse}, r2:{r2}, pccs:{pccs}')

In [39]:
def train_property(df_train, df_test, property_i, n_splits=3,
                   model_dir='p2_final_lgb_models_3fold'):
    """Train K-fold LightGBM regressors for one target and print the metrics.

    Rows whose ``property_i`` value is missing are dropped from both frames.
    The last six columns of each frame are treated as targets and excluded
    from the model inputs. One model is trained per fold, saved under
    ``model_dir``, and used to build in-fold, out-of-fold and test
    predictions, which are then reported through ``eva_all``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames whose last six columns are the target columns.
    property_i : str
        Name of the target column to model.
    n_splits : int, default 3
        Number of cross-validation folds (must be at least 2).
    model_dir : str, default 'p2_final_lgb_models_3fold'
        Directory the per-fold model files are written to; created if absent.

    Returns
    -------
    None
        Results are printed; models are written to disk.
    """
    # Only rows with an observed target are usable for this property.
    df_1 = df_train[~df_train[property_i].isnull()]
    X = df_1.iloc[:, :-6]
    temp_Y = df_1[property_i]
    print(X.shape, temp_Y.shape)

    df_test = df_test[~df_test[property_i].isnull()]
    X_test = df_test.iloc[:, :-6]
    Y_test = df_test[property_i]
    print(X_test.shape, Y_test.shape)

    use_fea = X.columns.tolist()

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',  # boosting algorithm
        'objective': 'regression',  # objective function
        # A list (not a set) keeps the metric order stable between runs.
        'metric': ['l1', 'l2'],  # evaluation metrics
        'num_leaves': 7,  # number of leaves per tree
        'learning_rate': 0.05,  # learning rate
        'max_depth': 3,
        'feature_fraction': 0.8,  # fraction of features sampled per tree
        'bagging_fraction': 0.8,  # fraction of rows sampled per tree
        'bagging_freq': 5,  # k means bagging is performed every k iterations
        'verbose': -1,  # <0 fatal only, =0 errors (warnings), >0 info
        'num_boost_round': 100,
        'early_stopping_rounds': 20,
    }

    # `verbose_eval` was removed in LightGBM 4.0; use the logging callback
    # when the installed version provides it, else the legacy keyword.
    train_kwargs = {}
    if hasattr(lgb, 'log_evaluation'):
        train_kwargs['callbacks'] = [lgb.log_evaluation(period=200)]
    else:
        train_kwargs['verbose_eval'] = 200

    # save_model does not create missing directories.
    os.makedirs(model_dir, exist_ok=True)
    fea = property_i.replace(' ', '_')

    # Per-fold feature importances; collected for inspection but not returned,
    # so the calling notebook cell's output stays unchanged.
    imp = pd.DataFrame({'feat': use_fea})

    kf = KFold(n_splits=n_splits, random_state=2022, shuffle=True)
    oof_real_train = np.zeros(X.shape[0])
    oof_train = np.zeros(X.shape[0])
    pred = np.zeros(X_test.shape[0])

    for now_fold, (trn_idx, val_idx) in enumerate(kf.split(X, temp_Y)):
        print(f"Fold: {now_fold+1}")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = temp_Y.iloc[trn_idx], temp_Y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

        gbm = lgb.train(params, lgb_train,
                        valid_sets=[lgb_train, lgb_val], **train_kwargs)

        # Each row is in exactly one validation fold ...
        oof_train[val_idx] = gbm.predict(X_val)
        # ... and in the training part of the other n_splits - 1 folds,
        # so its in-fold predictions are averaged over that many models.
        oof_real_train[trn_idx] += gbm.predict(X_train) / (n_splits - 1)
        # Test predictions are averaged over all fold models.
        pred += gbm.predict(X_test) / n_splits

        imp['_gain_' + str(now_fold + 1)] = gbm.feature_importance(importance_type='gain')
        imp['_split_' + str(now_fold + 1)] = gbm.feature_importance(importance_type='split')

        gbm.save_model(os.path.join(model_dir, f'{fea}_fold{now_fold+1}_model.txt'))
        del gbm, lgb_train, lgb_val
        gc.collect()

    eva_all(temp_Y, oof_real_train, oof_train, Y_test, pred)

 

In [40]:
# Fit and evaluate the cross-validated models for the 'property 4' target.
train_property(df_train=df_train, df_test=df_test, property_i='property 4')

(1135, 145) (1135,)
(419, 145) (419,)
Fold: 1
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.148685	training's l2: 0.0615856	valid_1's l1: 0.185788	valid_1's l2: 0.0860838
Fold: 2
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.138698	training's l2: 0.0591267	valid_1's l1: 0.176151	valid_1's l2: 0.0841437
Fold: 3
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.141633	training's l2: 0.0549393	valid_1's l1: 0.179213	valid_1's l2: 0.094474
train
mae:0.1399, rmse:0.2391, r2:0.9331, pccs:0.967
valid
mae:0.1804, rmse:0.297, r2:0.8968, pccs:0.9477
test
mae:0.1763, rmse:0.2838, r2:0.9062, pccs:0.9539
