In [1]:
import os, re, json, gc, itertools
import time
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import joblib
import xgboost as xgb
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score

import warnings
# Silence library warnings for the whole notebook (note: this also hides
# pandas' SettingWithCopyWarning, so chained assignments fail silently).
warnings.filterwarnings('ignore')
# Use the fully-qualified option names. The short patterns 'max_columns' and
# 'max_rows' also match the `styler.render.*` options in recent pandas
# releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# preprocessing

In [2]:
# load data
# Read the second-round training and testing CSVs; paths are relative to the
# notebook's working directory.
train = pd.read_csv('../input/second_round_training_data.csv')
test = pd.read_csv('../input/second_round_testing_data.csv')

In [3]:
train.head(5)

Unnamed: 0,Parameter1,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9,Parameter10,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Quality_label
0,0.167898,104.443982,2.772825,0.146548,0.000421,0.000612,2286.523413,0.035407,0.593081,1.010385,0.323881,2.59782,41.506485,36.955992,8.454598,11.438066,177.24312,338.729256,2.021704,0.079526,Pass
1,252.483066,0.343232,0.066873,0.002495,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,0.004594,0.004243,0.135967,225.632949,0.48186,20597.447822,3.72333,15.37619,0.986973,4.634376,Fail
2,4.124654,0.170534,0.3838,4e-06,0.000909,0.001972,2286.523413,0.035407,0.593081,1.010385,0.031295,0.951186,0.000423,2.208138,0.073525,236.079314,0.064196,0.576302,33.87579,1.813727,Fail
3,294.65675,6.153711,0.014716,4284.326273,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,1.480634,0.000122,8.4e-05,0.654517,0.025872,176.948915,0.029777,0.246726,27.117165,0.081819,Fail
4,0.026284,0.16681,7.587398,0.002202,0.000909,0.002397,2286.523413,0.035407,0.593081,1.010385,867.342323,0.827308,0.040846,0.260989,0.00938,194.798039,0.055053,0.014725,13.569707,18.138496,Fail


In [4]:
train['Parameter9'].isnull().sum()

0

In [5]:
test.head(5)

Unnamed: 0,Group,Parameter1,Parameter10,Parameter2,Parameter3,Parameter4,Parameter5,Parameter6,Parameter7,Parameter8,Parameter9
0,0,2.167762,10.060093,0.035811,0.459938,7.4606,17.962176,18.872982,0.152059,0.035407,0.593081
1,0,0.006197,0.052625,0.047829,0.322457,0.207077,0.664442,0.314162,0.038483,17.850021,0.593081
2,0,1.179951,0.052625,0.075537,26.113194,0.304415,0.664442,0.258497,0.038483,17.850021,0.593081
3,0,0.007218,1.010385,5.081092,1.132919,931.721241,2.225808,2.208755,0.038483,2.931083,
4,0,3.355337,0.377332,0.076572,23.199547,1.172201,3.095123,2.684398,0.038483,2.931083,


In [6]:
test['Parameter9'].isnull().sum()

3000

In [7]:
# Split the target off train: pop() returns the 'Quality_label' column and
# removes it from the frame in place.
target = train.pop('Quality_label')

In [8]:
target.unique()

array(['Pass', 'Fail', 'Good', 'Excellent'], dtype=object)

In [9]:
# Encode the target as ordered integer codes:
# Fail=0, Pass=1, Good=2, Excellent=3 (a label outside this list becomes -1).
quality_order = ["Fail", "Pass", "Good", "Excellent"]
target = pd.Series(
    pd.Categorical(target, categories=quality_order).codes,
    index=target.index,
    name='target',
)

In [10]:
target.head(5)

0    1
1    0
2    0
3    0
4    0
Name: target, dtype: int8

In [11]:
# Split train into parameters and attributes: the ten Attribute* columns go
# into `attr`, and train keeps the remaining (Parameter*) columns.
col_attr = [f'Attribute{x}' for x in range(1,11)]
attr = train[col_attr]
train = train.drop(columns=col_attr)

In [12]:
# Split the 'Group' column off test (removed from the frame in place) and
# rename the resulting Series to 'group'.
group = test.pop('Group')
group = group.rename('group')

In [13]:
# Reorder the test columns to Parameter1..Parameter10 so they line up with
# train (the raw test file lists Parameter10 right after Parameter1).
col_test = [f"Parameter{x}" for x in range(1, 11)]
test = test[col_test]

In [14]:
# Shorten the column names: parameters become p1..p10, attributes a1..a10.
# Both assignments are positional, so they rely on the column order set above.
colnames = [f'p{i}' for i in range(1,11)]
train.columns = colnames
test.columns = colnames
# `colnames` is reused for the attribute names and keeps that value afterwards.
colnames = [f'a{i}' for i in range(1,11)]
attr.columns = colnames

# Feature engineering (FE)

In [15]:
# rounding
# Decimal places kept per parameter index: 11 for p1-p4, 8 for p5-p10.
round_fmt = {1: 11, 2: 11, 3: 11, 4: 11, 5: 8,
             6: 8, 7: 8, 8: 8, 9: 8, 10: 8}
# Translate the index -> decimals map into a column -> decimals map and let
# DataFrame.round apply it to both frames in one call each.
decimals_by_col = {f'p{idx}': places for idx, places in round_fmt.items()}
train = train.round(decimals_by_col)
test = test.round(decimals_by_col)

In [16]:
# transform log2
# After the log transform (and normalisation) some values are negative, which
# affects features built with +, -, * and /.
# The variant without the log transform still needs to be evaluated.
train, attr, test = (np.log2(frame) for frame in (train, attr, test))

In [17]:
# Keep only p5-p10: remove p1-p4 from both frames.
dropped_params = [f'p{i}' for i in range(1, 5)]
train = train.drop(columns=dropped_params)
test = test.drop(columns=dropped_params)

In [18]:
print(train.shape, test.shape)

(12934, 6) (6000, 6)


In [19]:
# Label-encode p9 while keeping the missing entries of test as NaN.
# The encoder is fitted on the union of train's p9 and the observed test p9
# so both frames share one code book.
test['p9_isnull'] = test['p9'].isna()
p9_observed = ~test['p9_isnull']
lbl = LabelEncoder()
lbl.fit(pd.concat([test.loc[p9_observed, 'p9'], train['p9']]))
# Write the codes straight into `test` through .loc. The previous version
# modified a filtered copy (chained assignment) and then assigned a whole
# DataFrame to a single column, which is fragile across pandas versions.
test.loc[p9_observed, 'p9'] = lbl.transform(test.loc[p9_observed, 'p9'])
train['p9'] = lbl.transform(train['p9'])

# predict A4-A6 and P9

## P9

In [20]:
# p9 was already label-encoded in the previous cell. Re-fitting a second
# LabelEncoder on the codes would map every code to itself and overwrite
# `lbl`, losing the original value -> code mapping, so only the
# missing-value mask is refreshed here.
test['p9_isnull'] = test['p9'].isna()

In [21]:
# NOTE(review): `parameter` is not referenced again in the cells shown here.
parameter='p9'

In [22]:
# Separate p9 (the imputation target) from the remaining train columns,
# which serve as model inputs.
p9=train['p9']
train=train.drop(columns='p9')
# NOTE(review): `df` is not referenced again in the cells shown here.
df=train.join(p9)

In [23]:
# Select the test rows whose p9 is missing - these are the rows to impute.
# NOTE(review): despite its name, `test_not_na` holds the rows where p9 IS null.
test_not_na = test[test['p9_isnull'] == True]

In [24]:
# Drop p9 itself and the helper mask so only the model input columns remain.
test_not_na=test_not_na.drop(columns=['p9', 'p9_isnull'])

### LGB

In [25]:
# Hyper-parameters for the LightGBM classifier that imputes p9.
# NOTE(review): the origin of these values (e.g. a tuning run) is not shown here.
best_hp={'boosting_type': 'gbdt',
 'num_leaves': 111,
 'max_depth': -1,
 'learning_rate': 0.1,
 'n_estimators': 639,
 'subsample_for_bin': 200000,
 'objective': 'multiclass',
 'min_split_gain': 0,
 'min_child_weight': 0.001,
 'min_child_samples': 5,
 'subsample': 1.0,
 'subsample_freq': 0,
 'colsample_bytree': 1,
 'reg_alpha': 0.20174465129035402,
 'reg_lambda': 0.18200878389736302,
 'n_jobs': -1,
 # NOTE(review): `silent` has been replaced by `verbose` in newer LightGBM
 # releases - confirm against the installed version.
 'silent': True,
 'importance_type': 'split',
 # NOTE(review): hard-coded; presumably the number of distinct p9 codes -
 # confirm against train['p9'].nunique().
 'num_class': 17}

In [26]:
# Fit a LightGBM classifier that predicts the p9 code from the remaining
# train columns (p9 was dropped from `train` above).
m = lgb.LGBMClassifier().set_params(**best_hp)
m.fit(train,p9)

LGBMClassifier(colsample_bytree=1, min_child_samples=5, min_split_gain=0,
               n_estimators=639, num_class=17, num_leaves=111,
               objective='multiclass', reg_alpha=0.20174465129035402,
               reg_lambda=0.18200878389736302)

In [27]:
# Predict p9 codes for the test rows whose p9 is missing.
res=m.predict(test_not_na)

In [28]:
# Write the predicted codes back into the rows flagged as missing.
test.loc[test['p9_isnull']==True,'p9']=res

In [29]:
# Every p9 entry is filled now, so cast the column to int and drop the
# helper mask column.
test['p9']=test['p9'].astype(int)
test=test.drop(columns='p9_isnull')

In [30]:
# Put p9 back into train and move p10 behind it, so the frame ends with
# the columns ..., p9, p10.
p10 = train['p10']
train = train.drop(columns='p10').join(p9).join(p10)

In [31]:
train['p9'].min()

0

## A4

In [32]:
# Attribute handled in this section; used in the saved model's file name below.
attribute='a4'
# NOTE(review): `df` is not referenced again in the cells shown here.
df = train.join(attr[attribute])

In [33]:
# Hyper-parameters for the XGBoost regressor that predicts a4.
# NOTE(review): `num_round` does not look like an argument of the XGBoost
# scikit-learn wrapper, where `n_estimators` sets the number of boosting
# rounds. The two values disagree (571 vs 223) - confirm which is intended.
best_hp={'max_depth': 5,
 'learning_rate': 0.1,
 'n_estimators': 571,
 'verbosity': 1,
 'objective': 'reg:squarederror',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'reg_alpha': 6.345561548486771,
 'reg_lambda': 2.808394786832976,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'importance_type': 'gain',
 'num_round': 223}


In [34]:
# Hold out 1000 rows and fit the a4 regressor on the rest.
# random_state pins the split so the saved model is reproducible (same seed
# as the KFold used for stacking). The hold-out (x_val, y_val) is not scored
# in the cells shown here; the estimator is refit per fold in the stacking cell.
x_trn,x_val,y_trn,y_val = train_test_split(train, attr.a4, test_size=1000, random_state=1)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn,y_trn)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=571, n_jobs=-1, num_parallel_tree=1, num_round=223,
             random_state=0, reg_alpha=6.345561548486771,
             reg_lambda=2.808394786832976, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=1)

In [35]:
# Save the fitted model; the file name is built from `attribute`.
joblib.dump(m,f'../model/tp3_{attribute}.m')

['../model/tp3_a4.m']

## A5

In [36]:
# Attribute handled in this section; used in the saved model's file name below.
attribute='a5'

In [37]:
# NOTE(review): `df` is not referenced again in the cells shown here.
df = train.join(attr[attribute])

In [38]:
# Hyper-parameters for the XGBoost regressor that predicts a5.
# NOTE(review): `num_round` does not look like an argument of the XGBoost
# scikit-learn wrapper, where `n_estimators` sets the number of boosting
# rounds. The two values disagree (749 vs 934) - confirm which is intended.
best_hp={'max_depth': 4,
 'learning_rate': 0.1,
 'n_estimators': 749,
 'verbosity': 1,
 'objective': 'reg:squarederror',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'reg_alpha': 7.819025434063891,
 'reg_lambda': 0.005996946163704,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'importance_type': 'gain',
 'num_round': 934}

In [39]:
# Hold out 1000 rows and fit the a5 regressor on the rest.
# random_state pins the split so the saved model is reproducible (same seed
# as the KFold used for stacking). The hold-out (x_val, y_val) is not scored
# in the cells shown here; the estimator is refit per fold in the stacking cell.
x_trn,x_val,y_trn,y_val = train_test_split(train, attr.a5, test_size=1000, random_state=1)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn,y_trn)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=749, n_jobs=-1, num_parallel_tree=1, num_round=934,
             random_state=0, reg_alpha=7.819025434063891,
             reg_lambda=0.005996946163704, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=1)

In [40]:
# Save the fitted model; the file name is built from `attribute`.
joblib.dump(m,f'../model/tp3_{attribute}.m')

['../model/tp3_a5.m']

## A6 

In [41]:
# Attribute handled in this section; used in the saved model's file name below.
attribute='a6'

In [42]:
# NOTE(review): `df` is not referenced again in the cells shown here.
df = train.join(attr[attribute])

In [43]:
# Hyper-parameters for the XGBoost regressor that predicts a6.
# NOTE(review): `num_round` does not look like an argument of the XGBoost
# scikit-learn wrapper, where `n_estimators` sets the number of boosting
# rounds. The two values disagree (731 vs 917) - confirm which is intended.
best_hp={'max_depth': 4,
 'learning_rate': 0.1,
 'n_estimators': 731,
 'verbosity': 1,
 'objective': 'reg:squarederror',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'reg_alpha': 9.299861941127418,
 'reg_lambda': 8.762447153395918,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'importance_type': 'gain',
 'num_round': 917}

In [44]:
# Hold out 1000 rows and fit the a6 regressor on the rest.
# random_state pins the split so the saved model is reproducible (same seed
# as the KFold used for stacking). The hold-out (x_val, y_val) is not scored
# in the cells shown here; the estimator is refit per fold in the stacking cell.
x_trn,x_val,y_trn,y_val = train_test_split(train, attr.a6, test_size=1000, random_state=1)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn,y_trn)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=731, n_jobs=-1, num_parallel_tree=1, num_round=917,
             random_state=0, reg_alpha=9.299861941127418,
             reg_lambda=8.762447153395918, scale_pos_weight=1, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=1)

In [45]:
# Save the fitted model; the file name is built from `attribute`.
joblib.dump(m, f'../model/tp3_{attribute}.m')

['../model/tp3_a6.m']

# attribute stacking

In [46]:
# Stacking: build a4/a5/a6 features from out-of-fold (OOF) predictions.
# For each attribute the saved estimator is reloaded and refit on every
# training fold. Train rows get the prediction from the fold in which they
# were held out; test rows get the average prediction over all folds.
cv = KFold(6, shuffle=True, random_state=1)
attrs = [f'a{i}' for i in range(4, 7)]
trn_res = None  # DataFrame of OOF predictions, one column per attribute
tst_res = None  # array of fold-averaged test predictions, one column per attribute
for attribute in attrs:
    print(attribute)
    # NOTE(review): joblib.load unpickles the file - only load model files you trust.
    m = joblib.load(f'../model/tp3_{attribute}.m')
    trn_attr = None  # rows of (row position, OOF prediction), stacked fold by fold
    tst_attr = None  # running average of the test predictions
    for index, (trn_idx, val_idx) in enumerate(cv.split(train, attr[attribute])):
        x_trn, x_val = train.iloc[trn_idx], train.iloc[val_idx]
        y_trn, y_val = attr[attribute].iloc[trn_idx], attr[attribute].iloc[val_idx]
        m.fit(x_trn, y_trn)
        if trn_attr is None:
            # First fold: start the accumulators.
            trn_attr = np.c_[val_idx, m.predict(x_val)]
            tst_attr = m.predict(test)/cv.n_splits
        else:
            # Later folds: append the OOF rows and add this fold's share of
            # the test average.
            trn_attr = np.r_[trn_attr, np.c_[val_idx, m.predict(x_val)]]
            tst_attr += m.predict(test)/cv.n_splits
    # Column 0 holds the positional row indices (stored as floats by np.c_);
    # make it the index and cast it back to int so it can be joined onto train.
    trn_attr = pd.DataFrame(trn_attr).set_index(0)
    trn_attr.index = trn_attr.index.astype('int')
    trn_attr.columns = [attribute]
    if trn_res is None:
        trn_res = trn_attr
        tst_res = tst_attr
    else:
        # join() aligns on the index, so the fold-ordered rows line up correctly.
        trn_res = trn_res.join(trn_attr)
        tst_res = np.c_[tst_res,tst_attr]

a4
a5
a6


In [47]:
# Attach the out-of-fold attribute predictions to train (aligned on the row
# index) and the fold-averaged predictions to test.
train = train.join(trn_res)
tst_res = pd.DataFrame(tst_res, columns=[f'a{i}' for i in range(4,7)])
test = test.join(tst_res)

In [48]:
# save data
# Persist every intermediate object under its own key in one HDF5 file.
save_path = '../data/tp6.h5'
for hdf_key, obj in (('train', train), ('target', target), ('test', test),
                     ('attr', attr), ('group', group)):
    obj.to_hdf(save_path, key=hdf_key)

# predict Quality_label

In [49]:
# load data
# Restore the objects written by the save cell, in the same key order.
load_path = '../data/tp6.h5'
train, target, test, attr, group = (
    pd.read_hdf(load_path, hdf_key)
    for hdf_key in ('train', 'target', 'test', 'attr', 'group')
)

## Feature engineering (FE)

In [50]:
def create_uid(df,cols,uid_name,sep='_'):
    """Add a composite-key column built from the string form of several columns.

    Args:
        df: DataFrame to modify in place.
        cols: non-empty list of column names to combine, in order.
        uid_name: name of the column that receives the composite key.
        sep: separator placed between the parts. With no separator, different
            value tuples can collide (e.g. '1'+'23' and '12'+'3' both give
            '123'). Pass sep='' to get the old, separator-free keys.

    Returns:
        The same `df` object, with `uid_name` added or overwritten.
    """
    uid = df[cols[0]].astype(str)
    for c in cols[1:]:
        uid = uid + sep + df[c].astype(str)
    df[uid_name] = uid
    return df

def uid_aggregation(train,test,main_cols,uid,agg_method):
    """Add group-wise aggregates of `main_cols`, grouped by `uid`, to both frames.

    For each column in `main_cols` a new column named
    '<uid>_<main_col>_<agg_method>' is added in place. The aggregate is
    computed separately within `train` and within `test`. Returns None.
    """
    for frame in (train, test):
        for main_col in main_cols:
            feature_name = '_'.join([uid, main_col, agg_method])
            frame[feature_name] = frame.groupby([uid])[main_col].transform(agg_method)

def frequency_encoding(train,test,cols):
    """Add '<col>_freq' columns holding how often each value occurs.

    Counts are taken within each frame separately (train values are counted
    in `train`, test values in `test`). Both frames are modified in place.
    Returns None.
    """
    for frame in (train, test):
        for c in cols:
            counts = frame[c].value_counts()
            frame[c + '_freq'] = frame[c].map(counts)

def nunique_encoding(train,test,main_cols,col):
    """Add '<main_col>_<col>_nunique' columns to both frames.

    Each new column gives, for the row's value of `col`, the number of
    distinct `main_col` values seen with it. Counts are taken within each
    frame separately. Both frames are modified in place. Returns None.
    """
    for frame in (train, test):
        for main_col in main_cols:
            distinct_per_key = frame.groupby(col)[main_col].nunique()
            frame[main_col + '_' + col + '_nunique'] = frame[col].map(distinct_per_key)

In [51]:
# Round the stacked attribute predictions to 3 decimals so that equal-ish
# values fall into the same group in the aggregations below.
for attr_col in ('a4', 'a5', 'a6'):
    train[attr_col] = train[attr_col].round(3)
    test[attr_col] = test[attr_col].round(3)

# create uids: composite keys built from several parameter columns
uid_specs = {
    'uid1': ['p5', 'p6'],
    'uid2': ['p7', 'p9', 'p10'],
    'uid3': ['p5', 'p6', 'p7', 'p8', 'p9'],
    'uid4': ['p10', 'p9', 'p8', 'p7'],
}
for uid_name, uid_cols in uid_specs.items():
    train = create_uid(train, cols=uid_cols, uid_name=uid_name)
    test = create_uid(test, cols=uid_cols, uid_name=uid_name)

# group-wise aggregates (computed within train and within test separately)
uid_aggregation(train, test, main_cols=['a4', 'a5'], uid='p7', agg_method='mean')
uid_aggregation(train, test, main_cols=['a4', 'a5', 'a6'], uid='uid4', agg_method='mean')
uid_aggregation(train, test, main_cols=['a4'], uid='uid4', agg_method='std')
uid_aggregation(train, test, main_cols=['p5', 'p6'], uid='uid4', agg_method='sum')

# value frequencies
frequency_encoding(train, test, cols=['p7', 'p8', 'p10'])

# number of distinct values per key
nunique_encoding(train, test, main_cols=['p8', 'p9', 'p10'], col='p7')
nunique_encoding(train, test, main_cols=['p9'], col='p8')
nunique_encoding(train, test, main_cols=['uid2', 'uid3'], col='uid1')
nunique_encoding(train, test, main_cols=['uid3'], col='uid2')

# sum of the three stacked attributes
train['a4_plus_a5_plus_a6'] = train['a4'] + train['a5'] + train['a6']
test['a4_plus_a5_plus_a6'] = test['a4'] + test['a5'] + test['a6']

# "magic" feature: product of the two uid4-level sums
train['majic_fe'] = train['uid4_p5_sum'] * train['uid4_p6_sum']
test['majic_fe'] = test['uid4_p5_sum'] * test['uid4_p6_sum']

# drop the helper keys and the intermediate sums
drop_cols = ['uid1', 'uid2', 'uid3', 'uid4', 'uid4_p5_sum', 'uid4_p6_sum']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [52]:
# Hyper-parameters for the XGBoost classifier that predicts the quality label.
# The objective is 'multi:softprob' (per-class probabilities) rather than
# 'multi:softmax' (class labels only), because the training cell calls
# predict_proba. How the wrapper treats 'multi:softmax' in predict_proba
# differs between xgboost releases.
# NOTE(review): `num_round` does not look like an argument of the
# scikit-learn wrapper, where `n_estimators` sets the number of boosting
# rounds - confirm and drop.
best_hp={'max_depth': 3,
 'learning_rate': 0.1,
 'n_estimators': 203,
 'verbosity': 1,
 'objective': 'multi:softprob',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'reg_alpha': 6.736871781492226,
 'reg_lambda': 0.073512560432491,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'num_class': 4,
 'num_round': 295}

In [53]:
# Seed-averaged 5-fold stratified CV for the quality classifier.
# `oof` collects out-of-fold class probabilities for train; `prediction`
# collects the test probabilities averaged over folds and seeds.
n_class = 4  # target codes 0..3 = Fail / Pass / Good / Excellent
oof = np.zeros((train.shape[0], n_class))
prediction = np.zeros((test.shape[0], n_class))
seeds = [123456, 87654, 676767]
# Derived from `seeds` so the count and the list cannot drift apart.
num_model_seed = len(seeds)
for model_idx, model_seed in enumerate(seeds):
    print("开始训练第%d个模型：" % (model_idx + 1))
    oof_cat = np.zeros((train.shape[0], n_class))
    prediction_cat = np.zeros((test.shape[0], n_class))
    # Only the fold assignment depends on the seed.
    skf = StratifiedKFold(n_splits=5, random_state=model_seed, shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(train, target)):
        print("正在训练第%d折" % (index + 1))
        train_x, test_x = train.iloc[train_index], train.iloc[test_index]
        train_y, test_y = target.iloc[train_index], target.iloc[test_index]
        gc.collect()
        # learning_rate=0.05 deliberately overrides the 0.1 stored in best_hp.
        m = xgb.XGBClassifier().set_params(**best_hp).set_params(n_jobs=-1,learning_rate=0.05)
        # The scikit-learn wrapper takes the feature frame and labels directly.
        m.fit(train_x, train_y)
        oof_cat[test_index] += m.predict_proba(test_x)
        prediction_cat += m.predict_proba(test) / skf.n_splits
        gc.collect()
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
    print('logloss', log_loss(pd.get_dummies(target).values, oof_cat))
    print('ac', accuracy_score(target, np.argmax(oof_cat, axis=1)))
print("*" * 30)
print('mean_logloss', log_loss(pd.get_dummies(target).values, oof))
print('mean_ac', accuracy_score(target, np.argmax(oof, axis=1)))

# Expected number of rows per group. Kept for reference; it is no longer used
# as a divisor (see the group mean below).
group_size=50
mname='CCF_model_xgb'
# Probability columns follow the target code order 0..3.
res_prob = pd.DataFrame(prediction,
                        columns=['Fail ratio', 'Pass ratio', 'Good ratio', 'Excellent ratio'])
# Insert Group with its own dtype. Stacking it with np.c_ turned the ids into
# floats, which were written to the CSV as 0.0, 1.0, ...
res_prob.insert(0, 'Group', group.values)
res_prob = res_prob[['Group', 'Excellent ratio','Good ratio', 'Pass ratio','Fail ratio']]
# Average per group instead of sum / 50, so the ratios stay correct even if a
# group does not contain exactly `group_size` rows.
res_prob = res_prob.groupby('Group').mean().reset_index()
res_prob.to_csv(f'../sub/{mname}_prob.csv',index=False,encoding='utf-8')

开始训练第1个模型：
正在训练第1折
正在训练第2折
正在训练第3折
正在训练第4折
正在训练第5折
logloss 1.0500998759789058
ac 0.5496366166692439
开始训练第2个模型：
正在训练第1折
正在训练第2折
正在训练第3折
正在训练第4折
正在训练第5折
logloss 1.0490238778394991
ac 0.5495593010669553
开始训练第3个模型：
正在训练第1折
正在训练第2折
正在训练第3折
正在训练第4折
正在训练第5折
logloss 1.0490068792627605
ac 0.551646822328746
******************************
mean_logloss 1.0475657105802867
mean_ac 0.5514921911241688


In [54]:
len(train.columns)

27

In [55]:
train.columns

Index(['p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'a4', 'a5', 'a6', 'p7_a4_mean',
       'p7_a5_mean', 'uid4_a4_mean', 'uid4_a5_mean', 'uid4_a6_mean',
       'uid4_a4_std', 'p7_freq', 'p8_freq', 'p10_freq', 'p8_p7_nunique',
       'p9_p7_nunique', 'p10_p7_nunique', 'p9_p8_nunique', 'uid2_uid1_nunique',
       'uid3_uid1_nunique', 'uid3_uid2_nunique', 'a4_plus_a5_plus_a6',
       'majic_fe'],
      dtype='object')