In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.isotonic import IsotonicRegression
import sklearn.linear_model as linear_model
import sklearn.ensemble as tree_model
import sklearn.svm as svm
from utils import make_dir, score, timer, kf_lgbm
# Show up to 100 columns when displaying DataFrames. The full option name is
# used instead of the 'display.max_column' shorthand: shorthand names are
# resolved by pattern matching and stop working if pandas ever adds another
# option that matches the same pattern.
pd.set_option('display.max_columns', 100)

In [2]:
def load(meta_dir, filename):
    """Load a NumPy array file located at ``meta_dir/filename``.

    Args:
        meta_dir: Directory that contains the file.
        filename: Name of the file inside ``meta_dir``.

    Returns:
        Whatever ``np.load`` returns for the joined path.
    """
    full_path = os.path.join(meta_dir, filename)
    return np.load(full_path)

def load_val_test(name, input_dir='./stacking_files/'):
    """Load the validation and test prediction arrays saved for one model.

    Files are expected to be named ``val.<name>.npy`` and ``test.<name>.npy``
    inside ``input_dir``.

    Args:
        name: Model identifier embedded in the file names.
        input_dir: Directory holding the ``.npy`` files.

    Returns:
        Tuple ``(val, test)`` of the two loaded arrays.
    """
    filenames = (f'val.{name}.npy', f'test.{name}.npy')
    # The generator is consumed in order, so the val file is read before test.
    val, test = (load(input_dir, filename) for filename in filenames)
    return val, test

In [4]:
# Prediction files to mirror from ./liuxin/stack/ into ./stacking_files/ so
# that every stacking input can be read from a single directory.
liuxin_stack_files = [
    'test.justai_ctb.npy',
    'test.justai_lgb.npy',
    'test.luoling_ctb.npy',
    'test.luoling_xgb.npy',
    'val.justai_ctb.npy',
    'val.justai_lgb.npy',
    'val.luoling_ctb.npy',
    'val.luoling_xgb.npy',
]
for f in liuxin_stack_files:
    source_path = os.path.join('./liuxin/stack/', f)
    target_path = os.path.join('./stacking_files/', f)
    # Round-trip through NumPy: read the array, then write it under the same name.
    np.save(target_path, np.load(source_path))

In [5]:
# Store integer-rounded copies of the neil_lgb predictions; they are stacked
# later under the name 'neil_lgb_rounded'.
neil_lgb_test = np.load('./stacking_files/test.neil_lgb.npy')
neil_lgb_val = np.load('./stacking_files/val.neil_lgb.npy')
np.save('./stacking_files/test.neil_lgb_rounded.npy', np.round(neil_lgb_test))
np.save('./stacking_files/val.neil_lgb_rounded.npy', np.round(neil_lgb_val))

In [6]:
# Store integer-rounded copies of the neil_xgb predictions; they are stacked
# later under the name 'neil_xgb_rounded'.
neil_xgb_test = np.load('./stacking_files/test.neil_xgb.npy')
neil_xgb_val = np.load('./stacking_files/val.neil_xgb.npy')
np.save('./stacking_files/test.neil_xgb_rounded.npy', np.round(neil_xgb_test))
np.save('./stacking_files/val.neil_xgb_rounded.npy', np.round(neil_xgb_val))

In [7]:
# Base models whose predictions feed the stacker. The list order fixes the
# column order of X / X_test (and therefore of the fitted coefficients).
name_list = [
    'neil_lgb_rounded',
    'neil_ctb',
    'neil_rf',
    'gotcha_lgb1',
    'gotcha_lgb2',
    'gotcha_lgb3',
    'gotcha_lgb4',
    'gotcha_lgb5',
    'justai_lgb',
    'justai_ctb',
    'gotcha_lgb6',
    'gotcha_ctb1',
    'luoling_xgb',
    'luoling_ctb',
    'neil_xgb_rounded',
    'gotcha_gbdt1',
    'neil_gbm',
]

# Gather each model's validation and test predictions in name_list order.
val_list, test_list = [], []
for name in name_list:
    val, test = load_val_test(name)
    val_list.append(val)
    test_list.append(test)

# One column per base model: stacking along axis 1 places model k in column k.
X = np.stack(val_list, axis=1)
X_test = np.stack(test_list, axis=1)

In [8]:
# Read the train and test tables (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_dataset.csv', '../input/test_dataset.csv')
)
# Target for the stacker: the '信用分' ("credit score") column as an array.
y = train_df['信用分'].values

In [9]:
n_folds = 10
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=8888)

# StratifiedKFold needs class labels, but y is a numeric score. Build
# pseudo-classes by ranking the samples on y and cutting the ranking into
# consecutive bins of `bin_size` samples; stratifying on the bin id spreads
# the whole target range across the folds.
bin_size = 20
idx = y.argsort()  # idx[r] = position in y of the sample with rank r

# Scatter the bin id of each rank back to the sample's original position.
# This equals sorting (idx, label) pairs by idx, but is vectorised and sized
# from len(y) rather than a hard-coded 2500 * 20 = 50000 rows.
y_lab = np.empty(len(y), dtype=np.int32)
y_lab[idx] = np.arange(len(y)) // bin_size

In [10]:
# Level-2 stacking: fit a Huber regressor on the base-model predictions in
# each CV fold, collect out-of-fold predictions for the training rows, and
# average the per-fold predictions on the test rows.
test_pred = np.zeros(len(X_test))
val_pred = np.zeros(len(X))
scores = []

# Take the fold count from the splitter instead of a literal 10, so the test
# average stays correct if the number of folds is changed.
n_splits = kf.get_n_splits()

for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(X, y_lab)):
    # Folds are stratified on the binned labels y_lab; the model is trained
    # on the real target y.
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    model = linear_model.HuberRegressor(epsilon=1.01, alpha=1e-5)
    model.fit(X_train, y_train)

    # Out-of-fold predictions: each training row is predicted by the one model
    # that did not see it.
    val_pred_fold = model.predict(X_valid)
    val_pred[valid_idx] = val_pred_fold

    # `score` comes from utils (metric definition not visible here); it is
    # evaluated on rounded predictions.
    s = score(np.round(val_pred_fold), y_valid)
    scores.append(s)

    # Accumulate an equal-weight average of the fold models on the test set.
    test_pred += model.predict(X_test) / n_splits

    print(model.coef_)
    print(np.round(s, 8))

[ 0.18782236  0.08440949 -0.07499657  0.0854136   0.0571727   0.26825906
  0.00047739 -0.00870643  0.11709785  0.06996875 -0.19370151  0.03056669
 -0.21348534  0.26796086  0.19221551  0.10005298  0.03036265]
0.06423185
[ 0.14182068  0.08366123 -0.0877543   0.057904    0.05779548  0.12845944
  0.06001826 -0.00222432  0.19160917  0.05941438 -0.15257817  0.01513275
 -0.13813405  0.17231293  0.19962995  0.16969107  0.04416838]
0.06481553
[ 0.17849782  0.11273179 -0.05529195  0.05931097  0.07092916  0.21764503
  0.08112405 -0.0365708   0.11113912  0.02802364 -0.18403921  0.05120347
 -0.2266663   0.25977299  0.18839958  0.12323985  0.02138736]
0.06477355
[ 0.17836063  0.14153274 -0.05908758  0.08940667  0.07406125  0.24055521
  0.045601   -0.00446695  0.09094376  0.07834079 -0.24308247  0.01866421
 -0.21955273  0.24112503  0.18529931  0.11657618  0.02649879]
0.06353644
[ 0.15596346  0.05649074 -0.06848513  0.11661149  0.08375517  0.23676143
  0.02096904 -0.02696991  0.1590024   0.04396593 -0

In [11]:
def postprocess(pred, threshold=619, low_shift=0.1, high_shift=0.02):
    """Round predictions to integers after nudging them upward.

    Values strictly below ``threshold`` are shifted by ``low_shift`` before
    rounding, values strictly above it by ``high_shift``. Values exactly
    equal to ``threshold`` are rounded without a shift. For each of the two
    groups the function prints how many values end up on a different integer
    than plain rounding would give.

    Args:
        pred: Array-like of numeric predictions. It is not modified.
        threshold: Boundary between the two shift regimes (default 619).
        low_shift: Amount added to values below ``threshold`` (default 0.1).
        high_shift: Amount added to values above ``threshold`` (default 0.02).

    Returns:
        ``numpy.ndarray`` of ``int`` dtype with the post-processed predictions.
    """
    print('='*10,'postprocess', '='*10)
    pred = np.asarray(pred)
    # Plain rounding, computed once, is the baseline for the change counts.
    baseline = np.round(pred)
    new_pred = pred.copy()

    # The two masks are disjoint, so applying them one after the other never
    # overwrites a value set by the previous pass.
    regimes = ((pred < threshold, low_shift), (pred > threshold, high_shift))
    for mask, shift in regimes:
        new_pred = np.where(mask, np.round(pred + shift), new_pred)
        num_change_value = (np.round(new_pred) != baseline)[mask].sum()
        # mask.sum() counts in NumPy rather than iterating element by element
        # with the builtin sum().
        print('%d/%d values are changed' % (num_change_value, mask.sum()))

    return np.round(new_pred).astype(int)

In [12]:
# Compare three variants of the out-of-fold predictions against y:
# raw, plainly rounded, and post-processed. Each score is computed right
# before it is printed so postprocess's own log lines keep their position.
raw_score = score(val_pred, y)
print('未round:', raw_score)

rounded_score = score(np.round(val_pred), y)
print('round:', rounded_score)

postprocessed_score = score(postprocess(val_pred), y)
print('后处理（小于619,+0.1, 大于619, +0.02）:', postprocessed_score)

未round: 0.06439770406956541
round: 0.06441306812326085
2080/20343 values are changed
590/29657 values are changed
后处理（小于619,+0.1, 大于619, +0.02）: 0.06441904329990414


In [13]:
# Start from the example submission and overwrite its ' score' column
# (the leading space is part of the column name) with the post-processed
# fold-averaged test predictions.
sub = pd.read_csv('../input/submit_example.csv')
test_pred_final = postprocess(test_pred)
sub[' score'] = np.round(test_pred_final).astype(int)

2012/20439 values are changed
593/29561 values are changed


In [14]:
# Write the submission without the index column.
# NOTE(review): the file name appears to encode the CV score and presumably a
# leaderboard score; confirm before reusing it for a different run.
submission_path = 'cv0.644190_pb64107.csv'
sub.to_csv(submission_path, index=False)

In [15]:
# Preview the first five rows of the submission as a sanity check.
sub.head(5)

Unnamed: 0,id,score
0,7171737d49b143d1b38883a39e4a5730,601
1,3af0a449d5424488912e8fb2bf4b9faa,529
2,eb2cf02e0d5c4d1294dd73e776dbb441,670
3,9c0f780ecb254670a11aa9e3f10777c5,676
4,d794eed46c1e44f785a575f18b3023a5,659
