In [1]:
import gc
from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb
import optuna
from scipy import stats

np.random.seed(2112)

In [2]:
def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes
    
    Parameters
    ----------
    df: pandas.Dataframe
    verbose: Boolean
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

***
## load models

In [3]:
# 0.1409
files = glob("../input/ump-artifacts/lgbm-dart/lgbm-seed*.txt")
display(files)

boosters_lgbm_dart = [lgb.Booster(model_file=fn) for fn in files]
display(boosters_lgbm_dart)

['../input/ump-artifacts/lgbm-dart/lgbm-seed2.txt',
 '../input/ump-artifacts/lgbm-dart/lgbm-seed5.txt',
 '../input/ump-artifacts/lgbm-dart/lgbm-seed3.txt',
 '../input/ump-artifacts/lgbm-dart/lgbm-seed7.txt',
 '../input/ump-artifacts/lgbm-dart/lgbm-seed11.txt']

[<lightgbm.basic.Booster at 0x7f4de3429c10>,
 <lightgbm.basic.Booster at 0x7f4e2e5cc4d0>,
 <lightgbm.basic.Booster at 0x7f4e2e5cc850>,
 <lightgbm.basic.Booster at 0x7f4de309c290>,
 <lightgbm.basic.Booster at 0x7f4de309c390>]

In [4]:
# 0.1419
files = glob("../input/ump-artifacts/lgbm-linear-dart/lgbm-seed*.txt")
display(files)

boosters_lgbm_linear_dart = [lgb.Booster(model_file=fn) for fn in files]
display(boosters_lgbm_linear_dart)

['../input/ump-artifacts/lgbm-linear-dart/lgbm-seed2.txt',
 '../input/ump-artifacts/lgbm-linear-dart/lgbm-seed19.txt',
 '../input/ump-artifacts/lgbm-linear-dart/lgbm-seed23.txt',
 '../input/ump-artifacts/lgbm-linear-dart/lgbm-seed7.txt',
 '../input/ump-artifacts/lgbm-linear-dart/lgbm-seed11.txt']

[<lightgbm.basic.Booster at 0x7f4e2e5e7ed0>,
 <lightgbm.basic.Booster at 0x7f4de306bd90>,
 <lightgbm.basic.Booster at 0x7f4de3429c50>,
 <lightgbm.basic.Booster at 0x7f4e2e5e7d90>,
 <lightgbm.basic.Booster at 0x7f4e2e5e7850>]

In [5]:
# 0.1390
files = glob("../input/ump-artifacts/xgboost-gbrt/xgb-seed*.json")
display(files)

boosters_xgb_gbrt = list()
for fn in files:
    _model = xgb.Booster()
    _model.load_model(fn)
    boosters_xgb_gbrt.append(_model)
display(boosters_xgb_gbrt)

['../input/ump-artifacts/xgboost-gbrt/xgb-seed11.json',
 '../input/ump-artifacts/xgboost-gbrt/xgb-seed2.json',
 '../input/ump-artifacts/xgboost-gbrt/xgb-seed23.json',
 '../input/ump-artifacts/xgboost-gbrt/xgb-seed19.json',
 '../input/ump-artifacts/xgboost-gbrt/xgb-seed7.json']

[<xgboost.core.Booster at 0x7f4de3073190>,
 <xgboost.core.Booster at 0x7f4de68ad290>,
 <xgboost.core.Booster at 0x7f4de309c450>,
 <xgboost.core.Booster at 0x7f4de3073510>,
 <xgboost.core.Booster at 0x7f4de3073550>]

***
## model blending on newest data

In [6]:
feat_names = [f"f_{i}" for i in range(300)]
col_names = feat_names + ["target"]

supp_data = pd.read_csv("../input/ubiquant-market-prediction/supplemental_train.csv", usecols=col_names)
supp_data = reduce_mem_usage(supp_data, verbose=True)
gc.collect()

Mem. usage decreased to 1082.73 Mb (50.0% reduction)


30

In [7]:
def predict(boosters, dataframe, backend="lightgbm"):
    features = [f"f_{i}" for i in range(300)]
    if backend == "lightgbm":
        preds = [model.predict(dataframe[features]) for model in boosters]
    elif backend == "xgboost":
        dmatrix = xgb.DMatrix(data=dataframe[features])
        preds = [model.predict(dmatrix) for model in boosters]
    elif backend == "catboost":
        pass
    return np.mean(preds, axis=0)

In [8]:
%%time
supp_data["lgbm_gbrt_dart"] = predict(boosters_lgbm_dart, supp_data, backend="lightgbm")
supp_data["lgbm_linear_dart"] = predict(boosters_lgbm_linear_dart, supp_data, backend="lightgbm")
supp_data["xgb_gbrt"] = predict(boosters_xgb_gbrt, supp_data, backend="xgboost")

CPU times: user 1h 2min 38s, sys: 7.27 s, total: 1h 2min 45s
Wall time: 16min 6s


In [9]:
X = supp_data[["lgbm_gbrt_dart","lgbm_linear_dart","xgb_gbrt"]].values
y = supp_data["target"].values

In [10]:
def objective(trial):
    w1 = trial.suggest_uniform("w1", 0, 1)
    w2 = trial.suggest_uniform("w2", 0, 1)
    w3 = trial.suggest_uniform("w3", 0, 1)
    total_weight = w1+w2+w3
    
    blend_preds = (w1*X[:,0] + w2*X[:,1] + w3*X[:,2]) / total_weight
    
    # calculate person correlation
    return stats.pearsonr(y, blend_preds)[0]

In [11]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

study = optuna.create_study(
    study_name="optimal_blend",
    direction='maximize',
)
study.optimize(
    objective, 
    n_trials=10000, 
    timeout=3600,
    n_jobs=1,
    show_progress_bar=True
) 

  self._init_valid()


  0%|          | 0/10000 [00:00<?, ?it/s]

In [12]:
study.trials_dataframe().sort_values("value", ascending=False)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_w1,params_w2,params_w3,state
8557,8557,0.002175,2022-04-20 22:02:06.716803,2022-04-20 22:02:07.050718,0 days 00:00:00.333915,0.850704,0.239817,0.000003,COMPLETE
9110,9110,0.002175,2022-04-20 22:05:19.588474,2022-04-20 22:05:19.825146,0 days 00:00:00.236672,0.786069,0.221156,0.000007,COMPLETE
7162,7162,0.002175,2022-04-20 21:54:32.417771,2022-04-20 21:54:32.617799,0 days 00:00:00.200028,0.829492,0.232689,0.000011,COMPLETE
6986,6986,0.002175,2022-04-20 21:53:39.983419,2022-04-20 21:53:40.306079,0 days 00:00:00.322660,0.803509,0.227033,0.000012,COMPLETE
6674,6674,0.002175,2022-04-20 21:52:19.849352,2022-04-20 21:52:20.121198,0 days 00:00:00.271846,0.813491,0.222354,0.000016,COMPLETE
...,...,...,...,...,...,...,...,...,...
1,1,0.001755,2022-04-20 21:34:19.918417,2022-04-20 21:34:19.934258,0 days 00:00:00.015841,0.137078,0.357924,0.788384,COMPLETE
5066,5066,0.001719,2022-04-20 21:44:46.562808,2022-04-20 21:44:46.715894,0 days 00:00:00.153086,0.185114,0.157057,0.749482,COMPLETE
20,20,0.001709,2022-04-20 21:34:20.358992,2022-04-20 21:34:20.384989,0 days 00:00:00.025997,0.293225,0.116516,0.997201,COMPLETE
67,67,0.001709,2022-04-20 21:34:21.758835,2022-04-20 21:34:21.786935,0 days 00:00:00.028100,0.180725,0.234716,0.943356,COMPLETE


In [13]:
blend_params = dict(study.best_params)
blend_params

{'w1': 0.8507043739249086,
 'w2': 0.2398174716309868,
 'w3': 3.243916114959873e-06}

***
## inference

In [14]:
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()

In [15]:
for (test_df, sample_prediction_df) in iter_test:
    preds1 = predict(boosters_lgbm_dart, test_df, backend="lightgbm")
    preds2 = predict(boosters_lgbm_linear_dart, test_df, backend="lightgbm")
    preds3 = predict(boosters_xgb_gbrt, test_df, backend="xgboost")
    
    blend_preds  = blend_params["w1"]*preds1 + blend_params["w2"]*preds2 + blend_params["w3"]*preds3
    blend_preds = blend_preds / sum(blend_params.values())
    
    sample_prediction_df['target'] = blend_preds
    env.predict(sample_prediction_df) 
    display(sample_prediction_df)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,row_id,target
0,1220_1,-0.141269
1,1220_2,-0.110443


Unnamed: 0,row_id,target
0,1221_0,-0.10954
1,1221_1,-0.140778
2,1221_2,-0.136296


Unnamed: 0,row_id,target
0,1222_0,-0.07262
1,1222_1,-0.140486
2,1222_2,-0.115629


Unnamed: 0,row_id,target
0,1223_0,-0.108669


***