## Import library

In [1]:
import lightgbm as lgb

from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc,roc_curve,log_loss

import numpy as np
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel

from tqdm import tqdm_notebook


pd.options.display.max_columns = 200

## Data import

In [8]:
train_gal = pd.read_pickle('../input/train_gal_20181214.pickle')
train_ex_gal = pd.read_pickle('../input/train_ex_gal_20181214.pickle')
test_gal = pd.read_pickle('../input/test_gal_20181214.pickle')
test_ex_gal = pd.read_pickle('../input/test_ex_gal_20181214.pickle')

In [9]:
train_sixth_band = pd.read_pickle('../input/train_sixth_byband.pickle')
test_sixth_band = pd.read_pickle('../input/test_sixth_byband.pickle')

In [10]:
train_fix_band_0_stats = pd.read_pickle('../input/train_band_0_fix_stats.pickle')
test_fix_band_0_stats = pd.read_pickle('../input/test_band_0_fix_stats.pickle')
train_fix_band_1_stats = pd.read_pickle('../input/train_band_1_fix_stats.pickle')
test_fix_band_1_stats = pd.read_pickle('../input/test_band_1_fix_stats.pickle')

In [11]:
train_gal = pd.merge(train_gal,train_fix_band_0_stats,on='object_id',how='left')
train_gal = pd.merge(train_gal,train_fix_band_1_stats,on='object_id',how='left')

train_ex_gal = pd.merge(train_ex_gal,train_sixth_band,on='object_id',how='left')
train_ex_gal = pd.merge(train_ex_gal,train_fix_band_0_stats,on='object_id',how='left')
train_ex_gal = pd.merge(train_ex_gal,train_fix_band_1_stats,on='object_id',how='left')



In [12]:
test_gal = pd.merge(test_gal,test_fix_band_0_stats,on='object_id',how='left')
test_gal = pd.merge(test_gal,test_fix_band_1_stats,on='object_id',how='left')

test_ex_gal = pd.merge(test_ex_gal,test_sixth_band,on='object_id',how='left')
test_ex_gal = pd.merge(test_ex_gal,test_fix_band_0_stats ,on='object_id',how='left')
test_ex_gal = pd.merge(test_ex_gal,test_fix_band_1_stats ,on='object_id',how='left')

In [13]:
train_ex_gal.head(3)

Unnamed: 0,index,object_id,gal_b,hostgal_photoz,hostgal_photoz_err,target,haversine,latlon1,diff_hostgal,diff_hostgal_abs,passband_0_ratio_detect,passband_1_ratio_detect,passband_2_ratio_detect,passband_3_ratio_detect,passband_4_ratio_detect,passband_5_ratio_detect,passband_0_ratio,passband_1_ratio,passband_2_ratio,passband_3_ratio,passband_4_ratio,passband_5_ratio,ratio_by_band_median,ratio_by_band_var,flux_max_detected,flux_min_detected,flux_mean_detected,flux_median_detected,flux_skew_detected,flux_max_min_detected,flux_max,flux_min,flux_mean,flux_median,flux_skew,flux_max_min,detected_mean,mjd_diff,mjd_halfflux,mjd_sixthflux,diff_baba_mjd,max_flux_0_1,"flux__fft_coefficient__coeff_0__attr_""abs""","flux__fft_coefficient__coeff_1__attr_""abs""",flux__longest_strike_above_mean,flux__number_crossing_m__m_1,symm,"0__fft_coefficient__coeff_0__attr_""abs""","0__fft_coefficient__coeff_1__attr_""abs""",0__longest_strike_above_mean,0__number_crossing_m__m_1,0__ratio_beyond_r_sigma__r_0.5,0__sample_entropy,"1__fft_coefficient__coeff_0__attr_""abs""","1__fft_coefficient__coeff_1__attr_""abs""",1__longest_strike_above_mean,1__number_crossing_m__m_1,1__ratio_beyond_r_sigma__r_0.5,1__sample_entropy,"2__fft_coefficient__coeff_0__attr_""abs""","2__fft_coefficient__coeff_1__attr_""abs""",2__longest_strike_above_mean,2__number_crossing_m__m_1,2__ratio_beyond_r_sigma__r_0.5,2__sample_entropy,"3__fft_coefficient__coeff_0__attr_""abs""","3__fft_coefficient__coeff_1__attr_""abs""",3__longest_strike_above_mean,3__number_crossing_m__m_1,3__ratio_beyond_r_sigma__r_0.5,3__sample_entropy,"4__fft_coefficient__coeff_0__attr_""abs""","4__fft_coefficient__coeff_1__attr_""abs""",4__longest_strike_above_mean,4__number_crossing_m__m_1,4__ratio_beyond_r_sigma__r_0.5,4__sample_entropy,"5__fft_coefficient__coeff_0__attr_""abs""","5__fft_coefficient__coeff_1__attr_""abs""",5__longest_strike_above_mean,5__number_crossing_m__m_1,5__ratio_beyond_r_sigma__r_0.5,5__sample_entropy,flux_ratio_sq_sum,flux_ratio_sq_skew,flux_by_flux_ratio_sq_sum,flux_by_flux_ratio_sq_skew,flux_dif,flux_dif2,flux_w_mean,flux_dif3,"flux_max__fft_coefficient__coeff_0__attr_""abs""","flux_max__fft_coefficient__coeff_1__attr_""abs""",flux_max__kurtosis,flux_max__skewness,"flux_min__fft_coefficient__coeff_0__attr_""abs""","flux_min__fft_coefficient__coeff_1__attr_""abs""",flux_min__kurtosis,flux_min__skewness,"flux_mean__fft_coefficient__coeff_0__attr_""abs""",...,flux_err_rate_min,flux_err_rate_mean,flux_err_rate_median,flux_err_rate_var,flux_err_rate_skew,flux_err_rate_max_min,0__kurtosis,0__skewness,1__kurtosis,1__skewness,2__kurtosis,2__skewness,3__kurtosis,3__skewness,4__kurtosis,4__skewness,5__kurtosis,5__skewness,0__maximum,0__mean,0__median,0__minimum,0__standard_deviation,1__maximum,1__mean,1__median,1__minimum,1__standard_deviation,2__maximum,2__mean,2__median,2__minimum,2__standard_deviation,3__maximum,3__mean,3__median,3__minimum,3__standard_deviation,4__maximum,4__mean,4__median,4__minimum,4__standard_deviation,5__maximum,5__mean,5__median,5__minimum,5__standard_deviation,diff_flux__kurtosis_diff,diff_flux__maximum_diff,diff_flux__mean_diff,diff_flux__median_diff,diff_flux__minimum_diff,diff_flux__skewness_diff,diff_flux__standard_deviation_diff,diff_flux__maximum_diff_det,diff_flux__median_diff_det,diff_flux__minimum_diff_det,0__maximum_diff,0__median_diff,0__minimum_diff,1__maximum_diff,1__median_diff,1__minimum_diff,2__maximum_diff,2__median_diff,2__minimum_diff,3__maximum_diff,3__median_diff,3__minimum_diff,4__maximum_diff,4__median_diff,4__minimum_diff,5__maximum_diff,5__median_diff,5__minimum_diff,sixth_band_0,sixth_band_1,sixth_band_2,sixth_band_3,sixth_band_4,sixth_band_5,second_band_0,second_band_1,second_band_2,second_band_3,second_band_4,second_band_5,band_0_flux_max,band_0_flux_min,band_0_flux_mean,band_0_flux_med,band_0_flux_skew,band_0_flux_std,band_1_flux_max,band_1_flux_min,band_1_flux_mean,band_1_flux_med,band_1_flux_skew,band_1_flux_std
0,0,615,-51.753706,0.0,0.0,92,0.319006,-1.528827,0.0,0.0,0.162162,0.168168,0.171171,0.171171,0.171171,0.156156,0.178977,0.164773,0.164773,0.164773,0.164773,0.161932,0.164773,3.8e-05,660.626343,-1100.440063,-130.266769,-102.768921,-0.291382,7.473675,0.946023,660.626343,-1100.440063,-123.097,-89.477524,-0.34954,7.473675,873.792969,0.015625,13.996094,13.980469,-535.443542,43330.143254,15303.668045,19.0,61.0,0.985612,205.036927,1628.427737,5.0,18.0,0.793651,2.01644,22370.594833,2806.374162,2.0,21.0,0.793103,1.933884,7780.50081,2805.59811,2.0,25.0,0.741379,1.959309,7024.003069,2536.068845,3.0,25.0,0.758621,1.861271,3245.36635,2741.539788,5.0,23.0,0.775862,1.921409,2704.641263,2893.344216,5.0,23.0,0.77193,1.886318,-960176600.0,-1.414322,2929669.0,0.812722,7.473675,-14.306331,-327.74234,-5.373326,7.864678,6.281597,0.314536,-0.527777,8.094029,6.729821,1.487434,-0.763272,6.61688,...,-3.925288,-4.159871,1.146181,1.219583,2.371641,1.203312,-1.475181,0.128917,-1.255123,0.415581,-1.409885,0.339918,-1.449858,0.293128,-1.548319,0.200096,-1.59282,0.125269,125.182808,-3.254554,-10.015225,-116.913223,83.275841,660.626343,-385.699921,-488.057983,-1100.440063,596.576904,611.984558,-134.146576,-265.686005,-681.858887,451.180817,445.737061,-121.1035,-162.170944,-530.644592,332.520874,381.953735,-55.95459,-103.541367,-422.184509,289.276978,378.188141,-47.449844,-85.524307,-422.815094,292.182281,1.561865,1007.002441,202.114075,107.974808,0.076019,1.432323,210.771271,1007.002441,126.217438,0.076019,191.556,39.526798,0.170685,1745.011963,573.419739,4.854187,1280.46814,412.171204,6.889648,879.547913,347.493164,1.488678,767.241089,252.360184,4.495239,734.515442,214.830948,5.91217,3.011719,241.230469,241.230469,16.023438,3.003906,3.0,2.003906,241.230469,241.230469,13.054688,3.003906,3.0,125.1828,-116.9132,-3.254554,-10.01523,0.128917,83.94473,660.6263,-1100.44,-259.9232,-349.215576,0.18558,546.0072
1,1,713,-54.460748,1.6267,0.2552,88,1.698939,3.258921,0.1914,0.1914,0.166667,0.25,0.25,0.25,0.083333,0.0,0.2,0.16,0.16,0.16,0.16,0.16,0.16,0.000267,11.330316,-12.394593,-6.061436,-8.975166,1.308961,3.166525,0.171429,14.770886,-14.735178,-1.423351,-0.873032,0.014989,3.384596,846.800781,3.042969,3.042969,0.0,5.380808,498.172763,1258.252791,50.0,47.0,0.986149,190.427854,299.58656,33.0,9.0,0.757143,2.179983,57.109049,192.53923,15.0,7.0,0.678571,2.335591,44.477326,191.057527,20.0,7.0,0.767857,2.124602,55.270115,212.522263,20.0,9.0,0.767857,2.146581,50.414646,203.892482,19.0,7.0,0.821429,2.221544,100.473774,143.963093,8.0,15.0,0.732143,2.302585,-28750.87,-3.454554,5886.068,3.439423,3.384596,-20.730003,-4.884563,-6.040676,4.24988,1.878509,-1.843684,0.590616,4.322959,1.758719,-0.496756,0.102398,2.106064,...,-0.756345,-0.927378,1.059688,0.985793,2.137273,1.898203,-1.014003,0.260052,-1.09717,-0.087865,-1.188472,-0.022678,-1.142896,-0.167176,-1.190245,-0.064134,-0.797047,0.218182,14.509829,-2.720398,-3.096805,-14.735178,7.062516,9.129021,-1.019804,-0.561736,-11.715749,5.661101,10.529041,-0.794238,-0.117977,-10.067919,5.718981,11.330316,-0.986966,-0.073897,-12.394593,6.392561,9.827934,-0.900262,-0.792176,-12.286801,6.349526,14.770886,-1.794175,-2.463012,-14.211164,7.030447,4.634613,17.386347,2.935177,2.075944,0.004893,1.955968,2.98225,11.521006,1.08427,0.017147,11.286276,2.835078,0.004893,9.559752,1.941535,0.024763,10.25297,1.51929,0.007987,8.45234,2.845063,0.155172,14.169703,2.33557,0.017561,18.792925,4.343699,0.085589,30.027344,26.027344,26.019531,40.039062,26.023438,15.078125,1.0,17.007812,8.003906,14.011719,11.003906,3.09375,5640157000.0,-5426431000.0,-685092600.0,-940483200.0,0.218182,2708821000.0,,,,,,
2,2,730,-61.548219,0.2262,0.0157,42,1.81803,3.128522,0.0058,0.0058,0.0,0.0,0.304348,0.304348,0.217391,0.173913,0.218182,0.157576,0.157576,0.157576,0.154545,0.154545,0.157576,0.000639,47.310059,2.112415,24.131868,20.994711,0.081988,3.811045,0.069697,47.310059,-19.159811,2.267434,0.409172,3.177854,4.196749,78.773438,2.964844,2.972656,0.007812,0.249057,748.253238,680.616684,13.0,113.0,0.936685,3.46179,4.729538,6.0,27.0,0.666667,2.200752,7.334944,13.515895,5.0,17.0,0.5,2.133676,124.84525,119.500255,9.0,17.0,0.480769,1.149667,168.280524,162.799417,9.0,18.0,0.403846,1.309389,219.745132,202.532898,9.0,19.0,0.372549,1.307157,231.509178,199.286369,7.0,20.0,0.431373,1.813163,104650.2,5.989138,4124.452,5.480405,4.196749,29.315018,25.37311,2.619698,5.041308,4.134364,-2.098943,-0.102671,3.692219,2.832126,5.221543,-2.250251,2.679523,...,0.253669,0.171545,1.487815,0.05356,2.359559,3.253634,0.474215,0.35691,0.976374,0.471342,5.13129,2.385066,7.125666,2.662075,6.081065,2.537802,3.58313,1.680352,5.942166,-0.04808,0.024093,-3.45996,1.816127,5.693109,0.141057,0.171335,-3.39308,1.789767,20.994711,2.40087,0.491791,-2.848838,5.505767,33.572102,3.236164,0.660402,-5.435799,8.112836,41.159981,4.308728,1.004354,-5.83631,10.604821,47.310059,4.539396,2.542647,-19.159811,13.201397,12.301396,39.955154,4.227614,2.431124,0.017004,3.12489,5.339545,27.859081,8.11568,0.547215,6.123136,1.744857,0.017004,5.713929,1.485067,0.112996,20.706535,1.279887,0.046416,32.369781,2.275236,0.059754,46.211815,2.600007,0.010972,60.048317,6.930541,0.020015,1.023438,9.96875,78.773438,72.792969,78.773438,69.8125,1.023438,9.96875,60.820312,41.890625,47.871094,57.84375,8839063.0,-5268061.0,219003.1,266013.7,0.471342,2805885.0,32596170.0,-4423077.0,3727567.0,763548.915109,2.385066,8631597.0


In [14]:
train_ex_gal['max_fix_flux_diff_0_1'] = train_ex_gal['band_0_flux_max'] - train_ex_gal['band_1_flux_max']

In [15]:
test_ex_gal['max_fix_flux_diff_0_1'] = test_ex_gal['band_0_flux_max'] - test_ex_gal['band_1_flux_max']

In [16]:
def check_null(data):
    print(data.isnull().sum())

pd.set_option('display.max_rows', 500)
#print(len(train_extra_galaxy))
#print(len(test_extra_galaxy))
#print(check_null(train_gal))
#print(check_null(train_oof_summary))
#print(check_null(test_oof_summary))
#print(check_null(train_ex_gal))
#print(check_null(test_ex_gal))
#print(check_null(test_gal))
#print(check_null(test_extra_galaxy))

In [17]:
#drop more feature
drop_list = ['0__minimum_diff','1__minimum_diff','2__minimum_diff','3__minimum_diff','4__minimum_diff','5__minimum_diff'
             ,'diff_flux__minimum_diff','diff_flux__minimum_diff_det','index','flux_dif']

In [18]:
train_ex_gal = train_ex_gal.drop(drop_list,axis=1)
test_ex_gal = test_ex_gal.drop(drop_list,axis=1)

In [19]:
train_gal = train_gal.replace([np.inf, -np.inf], np.nan)
train_ex_gal = train_ex_gal.replace([np.inf, -np.inf], np.nan)

train_gal_mean = train_gal.mean(axis=0)
train_ex_gal_mean = train_ex_gal.mean(axis=0)

train_gal.fillna(train_gal_mean, inplace=True)
train_ex_gal.fillna(train_ex_gal_mean, inplace=True)

In [20]:
# add mean → good result

test_gal = test_gal.replace([np.inf, -np.inf], np.nan)
test_ex_gal = test_ex_gal.replace([np.inf, -np.inf], np.nan)

test_gal = test_gal.fillna(train_gal_mean)
test_ex_gal = test_ex_gal.fillna(train_ex_gal_mean)

In [21]:
import sys
# check used_momery
print(train_gal.shape,"行×列")
print("train_gal:使用メモリ",round(sys.getsizeof(train_gal) / 1073741824,2),"GB")

print(train_ex_gal.shape,"行×列")
print("train_gal:使用メモリ",round(sys.getsizeof(train_ex_gal) / 1073741824,2),"GB")

(7848, 167) 行×列
train_gal:使用メモリ 0.01 GB
(7848, 210) 行×列
train_gal:使用メモリ 0.01 GB


In [22]:
import sys
# check used_momery
print(test_gal.shape,"行×列")
print("test:使用メモリ",round(sys.getsizeof(test_gal) / 1073741824,2),"GB")

print(test_ex_gal.shape,"行×列")
print("test:使用メモリ",round(sys.getsizeof(test_ex_gal) / 1073741824,2),"GB")

(3492890, 166) 行×列
test:使用メモリ 3.92 GB
(3492890, 209) 行×列
test:使用メモリ 4.89 GB


## modeling_lgbm

In [23]:
# split galaxica
train_galaxy = train_gal[train_gal['hostgal_photoz'] == 0]
train_extra_galaxy = train_ex_gal[train_ex_gal['hostgal_photoz'] != 0]

In [24]:
test_galaxy = test_gal[test_gal['hostgal_photoz'] == 0]
test_extra_galaxy = test_ex_gal[test_ex_gal['hostgal_photoz'] != 0]

In [25]:
gal_y = train_galaxy['target']
ex_gal_y = train_extra_galaxy['target']
train_gal_id = train_galaxy['object_id']
train_ex_gal_id = train_extra_galaxy['object_id']

del train_galaxy['target'],train_extra_galaxy['target']

In [26]:
del train_extra_galaxy['object_id'],train_galaxy['object_id']

In [27]:
test_gal_id = test_galaxy['object_id']
test_ex_gal_id = test_extra_galaxy['object_id']
del test_galaxy['object_id']

In [28]:
del test_extra_galaxy['object_id']

In [29]:
# binary val_galaxy
from sklearn.preprocessing import OneHotEncoder
target_enc = OneHotEncoder()
gal_X = target_enc.fit_transform(gal_y.values.reshape(-1,1)).toarray()
ex_gal_X = target_enc.fit_transform(ex_gal_y.values.reshape(-1,1)).toarray()
gal_class_binary = pd.DataFrame(gal_X, columns = ["target_6","target_16","target_53","target_65","target_92"])
ex_gal_class_binary = pd.DataFrame(ex_gal_X, columns = ["target_15","target_42","target_52","target_62","target_64","target_67","target_88","target_90","target_95"])

In [49]:
def train_binary_classifiers(full_train=None, y=None, seed=1):

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    clfs = []
    logloss = []
    importances = pd.DataFrame()
    # conclusion:defalut is great!
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        #'num_class': 14,
        'metric': 'logloss', #1214 change logloss
        'learning_rate': 0.005, #def:0.03
        'subsample': .8, #def:.8
        'colsample_bytree': .7, #def:.7
        'reg_alpha': .01, #def:.01 
        'reg_lambda': .01, #def:.01 
        'min_split_gain': 0.01, #def:0.01
        'min_child_weight': 10, # def:10
        'n_estimators': 100000,
        'silent': -1,
        'verbose': -1,
        'max_depth': 7,
        #'sample_pos_weight':pos_weight
    }
    
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))
    
    # photoz_split
    target_photoz = pd.cut(full_train.hostgal_photoz, 10)
    le = preprocessing.LabelEncoder()
    le.fit(target_photoz)
    target_photoz = le.transform(target_photoz) 
    
    #for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    for fold_, (trn_, val_) in enumerate(folds.split(target_photoz, target_photoz)):    
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            #eval_metric=lgb_multi_weighted_logloss,
            eval_metric='binary_logloss',
            verbose=2000,
            early_stopping_rounds=100,
            #sample_weight=trn_y.map(weights)
            #callbacks=[lgb.reset_parameter(max_depth=[4] * 100 + [2] * 300 + [1] * 3600)]
            #callbacks=[lgb.reset_parameter(
            #    learning_rate=[0.01] * 100 + [0.03] * 1000 + [0.05] * 2900
            #    ,max_depth=[5] * 100 + [3] * 1000 + [2] * 2900
            #)]
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        #get_logger().info(multi_weighted_logloss(val_y, clf.predict_proba(val_x, num_iteration=clf.best_iteration_)))

        imp_df = pd.DataFrame()
        imp_df['feature'] = full_train.columns
        imp_df['gain'] = clf.feature_importances_
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

        clfs.append(clf)
    
    logloss = log_loss(y, oof_preds)
    #fpr, tpr, thresholds = roc_curve(y, oof_preds[:,1], pos_label=1)
    print('total_logloss : %.5f ' % logloss)
    #auc_class = auc(fpr, tpr)
    
    return clfs, importances, oof_preds, logloss


In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
def save_importances(importances_,layout=True):
    mean_gain = importances_[['gain', 'feature']].groupby('feature').mean()
    importances_['mean_gain'] = importances_['feature'].map(mean_gain['gain'])
    if layout==True:
        plt.figure(figsize=(8, 50))
        sns.barplot(x='gain', y='feature', data=importances_.sort_values('mean_gain', ascending=False))
        plt.tight_layout()
    else:
        None

In [46]:
def galaxy_model(galaxy='gal',pred=False, layout=False, seed_list=[1,2]):
    importance_dm = pd.DataFrame()
    oof_merge = pd.DataFrame()
    preds_merge = pd.DataFrame()
    oof_total = pd.DataFrame()
    preds_total = pd.DataFrame()
    logloss_total = []
    
    lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    #'num_class': 14,
    'metric': 'logloss', #1214 change logloss
    'learning_rate': 0.03, #def:0.03
    'subsample': .8, #def:.8
    'colsample_bytree': .7, #def:.7
    'reg_alpha': .01, #def:.01 
    'reg_lambda': .01, #def:.01 
    'min_split_gain': 0.01, #def:0.01
    'min_child_weight': 10, # def:10
    'n_estimators': 100000,
    'silent': -1,
    'verbose': -1,
    'max_depth': 3,
    }
    clf = lgb.LGBMClassifier(**lgb_params)
    
    # seed averaging
    for seed_num in seed_list:
        print('------------ seed {} start ------------'.format(seed_num))
        
        if galaxy=='gal':
            for i in range(gal_class_binary.shape[1]):

                y = gal_class_binary.iloc[:,i]
                
                clfs, importances, oof_preds, logloss = train_binary_classifiers(train_galaxy, y, seed_num)
                oof_preds = pd.DataFrame(oof_preds[:,1])
                save_importances(importances_=importances,layout=layout)
                importances = importances[['feature','mean_gain']].drop_duplicates()
                importances.columns = ['feature',y.to_frame().columns[0]]

                if pred == True:
                    preds_ = None
                    for clf in tqdm_notebook(clfs):
                        if preds_ is None:
                            preds_ = clf.predict_proba(test_galaxy) / len(clfs)
                        else:
                            preds_ += clf.predict_proba(test_galaxy) / len(clfs)
                    preds_ = pd.DataFrame(preds_[:,1])

                if i == 0:
                    importance_dm = importances
                    oof_total = oof_preds

                    if pred == True:
                        preds_total = preds_
                else:
                    importance_dm = pd.merge(importance_dm,importances,on='feature',how='left')
                    oof_total = pd.merge(oof_total,oof_preds,left_index=True,right_index=True) 
                    if pred == True:
                        preds_total = pd.merge(preds_total,preds_,left_index=True,right_index=True)
                logloss_total.append(logloss)

            oof_total.columns = ["target_6","target_16","target_53","target_65","target_92"]

            if pred == True:
                preds_total.columns = ["target_6","target_16","target_53","target_65","target_92"]

        if galaxy=='ex_gal':        
            for i in range(ex_gal_class_binary.shape[1]):
                y = ex_gal_class_binary.iloc[:,i]
                
                #selector
                #lgb_selector = SelectFromModel(clf)
                #lgb_selector.fit(train_extra_galaxy, y)
                #feature_idx = lgb_selector.get_support() 
                #feature_name = train_extra_galaxy.columns[feature_idx]

                clfs, importances, oof_preds, logloss = train_binary_classifiers(train_extra_galaxy, y, seed_num)

                oof_preds = pd.DataFrame(oof_preds[:,1])
                save_importances(importances_=importances,layout=layout)
                importances = importances[['feature','mean_gain']].drop_duplicates()
                importances.columns = ['feature',y.to_frame().columns[0]]

                if pred == True:
                    preds_ = None

                    for clf in tqdm_notebook(clfs):
                        if preds_ is None:
                            preds_ = clf.predict_proba(test_extra_galaxy) / len(clfs)
                        else:
                            preds_ += clf.predict_proba(test_extra_galaxy) / len(clfs)

                    preds_ = pd.DataFrame(preds_[:,1])

                if i == 0:
                    importance_dm = importances
                    oof_total = oof_preds
                    if pred == True:
                        preds_total = preds_
                else:
                    importance_dm = pd.merge(importance_dm,importances,on='feature',how='left')
                    oof_total = pd.merge(oof_total,oof_preds,left_index=True,right_index=True) 
                    if pred == True:
                        preds_total = pd.merge(preds_total,preds_,left_index=True,right_index=True)
                logloss_total.append(logloss)

            oof_total.columns = ["target_15","target_42","target_52","target_62","target_64","target_67","target_88","target_90","target_95"]
            if pred == True:
                preds_total.columns = ["target_15","target_42","target_52","target_62","target_64","target_67","target_88","target_90","target_95"]
    
        # merge seed_result
        if seed_num == 1:
            oof_merge = oof_total / len(seed_list)
            if pred == True:
                preds_merge = preds_total / len(seed_list)
        else:
            oof_merge += oof_total / len(seed_list)
            if pred == True:
                preds_merge += preds_total / len(seed_list)

    if pred == True:    
        #return importance_dm,oof_total,preds_total,logloss_total
        return importance_dm,oof_merge,preds_merge,logloss_total
    else:
        #return importance_dm,oof_total,logloss_total
        return importance_dm,oof_merge,logloss_total

In [280]:
%%time
# 予測
print('---------------  start gal_model pred  ---------------')
importances_gal,oof_gal_total,preds_gal,logloss_gal = galaxy_model(galaxy='gal',pred=True,layout=False,seed_list=[1])

print('---------------  start ex_gal_model pred  ---------------')
importances_ex_gal,oof_ex_gal_total,preds_ex_gal,logloss_ex_gal = galaxy_model(galaxy='ex_gal',pred=True,layout=False,seed_list=[1])

print('---------------  done model pred  ---------------')
print(np.round(logloss_gal, decimals=4))
print('total_gal_logloss:',np.array(logloss_gal).mean())
print(np.round(logloss_ex_gal, decimals=4))
print('total_ex_gal_logloss:',np.array(logloss_ex_gal).mean())

---------------  start gal_model pred  ---------------
------------ seed 1 start ------------
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0146161	valid_1's binary_logloss: 0.0351148
Early stopping, best iteration is:
[961]	training's binary_logloss: 0.0146161	valid_1's binary_logloss: 0.0351148
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[846]	training's binary_logloss: 0.0144954	valid_1's binary_logloss: 0.0354625
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0146415	valid_1's binary_logloss: 0.0280187
Early stopping, best iteration is:
[978]	training's binary_logloss: 0.0146415	valid_1's binary_logloss: 0.0280187
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0157181	valid_1's binary_logloss: 0.0349502
Early stopping, best iteration is:
[962]	training's binary_logloss: 0.015718

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[630]	training's binary_logloss: 0.0139718	valid_1's binary_logloss: 0.039746
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[641]	training's binary_logloss: 0.0134806	valid_1's binary_logloss: 0.0327738
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[630]	training's binary_logloss: 0.0132439	valid_1's binary_logloss: 0.0364806
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[705]	training's binary_logloss: 0.0134289	valid_1's binary_logloss: 0.0236402
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[669]	training's binary_logloss: 0.0130456	valid_1's binary_logloss: 0.0440345
total_logloss : 0.03534 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[93]	training's binary_logloss: 0.0186759	valid_1's binary_logloss: 0.0171776
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[107]	training's binary_logloss: 0.0197236	valid_1's binary_logloss: 0.0182755
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[98]	training's binary_logloss: 0.0255409	valid_1's binary_logloss: 0.0343083
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[103]	training's binary_logloss: 0.0224742	valid_1's binary_logloss: 0.0267565
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[101]	training's binary_logloss: 0.0225492	valid_1's binary_logloss: 0.0267664
total_logloss : 0.02466 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[653]	training's binary_logloss: 0.0128052	valid_1's binary_logloss: 0.0228059
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[618]	training's binary_logloss: 0.0129591	valid_1's binary_logloss: 0.0382382
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[662]	training's binary_logloss: 0.0128362	valid_1's binary_logloss: 0.0192374
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[673]	training's binary_logloss: 0.0133222	valid_1's binary_logloss: 0.0252627
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[621]	training's binary_logloss: 0.0131557	valid_1's binary_logloss: 0.0511184
total_logloss : 0.03133 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[226]	training's binary_logloss: 0.0173008	valid_1's binary_logloss: 0.0285877
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[236]	training's binary_logloss: 0.0172757	valid_1's binary_logloss: 0.0249926
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[220]	training's binary_logloss: 0.0174957	valid_1's binary_logloss: 0.030339
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[218]	training's binary_logloss: 0.0176739	valid_1's binary_logloss: 0.0387089
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[225]	training's binary_logloss: 0.0171905	valid_1's binary_logloss: 0.03058
total_logloss : 0.03064 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


---------------  start ex_gal_model pred  ---------------
------------ seed 1 start ------------
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[356]	training's binary_logloss: 0.0360528	valid_1's binary_logloss: 0.146732
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[430]	training's binary_logloss: 0.0323516	valid_1's binary_logloss: 0.112663
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[414]	training's binary_logloss: 0.0322314	valid_1's binary_logloss: 0.113195
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[398]	training's binary_logloss: 0.035321	valid_1's binary_logloss: 0.112292
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[453]	training's binary_logloss: 0.0307952	valid_1's binary_logloss: 0.116346
total_logloss : 0.12028 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[452]	training's binary_logloss: 0.131151	valid_1's binary_logloss: 0.341716
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[650]	training's binary_logloss: 0.102725	valid_1's binary_logloss: 0.32512
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[662]	training's binary_logloss: 0.102364	valid_1's binary_logloss: 0.30296
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[691]	training's binary_logloss: 0.0984422	valid_1's binary_logloss: 0.311352
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[468]	training's binary_logloss: 0.134716	valid_1's binary_logloss: 0.288856
total_logloss : 0.31406 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[189]	training's binary_logloss: 0.0572939	valid_1's binary_logloss: 0.140633
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[293]	training's binary_logloss: 0.0439301	valid_1's binary_logloss: 0.114804
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[116]	training's binary_logloss: 0.0759401	valid_1's binary_logloss: 0.131431
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[353]	training's binary_logloss: 0.0400163	valid_1's binary_logloss: 0.0858366
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[202]	training's binary_logloss: 0.0565483	valid_1's binary_logloss: 0.114969
total_logloss : 0.11757 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[266]	training's binary_logloss: 0.0926742	valid_1's binary_logloss: 0.218991
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[352]	training's binary_logloss: 0.0739532	valid_1's binary_logloss: 0.203305
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[282]	training's binary_logloss: 0.0890792	valid_1's binary_logloss: 0.198076
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[412]	training's binary_logloss: 0.0696298	valid_1's binary_logloss: 0.183542
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[355]	training's binary_logloss: 0.0763176	valid_1's binary_logloss: 0.180187
total_logloss : 0.19686 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[562]	training's binary_logloss: 0.00676155	valid_1's binary_logloss: 0.0163979
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[576]	training's binary_logloss: 0.00683935	valid_1's binary_logloss: 0.0173985
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[482]	training's binary_logloss: 0.00704418	valid_1's binary_logloss: 0.023758
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[603]	training's binary_logloss: 0.00672933	valid_1's binary_logloss: 0.0159724
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[582]	training's binary_logloss: 0.00691102	valid_1's binary_logloss: 0.0178577
total_logloss : 0.01828 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[277]	training's binary_logloss: 0.0408441	valid_1's binary_logloss: 0.0999234
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[273]	training's binary_logloss: 0.040561	valid_1's binary_logloss: 0.0933843
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[335]	training's binary_logloss: 0.03474	valid_1's binary_logloss: 0.0975569
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[251]	training's binary_logloss: 0.0410506	valid_1's binary_logloss: 0.114267
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[285]	training's binary_logloss: 0.0388552	valid_1's binary_logloss: 0.108673
total_logloss : 0.10274 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[184]	training's binary_logloss: 0.00655337	valid_1's binary_logloss: 0.0174031
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[199]	training's binary_logloss: 0.00660881	valid_1's binary_logloss: 0.00439414
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[159]	training's binary_logloss: 0.00678619	valid_1's binary_logloss: 0.0234121
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[197]	training's binary_logloss: 0.00663707	valid_1's binary_logloss: 0.00528497
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[181]	training's binary_logloss: 0.00709975	valid_1's binary_logloss: 0.0114123
total_logloss : 0.01239 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[872]	training's binary_logloss: 0.0772249	valid_1's binary_logloss: 0.286534
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[854]	training's binary_logloss: 0.0780515	valid_1's binary_logloss: 0.328921
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[794]	training's binary_logloss: 0.0885213	valid_1's binary_logloss: 0.298572
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0684267	valid_1's binary_logloss: 0.25339
Early stopping, best iteration is:
[1409]	training's binary_logloss: 0.0437542	valid_1's binary_logloss: 0.247528
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[838]	training's binary_logloss: 0.0786419	valid_1's binary_logloss: 0.280389
total_logloss : 0.28843 


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[263]	training's binary_logloss: 0.0138467	valid_1's binary_logloss: 0.0297861
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0084041	valid_1's binary_logloss: 0.0228241
Early stopping, best iteration is:
[1423]	training's binary_logloss: 0.00733938	valid_1's binary_logloss: 0.0223329
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[368]	training's binary_logloss: 0.0113907	valid_1's binary_logloss: 0.0376164
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.00828575	valid_1's binary_logloss: 0.0183805
Early stopping, best iteration is:
[1409]	training's binary_logloss: 0.00725254	valid_1's binary_logloss: 0.0179664
Training until validation scores don't improve for 50 rounds.
[1000]	training's binary_logloss: 0.0081361	valid_1's binary_logloss

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


---------------  done model pred  ---------------
[0.0378 0.0353 0.0247 0.0313 0.0306]
total_gal_logloss: 0.031957967311322784
[0.1203 0.3141 0.1176 0.1969 0.0183 0.1027 0.0124 0.2884 0.0263]
total_ex_gal_logloss: 0.13298510203787375
CPU times: user 1h 25min 30s, sys: 19min 24s, total: 1h 44min 54s
Wall time: 28min 52s


In [51]:
# check shape
print(oof_gal_total.shape)
print(oof_ex_gal_total.shape)
print(preds_gal.shape)
print(preds_ex_gal.shape)

(2325, 5)
(5523, 9)
(390510, 5)
(3102380, 9)


In [52]:
oof_gal_total['prob_sum'] = oof_gal_total.iloc[:,1:].sum(axis=1)
oof_gal_total['prob_med'] = oof_gal_total.iloc[:,1:].median(axis=1)
oof_ex_gal_total['prob_sum'] = oof_ex_gal_total.iloc[:,1:].sum(axis=1)
oof_ex_gal_total['prob_med'] = oof_ex_gal_total.iloc[:,1:].median(axis=1)

preds_gal['prob_sum'] = preds_gal.iloc[:,1:].sum(axis=1)
preds_gal['prob_med'] = preds_gal.iloc[:,1:].median(axis=1)
preds_ex_gal['prob_sum'] = preds_ex_gal.iloc[:,1:].sum(axis=1)
preds_ex_gal['prob_med'] = preds_ex_gal.iloc[:,1:].median(axis=1)

In [282]:
#importance_ex_gal,oof_ex_gal_total,preds_ex_gal,auc_class_exgal = galaxy_model(galaxy='ex_gal',pred=True)

In [53]:
train_ex_gal_id = train_ex_gal_id.reset_index()
train_gal_id = train_gal_id.reset_index()
test_ex_gal_id = test_ex_gal_id.reset_index()
test_gal_id = test_gal_id.reset_index()
del train_ex_gal_id['index'],train_gal_id['index'],test_ex_gal_id['index'],test_gal_id['index']

In [54]:
oof_gal = pd.merge(train_gal_id,oof_gal_total,left_index=True,right_index=True,how='left')
preds_gal = pd.merge(test_gal_id,preds_gal,left_index=True,right_index=True,how='left')
oof_ex_gal = pd.merge(train_ex_gal_id,oof_ex_gal_total,left_index=True,right_index=True,how='left')
preds_ex_gal = pd.merge(test_ex_gal_id,preds_ex_gal,left_index=True,right_index=True,how='left')

In [55]:
oof_result = oof_gal.append(oof_ex_gal,sort=True).fillna(0)
preds_result = preds_gal.append(preds_ex_gal,sort=True).fillna(0)

In [56]:
print(preds_result.shape)
print(oof_result.shape)

(3492890, 17)
(7848, 17)


In [57]:
preds_result.head(3)

Unnamed: 0,object_id,prob_med,prob_sum,target_15,target_16,target_42,target_52,target_53,target_6,target_62,target_64,target_65,target_67,target_88,target_90,target_92,target_95
0,168,0.004919,1.004749,0.0,0.998177,0.0,0.0,0.000801,0.00031,0.0,0.0,0.004919,0.0,0.0,0.0,0.000853,0.0
1,1063,0.025563,1.011062,0.0,0.000449,0.0,0.0,0.025563,0.006437,0.0,0.0,0.002186,0.0,0.0,0.0,0.982864,0.0
2,2270,0.001267,0.998139,0.0,0.000955,0.0,0.0,0.001128,0.004003,0.0,0.0,0.994789,0.0,0.0,0.0,0.001267,0.0


In [58]:
oof_result.to_pickle('../input/oof_result_20181217_v2.pickle')
preds_result.to_pickle('../input/preds_result_20181217_v2.pickle')