# 1. Importing Packages and Data 

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os
import gc
import time
import pickle
import feather
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
import plotly.figure_factory as ff

In [9]:
def datatypes_pie(data):
    # Create a trace
    colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']
    trace1 = go.Pie(
        labels = ['float64','Int64'],
        values = data.dtypes.value_counts(),
        textfont=dict(size=20),
        marker=dict(colors=colors,line=dict(color='#000000', width=2)), hole = 0.45)
    layout = dict(title = "Data Types Count Percentage")
    data = [trace1]
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

In [2]:
folderPath = 'D:/Competitions/Data/PLAsTiCC_Challengle'

In [7]:
#train = pd.read_csv(os.path.join(folderPath,'training_set.csv.zip'),compression='zip')
train_md = pd.read_csv(os.path.join(folderPath,'training_set_metadata.csv'))
train_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7848 entries, 0 to 7847
Data columns (total 12 columns):
object_id             7848 non-null int64
ra                    7848 non-null float64
decl                  7848 non-null float64
gal_l                 7848 non-null float64
gal_b                 7848 non-null float64
ddf                   7848 non-null int64
hostgal_specz         7848 non-null float64
hostgal_photoz        7848 non-null float64
hostgal_photoz_err    7848 non-null float64
distmod               5523 non-null float64
mwebv                 7848 non-null float64
target                7848 non-null int64
dtypes: float64(9), int64(3)
memory usage: 735.8 KB


In [4]:
test_md = pd.read_csv(os.path.join(folderPath,'test_set_metadata.csv.zip'),compression='zip')
test_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3492890 entries, 0 to 3492889
Data columns (total 11 columns):
object_id             int64
ra                    float64
decl                  float64
gal_l                 float64
gal_b                 float64
ddf                   int64
hostgal_specz         float64
hostgal_photoz        float64
hostgal_photoz_err    float64
distmod               float64
mwebv                 float64
dtypes: float64(9), int64(2)
memory usage: 293.1 MB


In [10]:
display(train_md.dtypes.value_counts())
datatypes_pie(train_md)

float64    9
int64      3
dtype: int64

# Model Training - Baseline 

In [8]:
import lightgbm as lgb

In [9]:
round_params = dict(num_boost_round = 21000,early_stopping_rounds = 100,verbose_eval = 50)
params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 14,
        'metric': 'multi_logloss',
        'learning_rate': 0.03,
        'subsample': .9,
        'colsample_bytree': .7,
        'reg_alpha': .01,
        'reg_lambda': .01,
        'min_split_gain': 0.01,
        'min_child_weight': 10,
        'n_estimators': 1000,
        'silent': -1,
        'verbose': -1,
        'max_depth': 3
    }

In [10]:
def lgb_cv_train(X, labels, X_test, params=params, round_params=round_params):
    print('X', X.shape, 'labels', labels.shape, 'X_test', X_test.shape)
    print('unique labels', np.unique(labels))
    
    labels2y = dict(map(reversed, enumerate(np.unique(labels))))
    y2labels = dict(enumerate(np.unique(labels)))
    y = np.array(list(map(labels2y.get, labels)))
    weight = np.array(list(map(labels2weight.get, labels)))
    
    params['num_class'] = len(np.unique(y))
    cv_raw = lgb.cv(params, lgb.Dataset(X, label=y, weight=weight), nfold=10, **round_params)
    best_round = np.argmin(cv_raw['multi_logloss-mean'])
    best_score = cv_raw['multi_logloss-mean'][best_round]
    print(f'best_round: {best_round}', f'best_score: {best_score}')
    model = lgb.train(
        params, 
        lgb.Dataset(X, label=y, weight=weight), 
        num_boost_round=best_round, 
    )
    pred = model.predict(X_test)
    pred_labels = pd.DataFrame(
        {f'class_{c}': pred[:, i] for i,c in enumerate(np.unique(labels))}
    )
    res = dict(
        model=model,
        best_round=best_round,
        best_score=best_score,
        pred_labels=pred_labels
    )
    return res

In [11]:
feat_extra_li = ['hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', 'distmod']
feat_gal_cols = ['ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'mwebv']
feat_extra_cols = feat_gal_cols + feat_extra_li
print(feat_gal_cols)
print(feat_extra_cols)

['ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'mwebv']
['ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'mwebv', 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', 'distmod']


In [12]:
target = train_md['target'].values.copy()
del train_md['target']

In [13]:
train_ids = train_md['object_id'].copy()
test_ids = test_md['object_id'].copy()
del train_md['object_id'], test_md['object_id'];

In [15]:
train_mask = train_md['distmod'].isnull().values
test_mask = test_md['distmod'].isnull().values

In [16]:
labels2weight = {x:1 for x in np.unique(target)}
labels2weight[64] = 2
labels2weight[15] = 2

In [17]:
print("Unique Target from Train: ",np.unique(target[train_mask]))
print("Unique Target From Target:", np.unique(target[~train_mask]))

Unique Target from Train:  [ 6 16 53 65 92]
Unique Target From Target: [15 42 52 62 64 67 88 90 95]


In [18]:
%%time
res_gal = lgb_cv_train(train_md.loc[train_mask, feat_gal_cols], target[train_mask], test_md.loc[test_mask, feat_gal_cols])

X (2325, 6) labels (2325,) X_test (390510, 6)
unique labels [ 6 16 53 65 92]



Found `n_estimators` in params. Will use it instead of argument


silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.



[50]	cv_agg's multi_logloss: 1.22702 + 0.00655265
[100]	cv_agg's multi_logloss: 1.15984 + 0.010371
[150]	cv_agg's multi_logloss: 1.14673 + 0.0134734
[200]	cv_agg's multi_logloss: 1.14612 + 0.0155465
[250]	cv_agg's multi_logloss: 1.14862 + 0.0173029
best_round: 184 best_score: 1.1456555484785917



Found `n_estimators` in params. Will use it instead of argument


silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.



Wall time: 1min 25s


In [19]:
res_gal['pred_labels'].head()

Unnamed: 0,class_6,class_16,class_53,class_65,class_92
0,0.050653,0.358272,0.004743,0.468139,0.118193
1,0.016306,0.222888,0.002666,0.497463,0.260677
2,0.003009,0.358363,0.002817,0.580541,0.055271
3,0.020338,0.278121,0.004092,0.477093,0.220356
4,0.004868,0.356629,0.005579,0.537041,0.095883


In [20]:
n_gal = res_gal['pred_labels'].shape[1]
res_gal['pred_labels'] = res_gal['pred_labels'] * n_gal/(n_gal+1)
res_gal['pred_labels']['class_99'] = 1/(n_gal+1)
res_gal['pred_labels'].head()

Unnamed: 0,class_6,class_16,class_53,class_65,class_92,class_99
0,0.042211,0.29856,0.003952,0.390116,0.098494,0.166667
1,0.013589,0.18574,0.002221,0.414553,0.217231,0.166667
2,0.002508,0.298635,0.002347,0.483784,0.046059,0.166667
3,0.016948,0.231768,0.00341,0.397577,0.18363,0.166667
4,0.004057,0.29719,0.004649,0.447534,0.079903,0.166667


In [21]:
%%time
res_extra = lgb_cv_train(train_md.loc[~train_mask, feat_extra_cols],target[~train_mask],test_md.loc[~test_mask, feat_extra_cols])

X (5523, 10) labels (5523,) X_test (3102380, 10)
unique labels [15 42 52 62 64 67 88 90 95]



Found `n_estimators` in params. Will use it instead of argument


silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.



[50]	cv_agg's multi_logloss: 1.60892 + 0.0169808
[100]	cv_agg's multi_logloss: 1.49112 + 0.0215679
[150]	cv_agg's multi_logloss: 1.45761 + 0.0232759
[200]	cv_agg's multi_logloss: 1.44775 + 0.0246379
[250]	cv_agg's multi_logloss: 1.4465 + 0.0250768
[300]	cv_agg's multi_logloss: 1.44812 + 0.0258355
best_round: 238 best_score: 1.446296023690325



Found `n_estimators` in params. Will use it instead of argument


silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.



Wall time: 43min 53s


In [22]:
res_extra['pred_labels'].head()

Unnamed: 0,class_15,class_42,class_52,class_62,class_64,class_67,class_88,class_90,class_95
0,0.003672,0.132766,0.056299,0.081067,0.000211,0.029766,0.044981,0.650229,0.001011
1,0.009989,0.563239,0.047761,0.086829,0.065165,0.01942,0.016839,0.188272,0.002485
2,0.023829,0.65262,0.028444,0.09247,0.034142,0.039227,0.00404,0.123763,0.001465
3,0.004037,0.505938,0.044791,0.088047,0.10755,0.028351,0.014922,0.203652,0.002714
4,0.01557,0.06847,0.006403,0.020943,0.000193,0.001728,0.118497,0.763795,0.004401


In [23]:
n_extra = res_extra['pred_labels'].shape[1]
res_extra['pred_labels'] = res_extra['pred_labels'] * n_extra/(n_extra+1)
res_extra['pred_labels']['class_99'] = 1/(n_extra+1)
res_extra['pred_labels'].head()

Unnamed: 0,class_15,class_42,class_52,class_62,class_64,class_67,class_88,class_90,class_95,class_99
0,0.003304,0.119489,0.050669,0.07296,0.000189,0.026789,0.040483,0.585206,0.00091,0.1
1,0.00899,0.506915,0.042985,0.078146,0.058649,0.017478,0.015155,0.169445,0.002236,0.1
2,0.021446,0.587358,0.025599,0.083223,0.030727,0.035304,0.003636,0.111387,0.001319,0.1
3,0.003633,0.455344,0.040311,0.079242,0.096795,0.025516,0.013429,0.183286,0.002442,0.1
4,0.014013,0.061623,0.005762,0.018849,0.000174,0.001555,0.106647,0.687416,0.003961,0.1


# Submission

In [24]:
sub = pd.read_csv(os.path.join(folderPath,'sample_submission.csv.zip'),compression='zip')
sub = sub.set_index('object_id')
sub[:] = 0
sub.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
classnames = sub.columns.tolist()
print(sub.shape, classnames)

(3492890, 15) ['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62', 'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95', 'class_99']


In [26]:
for c in res_gal['pred_labels'].columns:
    sub.loc[test_mask, c] = res_gal['pred_labels'][c].values
for c in res_extra['pred_labels'].columns:
    sub.loc[~test_mask, c] = res_extra['pred_labels'][c].values

In [27]:
sub.tail(10)

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
130787887,0.0,0.199151,0.0,0.218913,0.032456,0.0,0.093222,0.228664,0.0,0.041797,0.006441,0.07843,0.0,0.000926,0.1
130787903,0.031559,0.0,0.353778,0.0,0.0,0.001357,0.0,0.0,0.412423,0.0,0.0,0.0,0.034216,0.0,0.166667
130787932,0.022478,0.0,0.267425,0.0,0.0,0.001672,0.0,0.0,0.500783,0.0,0.0,0.0,0.040975,0.0,0.166667
130787944,0.0,0.278267,0.0,0.295964,0.017301,0.0,0.042667,0.122434,0.0,0.024159,0.004888,0.111765,0.0,0.002556,0.1
130787965,0.0,0.121882,0.0,0.472411,0.015218,0.0,0.04727,0.170633,0.0,0.021169,0.0037,0.047036,0.0,0.000681,0.1
130787966,0.0,0.307432,0.0,0.220577,0.028797,0.0,0.073265,0.130297,0.0,0.041109,0.007663,0.089522,0.0,0.001338,0.1
130787971,0.0,0.211535,0.0,0.196219,0.018243,0.0,0.037605,0.278986,0.0,0.036577,0.002121,0.11749,0.0,0.001224,0.1
130787974,0.0,0.293623,0.0,0.235525,0.02421,0.0,0.0345,0.235244,0.0,0.018252,0.001054,0.056546,0.0,0.001046,0.1
130788053,0.0,0.119943,0.0,0.331416,0.05425,0.0,0.107389,0.196294,0.0,0.009841,0.002217,0.078067,0.0,0.000584,0.1
130788054,0.0,0.13419,0.0,0.444102,0.007995,0.0,0.101366,0.127254,0.0,0.030682,0.003207,0.050435,0.0,0.00077,0.1


In [28]:
%%time
score = res_gal['best_score'] * (train_mask).sum()/train_md.shape[0]
score+= res_extra['best_score'] * (~train_mask).sum()/train_md.shape[0]
sub.reset_index().to_csv(os.path.join(folderPath,f'meta_lgb_{score}.csv'), index=False, float_format='%.6f')

Wall time: 2min 29s
