# Import

In [1]:
from utils import init_logger, timer, fix_seed, reduce_mem_usage
import random
import pandas as pd
pd.set_option('display.max_columns', 150)
import numpy as np
import category_encoders as ce
import matplotlib.pyplot as plt
import datetime as dt
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Register tqdm's pandas integration so `progress_apply`-style calls render a
# notebook progress bar labelled "Processing:".
tqdm_notebook.pandas(desc="Processing:")

# Project helpers imported from `utils`.
# NOTE(review): presumably `fix_seed` seeds the RNGs with a default seed and
# `init_logger` builds the logger used by every `timer(...)` block — confirm in utils.py.
fix_seed()
logger = init_logger()

In [3]:
# Run identifier: passed to `Runner(...)` and embedded in the submission file name.
PROJECT_NAME = "v1"

# Load

In [2]:
from itertools import combinations


def _bit_encode(df, columns):
    """Pack 0/1 indicator columns into a single integer code.

    ``columns[i]`` contributes ``2**i`` to the code, so every distinct
    combination of active indicators maps to a distinct integer.
    """
    code = 0
    for bit, col in enumerate(columns):
        code = code + df[col] * 2**bit
    return code


def _add_features(df):
    """Append the engineered columns to ``df`` and drop the unused soil types.

    ``df`` is modified in place while the columns are added; the returned
    frame is a new object without ``Soil_Type7`` / ``Soil_Type15``.
    """
    # Squared Euclidean distance (no square root is taken).
    df["Distance_To_Hydrology"] = (
        df["Horizontal_Distance_To_Hydrology"] ** 2
        + df["Vertical_Distance_To_Hydrology"] ** 2
    )
    df["diff_roadways_and_hydrology"] = (
        df["Horizontal_Distance_To_Roadways"] - df["Horizontal_Distance_To_Hydrology"]
    )

    hillshade_columns = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    df["sum_Hillshade"] = df[hillshade_columns].sum(axis=1)
    for col0, col1 in combinations(hillshade_columns, 2):
        df[f"diff_{col0}_{col1}"] = df[col0] - df[col1]

    wilderness_columns = [f"Wilderness_Area{i}" for i in range(1, 5)]
    df["sum_Wilderness"] = df[wilderness_columns].sum(axis=1)
    df["class_Wilderness"] = _bit_encode(df, wilderness_columns)

    soil_columns = [f"Soil_Type{i}" for i in range(1, 41)]
    df["sum_Soil"] = df[soil_columns].sum(axis=1)

    # Hand-picked subset of soil indicators that gets its own count and code.
    important_soil_columns = [
        f"Soil_Type{i}"
        for i in (11, 13, 22, 23, 24, 31, 32, 33, 35, 36, 37, 38, 39, 40)
    ]
    df["important_sum_Soil"] = df[important_soil_columns].sum(axis=1)
    df["class_Soil_Type"] = _bit_encode(df, important_soil_columns)

    # The sums above still include these two columns; only the raw indicators
    # are removed from the feature set.
    return df.drop(columns=["Soil_Type7", "Soil_Type15"])


def load_data(input_dir="../input"):
    """Read train/test CSVs and return them with engineered features added.

    Parameters
    ----------
    input_dir : str, default "../input"
        Directory containing ``train.csv`` and ``test.csv``.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Row-slices of one concatenated frame, so both share the same columns.
        Any column present in only one file (e.g. a target column missing
        from the test file) is NaN-filled in the other part by ``pd.concat``.
        Each part keeps the row index it had in its own CSV.
    """
    df_train = pd.read_csv(f"{input_dir}/train.csv")
    df_test = pd.read_csv(f"{input_dir}/test.csv")
    n_train = len(df_train)

    # Features are computed once on the stacked frame so train and test are
    # guaranteed to be transformed identically.
    df = _add_features(pd.concat([df_train, df_test]))

    return df.iloc[:n_train], df.iloc[n_train:]

In [5]:
# Load both CSVs and build the engineered features; `timer` logs the
# "[read csv] start" / "done in … seconds" lines through `logger`.
with timer("read csv", logger):
    df_train, df_test = load_data()

2022/01/01 19:56:15 45 [INFO] [read csv] start.
2022/01/01 19:56:25 47 [INFO] [read csv] done in 9.841 seconds.


In [7]:
# Feature matrices: everything except the row identifier and the target.
X_train = df_train.drop(columns=["Id", "Cover_Type"])
X_test = df_test.drop(columns=["Id", "Cover_Type"])

# Target shifted down by one so the smallest label becomes 0
# (the submission cell adds the 1 back after argmax).
y_train = df_train["Cover_Type"].sub(1)

In [23]:
# Shrink both feature frames with the project helper from `utils`
# (the logged output reports an 82.6% reduction for each).
# NOTE(review): presumably done by downcasting column dtypes — confirm in utils.py.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Mem. usage decreased to 339.51 Mb (82.6% reduction)
Mem. usage decreased to 84.88 Mb (82.6% reduction)


# Training

In [24]:
from model_lgb import ModelLGB
from runner import Runner
from hyperopt import hp, fmin, tpe, space_eval

In [25]:
# Runner instance used by the hyperparameter search below.
# NOTE(review): cv=False presumably means a single train/validation split
# instead of cross-validation — confirm in runner.py.
runner = Runner(PROJECT_NAME, ModelLGB, cv=False)

In [26]:
# Hyperopt search space handed to `fmin`. Plain values are fixed; `hp.*`
# expressions are sampled per trial.
# NOTE(review): LightGBM only applies `bagging_fraction` when `bagging_freq` > 0,
# and no `bagging_freq` is set here — confirm whether ModelLGB supplies it,
# otherwise this dimension of the search has no effect.
space = dict(
    objective="multiclass",
    num_classes=7,
    metric=["multi_logloss", "multi_error"],
    # Integer offsets: randint(label, 40) draws 0..39, giving 30..69 and 10..49.
    max_leaves=30 + hp.randint("_max_leaves", 40),
    min_data_in_leaf=10 + hp.randint("_min_data_in_leaf", 40),
    # Log-uniform over [0.01, 1].
    lambda_l1=hp.loguniform("lambda_l1", np.log(0.01), np.log(1)),
    lambda_l2=hp.loguniform("lambda_l2", np.log(0.01), np.log(1)),
    bagging_fraction=hp.uniform("bagging_fraction", 0.7, 0.9),
    feature_fraction=hp.uniform("feature_fraction", 0.7, 0.9),
    learning_rate=0.01,
    seed=0,
    verbose=-1,
)

# Fixed parameter set used for the final training run in the Inference section.
# NOTE(review): feature_fraction (0.943…) lies outside the 0.7–0.9 range of
# `space` above, so these values did not come from the space as written — confirm origin.
params = dict(
    bagging_fraction=0.8691099275893169,
    feature_fraction=0.9433525231687435,
    lambda_l1=0.057908447715479115,
    lambda_l2=0.01474021398017653,
    learning_rate=0.01,
    max_leaves=48,
    metric=("multi_logloss", "multi_error"),
    min_data_in_leaf=26,
    objective="multiclass",
    num_classes=7,
    seed=0,
    verbose=-1,
)

# Training-loop options passed alongside the model parameters to `runner.train`.
train_params = dict(
    num_boost_round=2000,
    early_stopping_rounds=20,
    verbose_eval=50,
)

In [10]:
def objective(args):
    """Hyperopt objective: train once with the sampled parameters and return the score.

    Parameters
    ----------
    args : dict
        One parameter set sampled from `space`; it is logged and forwarded
        unchanged to `runner.train` together with the global `train_params`.

    Returns
    -------
    Whatever `runner.get_score()` reports for the run just trained.
    NOTE(review): `fmin` minimises this value, so the score is presumably
    lower-is-better — confirm in runner.py.
    """
    logger.info(args)
    # The notebook never defines `X` / `y`; the training data in scope is
    # `X_train` / `y_train`, so use those to survive Restart & Run All.
    runner.train(X_train, y_train, args, train_params)
    return runner.get_score()

In [11]:
# Run 20 TPE trials over `space`, then map the raw result back to a concrete
# parameter dict and log it.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
)
best_params = space_eval(space, best)
logger.info(f"best params: {best_params}")

  0%|                                    | 0/20 [00:00<?, ?trial/s, best loss=?]

2021/12/31 14:33:48 2 [INFO] {'bagging_fraction': 0.8041882760340158, 'feature_fraction': 0.758404881916913, 'lambda_l1': 0.8732061538314787, 'lambda_l2': 0.02853921475938039, 'learning_rate': 0.01, 'max_leaves': 33, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 41, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.45077	training's multi_error: 0.0803313	valid_1's multi_logloss: 0.45129	valid_1's multi_error: 0.0805225
[100]	training's multi_logloss: 0.291398	training's multi_error: 0.0571466	valid_1's multi_logloss: 0.292148	valid_1's multi_error: 0.0572413
[150]	training's multi_logloss: 0.209781	training's multi_error: 0.0523722	valid_1's multi_logloss: 0.210703	valid_1's multi_error: 0.052405
[200]	training's multi_logloss: 0.166474	training's multi_error: 0.0488456	valid_1's multi_logloss: 0.167547	valid_1's multi_error: 0.0490913
[250]	training's multi_logloss: 0.141748	training's multi_error: 0.0466009	valid_1's multi_logloss: 0.142972	valid_1's multi_error: 0.0470013
[300]	training's multi_logloss: 0.12632	training's multi_error: 0.04482	valid_1's multi_logloss: 0.127683	valid_1's multi_error: 0.0453975
[350]	training's multi_logloss: 0.115918	training's multi_error: 0.0433262

2021/12/31 14:46:45 2 [INFO] {'bagging_fraction': 0.8309292802869729, 'feature_fraction': 0.8493136822063353, 'lambda_l1': 0.012037178705749304, 'lambda_l2': 0.018650660926638262, 'learning_rate': 0.01, 'max_leaves': 46, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 19, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.419123	training's multi_error: 0.0672559	valid_1's multi_logloss: 0.420059	valid_1's multi_error: 0.0676675
[100]	training's multi_logloss: 0.25902	training's multi_error: 0.0535016	valid_1's multi_logloss: 0.260289	valid_1's multi_error: 0.0536575
[150]	training's multi_logloss: 0.184407	training's multi_error: 0.0485353	valid_1's multi_logloss: 0.185946	valid_1's multi_error: 0.0489625
[200]	training's multi_logloss: 0.146783	training's multi_error: 0.0458656	valid_1's multi_logloss: 0.148511	valid_1's multi_error: 0.0464287
[250]	training's multi_logloss: 0.124943	training's multi_error: 0.0439166	valid_1's multi_logloss: 0.12689	valid_1's multi_error: 0.0445912
[300]	training's multi_logloss: 0.112358	training's multi_error: 0.0423316	valid_1's multi_logloss: 0.11449	valid_1's multi_error: 0.0431487
[350]	training's multi_logloss: 0.104379	training's multi_error: 0.0411

2021/12/31 14:56:20 2 [INFO] {'bagging_fraction': 0.8299037171131212, 'feature_fraction': 0.803399637322971, 'lambda_l1': 0.0626719383891622, 'lambda_l2': 0.014493318498254155, 'learning_rate': 0.01, 'max_leaves': 53, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 26, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.424079	training's multi_error: 0.0668431	valid_1's multi_logloss: 0.425126	valid_1's multi_error: 0.0673437
[100]	training's multi_logloss: 0.264431	training's multi_error: 0.0523875	valid_1's multi_logloss: 0.265829	valid_1's multi_error: 0.0526713
[150]	training's multi_logloss: 0.187529	training's multi_error: 0.0476209	valid_1's multi_logloss: 0.189216	valid_1's multi_error: 0.04817
[200]	training's multi_logloss: 0.148103	training's multi_error: 0.0450788	valid_1's multi_logloss: 0.14999	valid_1's multi_error: 0.0457687
[250]	training's multi_logloss: 0.125608	training's multi_error: 0.0431475	valid_1's multi_logloss: 0.127715	valid_1's multi_error: 0.0439288
[300]	training's multi_logloss: 0.11206	training's multi_error: 0.0417019	valid_1's multi_logloss: 0.114342	valid_1's multi_error: 0.0425775
[350]	training's multi_logloss: 0.103565	training's multi_error: 0.04059

2021/12/31 15:07:17 2 [INFO] {'bagging_fraction': 0.8989200326060357, 'feature_fraction': 0.86436856562306, 'lambda_l1': 0.4897384544924264, 'lambda_l2': 0.011468634268358415, 'learning_rate': 0.01, 'max_leaves': 34, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 40, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.425145	training's multi_error: 0.0708938	valid_1's multi_logloss: 0.425904	valid_1's multi_error: 0.0712462
[100]	training's multi_logloss: 0.266259	training's multi_error: 0.0565116	valid_1's multi_logloss: 0.267291	valid_1's multi_error: 0.0566038
[150]	training's multi_logloss: 0.191915	training's multi_error: 0.0511988	valid_1's multi_logloss: 0.193168	valid_1's multi_error: 0.0513612
[200]	training's multi_logloss: 0.154016	training's multi_error: 0.0480156	valid_1's multi_logloss: 0.155449	valid_1's multi_error: 0.048395
[250]	training's multi_logloss: 0.132006	training's multi_error: 0.0458037	valid_1's multi_logloss: 0.133624	valid_1's multi_error: 0.0464338
[300]	training's multi_logloss: 0.118979	training's multi_error: 0.0440994	valid_1's multi_logloss: 0.120767	valid_1's multi_error: 0.044715
[350]	training's multi_logloss: 0.110536	training's multi_error: 0.042

2021/12/31 15:17:16 2 [INFO] {'bagging_fraction': 0.7068361964966817, 'feature_fraction': 0.7116335119363578, 'lambda_l1': 0.09513094301468064, 'lambda_l2': 0.524104595381781, 'learning_rate': 0.01, 'max_leaves': 52, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 41, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.454084	training's multi_error: 0.0788222	valid_1's multi_logloss: 0.454689	valid_1's multi_error: 0.0792237
[100]	training's multi_logloss: 0.294272	training's multi_error: 0.0541347	valid_1's multi_logloss: 0.295186	valid_1's multi_error: 0.0543525
[150]	training's multi_logloss: 0.208092	training's multi_error: 0.048985	valid_1's multi_logloss: 0.209248	valid_1's multi_error: 0.0494125
[200]	training's multi_logloss: 0.162704	training's multi_error: 0.0458238	valid_1's multi_logloss: 0.164076	valid_1's multi_error: 0.0464613
[250]	training's multi_logloss: 0.137146	training's multi_error: 0.0438575	valid_1's multi_logloss: 0.138687	valid_1's multi_error: 0.0445888
[300]	training's multi_logloss: 0.120831	training's multi_error: 0.0423044	valid_1's multi_logloss: 0.122563	valid_1's multi_error: 0.0431038
[350]	training's multi_logloss: 0.110143	training's multi_error: 0.04

2021/12/31 15:30:12 2 [INFO] {'bagging_fraction': 0.8197814508193194, 'feature_fraction': 0.7634908800588147, 'lambda_l1': 0.13532178103709736, 'lambda_l2': 0.21595416946835802, 'learning_rate': 0.01, 'max_leaves': 69, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 31, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.438482	training's multi_error: 0.0708359	valid_1's multi_logloss: 0.43934	valid_1's multi_error: 0.0713088
[100]	training's multi_logloss: 0.276366	training's multi_error: 0.0508994	valid_1's multi_logloss: 0.277597	valid_1's multi_error: 0.0512475
[150]	training's multi_logloss: 0.193837	training's multi_error: 0.046145	valid_1's multi_logloss: 0.195396	valid_1's multi_error: 0.0468638
[200]	training's multi_logloss: 0.150607	training's multi_error: 0.0436288	valid_1's multi_logloss: 0.152423	valid_1's multi_error: 0.0443363
[250]	training's multi_logloss: 0.126558	training's multi_error: 0.0418891	valid_1's multi_logloss: 0.128602	valid_1's multi_error: 0.0427825
[300]	training's multi_logloss: 0.111869	training's multi_error: 0.0407081	valid_1's multi_logloss: 0.114136	valid_1's multi_error: 0.0416238
[350]	training's multi_logloss: 0.102321	training's multi_error: 0.039

2021/12/31 15:42:37 2 [INFO] {'bagging_fraction': 0.8752568813438453, 'feature_fraction': 0.7759201601499417, 'lambda_l1': 0.06838213507022683, 'lambda_l2': 0.19405602740685965, 'learning_rate': 0.01, 'max_leaves': 59, 'metric': ('multi_logloss', 'multi_error'), 'min_data_in_leaf': 47, 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}





Training until validation scores don't improve for 20 rounds                    
[50]	training's multi_logloss: 0.433631	training's multi_error: 0.0714975	valid_1's multi_logloss: 0.434376	valid_1's multi_error: 0.07185
[100]	training's multi_logloss: 0.272359	training's multi_error: 0.0519913	valid_1's multi_logloss: 0.273444	valid_1's multi_error: 0.0523363
[150]	training's multi_logloss: 0.192102	training's multi_error: 0.0471216	valid_1's multi_logloss: 0.193483	valid_1's multi_error: 0.0477038
[200]	training's multi_logloss: 0.15066	training's multi_error: 0.0444141	valid_1's multi_logloss: 0.152279	valid_1's multi_error: 0.0451587
[250]	training's multi_logloss: 0.127133	training's multi_error: 0.0425897	valid_1's multi_logloss: 0.128971	valid_1's multi_error: 0.0434037
[300]	training's multi_logloss: 0.112867	training's multi_error: 0.0413022	valid_1's multi_logloss: 0.114904	valid_1's multi_error: 0.0422125
[350]	training's multi_logloss: 0.103675	training's multi_error: 0.0402

KeyboardInterrupt: 

# Inference

In [27]:
# Fresh Runner for the final model, trained on the full training features with
# the fixed `params` dict (not the search result) and the shared `train_params`.
runner = Runner(PROJECT_NAME, ModelLGB, cv=False)
runner.train(X_train, y_train, params, train_params)



Training until validation scores don't improve for 20 rounds
[50]	training's multi_logloss: 0.405179	training's multi_error: 0.0626447	valid_1's multi_logloss: 0.406004	valid_1's multi_error: 0.062955
[100]	training's multi_logloss: 0.247238	training's multi_error: 0.0526416	valid_1's multi_logloss: 0.248429	valid_1's multi_error: 0.052965
[150]	training's multi_logloss: 0.174903	training's multi_error: 0.0480084	valid_1's multi_logloss: 0.176429	valid_1's multi_error: 0.04846
[200]	training's multi_logloss: 0.139593	training's multi_error: 0.0454534	valid_1's multi_logloss: 0.141366	valid_1's multi_error: 0.0460663
[250]	training's multi_logloss: 0.120175	training's multi_error: 0.0434544	valid_1's multi_logloss: 0.122174	valid_1's multi_error: 0.04424
[300]	training's multi_logloss: 0.109027	training's multi_error: 0.0419037	valid_1's multi_logloss: 0.111231	valid_1's multi_error: 0.04279


KeyboardInterrupt: 

In [18]:
# Display the per-feature importance values stored on the trained model.
runner.model.feature_importance_

Elevation                             2.014181e+08
Horizontal_Distance_To_Fire_Points    1.101882e+07
class_Wilderness                      1.071024e+07
Horizontal_Distance_To_Roadways       8.949070e+06
diff_roadways_and_hydrology           8.404024e+06
                                          ...     
Soil_Type8                            2.526067e+03
Soil_Type24                           1.623509e+03
Soil_Type31                           9.902597e+02
Soil_Type36                           6.210947e+02
Soil_Type37                           5.131047e+02
Length: 63, dtype: float64

In [19]:
# Predict on the test features, logging the elapsed time via `timer`.
# `prob` is displayed as a 2-D array of per-row class probabilities.
with timer("prediction", logger):
    prob = runner.predict(X_test)
prob

2022/01/01 20:16:30 45 [INFO] [prediction] start.
2022/01/01 20:17:59 47 [INFO] [prediction] done in 89.642 seconds.


array([[1.73290468e-05, 9.99924988e-01, 5.57017763e-05, ...,
        3.20607796e-09, 1.49525885e-06, 4.78405098e-07],
       [1.24256746e-02, 9.87516863e-01, 3.79437246e-05, ...,
        2.93377036e-08, 1.50154787e-05, 4.41724903e-06],
       [1.72194133e-03, 9.98270213e-01, 6.53144235e-06, ...,
        6.06677518e-09, 2.60305567e-07, 1.04040315e-06],
       ...,
       [7.08213448e-05, 9.99846440e-01, 7.69965966e-05, ...,
        6.34173483e-09, 4.35646054e-06, 1.37344717e-06],
       [9.99407545e-01, 3.63570410e-04, 2.45181337e-06, ...,
        9.53347746e-09, 4.69723113e-07, 2.25940174e-04],
       [1.13394273e-04, 1.60790018e-01, 8.38888754e-01, ...,
        2.99428124e-08, 2.02644105e-04, 5.12790331e-06]])

In [20]:
# Submission table: test ids plus the most probable class per row, shifted
# back up by one to undo the `- 1` applied to the training target.
res = pd.DataFrame(
    {
        "Id": df_test["Id"],
        "Cover_Type": prob.argmax(axis=1) + 1,
    }
)
res.to_csv(f"../submission/submission_{PROJECT_NAME}.csv", index=False)
res

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
...,...,...
999995,4999995,2
999996,4999996,1
999997,4999997,2
999998,4999998,1


In [21]:
# Upload the submission file for this PROJECT_NAME through the Kaggle CLI.
# The -m message is typed by hand; update it to the validation score of the
# run actually being submitted.
!kaggle competitions submit tabular-playground-series-dec-2021 -f ../submission/submission_{PROJECT_NAME}.csv -m "valid_1's multi_error: 0.0439975"

100%|██████████████████████████████████████| 9.54M/9.54M [00:08<00:00, 1.24MB/s]
Successfully submitted to Tabular Playground Series - Dec 2021