# Kaggle

In [20]:
import os
import json

with open("/Users/g-ogaki/workspace/kaggle.json") as f:
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

In [5]:
!kaggle competitions download -c tabular-playground-series-dec-2021

Downloading tabular-playground-series-dec-2021.zip to /Users/g-ogaki/workspace/tabular-playground-series-dec-2021
100%|████████████████████████████████████████| 126M/126M [00:57<00:00, 2.36MB/s]
100%|████████████████████████████████████████| 126M/126M [00:57<00:00, 2.31MB/s]


In [6]:
!unzip tabular-playground-series-dec-2021.zip

Archive:  tabular-playground-series-dec-2021.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Import

In [14]:
from utils import init_logger, timer, fix_seed
import random
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import numpy as np
import category_encoders as ce
import matplotlib.pyplot as plt
import datetime as dt
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas(desc="Processing:")

fix_seed()
logger = init_logger()

In [2]:
PROJECT_NAME = "v1"

# Load

In [3]:
from etl import load_train_data, load_test_data

In [5]:
with timer("read csv", logger):
    df = load_train_data()
    df_test = load_test_data()

2021/12/30 15:56:21 45 [INFO] [read csv] start.
2021/12/30 15:56:29 47 [INFO] [read csv] done in 8.007 seconds.


In [6]:
X = df.drop(["Id", "Cover_Type"], axis=1)
y = df["Cover_Type"] - 1
X_test = df_test.drop(["Id"], axis=1)

# Training

In [7]:
from model_lgb import ModelLGB
from runner import Runner
from sklearn.model_selection import ParameterGrid

In [8]:
runner = Runner(PROJECT_NAME, ModelLGB, cv=False)

In [24]:
all_params = {
    "objective": ["multiclass"],
    "num_classes": [7],
    "metric": [["multi_logloss", "multi_error"]],
    "max_leaves": [32],
    # "min_data_in_leaf": [20],
    "lambda_l1": [0.01],
    "lambda_l2": [0.01],
    "bagging_fraction": [0.8],
    "feature_fraction": [0.8],
    "learning_rate": [0.01],
    "seed": [0],
    # "device": ["gpu"],
    "verbose": [-1],
}

train_params = {
    "num_boost_round": 2000,
    "early_stopping_rounds": 20,
    "verbose_eval": 10,
}

In [25]:
min_logloss = np.inf
best_params = None

for params in tqdm(list(ParameterGrid(all_params))):
    logger.info('params: {}'.format(params))
    with timer("train", logger):
        runner.train(X, y, params, train_params)
    
    if min_logloss > runner.get_score():
        min_logloss = runner.get_score()
        best_params = params

logger.info("best params: {}".format(best_params))
logger.info("min logloss: {}".format(min_logloss))

  0%|                                                     | 0/1 [00:00<?, ?it/s]2021/12/30 16:29:08 5 [INFO] params: {'bagging_fraction': 0.8, 'device': 'gpu', 'feature_fraction': 0.8, 'lambda_l1': 0.01, 'lambda_l2': 0.01, 'learning_rate': 0.01, 'max_leaves': 32, 'metric': ['multi_logloss', 'multi_error'], 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}
2021/12/30 16:29:08 5 [INFO] params: {'bagging_fraction': 0.8, 'device': 'gpu', 'feature_fraction': 0.8, 'lambda_l1': 0.01, 'lambda_l2': 0.01, 'learning_rate': 0.01, 'max_leaves': 32, 'metric': ['multi_logloss', 'multi_error'], 'num_classes': 7, 'objective': 'multiclass', 'seed': 0, 'verbose': -1}
2021/12/30 16:29:08 45 [INFO] [train] start.
2021/12/30 16:29:08 45 [INFO] [train] start.
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
  0%|                                                     | 0/1 [00:01<?, ?it/s]


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

# Inference

In [50]:
runner.train(X, y, best_params, train_params)



Training until validation scores don't improve for 20 rounds
[10]	training's multi_logloss: 0.762925	training's multi_error: 0.434439	valid_1's multi_logloss: 0.76362	valid_1's multi_error: 0.43446
[20]	training's multi_logloss: 0.654778	training's multi_error: 0.123933	valid_1's multi_logloss: 0.655675	valid_1's multi_error: 0.12414
[30]	training's multi_logloss: 0.570386	training's multi_error: 0.109025	valid_1's multi_logloss: 0.571545	valid_1's multi_error: 0.109184
Did not meet early stopping. Best iteration is:
[30]	training's multi_logloss: 0.570386	training's multi_error: 0.109025	valid_1's multi_logloss: 0.571545	valid_1's multi_error: 0.109184


In [15]:
runner.model.feature_importance_

Elevation                             1.976171e+08
Horizontal_Distance_To_Fire_Points    1.061060e+07
Horizontal_Distance_To_Roadways       7.925491e+06
diff_roadways_and_hydrology           7.347516e+06
important_sum_Soil                    7.153431e+06
Wilderness_Area3                      6.636899e+06
sum_Soil                              6.409293e+06
Vertical_Distance_To_Hydrology        5.275033e+06
Wilderness_Area1                      4.787364e+06
Wilderness_Area4                      3.187770e+06
Distance_To_Hydrology                 1.921317e+06
Soil_Type39                           1.347918e+06
Soil_Type2                            1.276935e+06
Soil_Type10                           1.017365e+06
Soil_Type38                           7.452598e+05
Soil_Type4                            6.945628e+05
Soil_Type22                           6.789970e+05
Soil_Type40                           6.516944e+05
Horizontal_Distance_To_Hydrology      4.445898e+05
sum_Wilderness                 

In [17]:
with timer("prediction", logger):
    prob = runner.predict(X_test)
prob

2021/12/30 16:10:56 45 [INFO] [prediction] start.
2021/12/30 16:10:56 45 [INFO] [prediction] start.
2021/12/30 16:12:03 47 [INFO] [prediction] done in 66.348 seconds.
2021/12/30 16:12:03 47 [INFO] [prediction] done in 66.348 seconds.


array([[3.93715919e-05, 9.99796136e-01, 1.62012184e-04, ...,
        6.90062376e-10, 1.89302521e-06, 5.83846665e-07],
       [2.15227615e-02, 9.78387624e-01, 6.81942141e-05, ...,
        5.43440402e-09, 1.62646146e-05, 5.12172515e-06],
       [3.50797441e-03, 9.96476102e-01, 1.40640854e-05, ...,
        1.47013979e-09, 2.94706350e-07, 1.54976759e-06],
       ...,
       [1.51037128e-04, 9.99677108e-01, 1.67029192e-04, ...,
        1.35512144e-09, 2.83695885e-06, 1.97972634e-06],
       [9.98934385e-01, 6.40597365e-04, 2.37210027e-06, ...,
        1.90524047e-09, 3.90790952e-07, 4.22244574e-04],
       [1.90964695e-04, 1.73646438e-01, 8.25992067e-01, ...,
        5.09042531e-09, 1.65403162e-04, 5.10175026e-06]])

In [18]:
res = pd.DataFrame()
res["Id"] = df_test["Id"]
res["Cover_Type"] = prob.argmax(axis=1) + 1
res.to_csv(f"../submission/submission_{PROJECT_NAME}.csv", index=False)
res

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
...,...,...
999995,4999995,2
999996,4999996,1
999997,4999997,2
999998,4999998,1


In [21]:
!kaggle competitions submit tabular-playground-series-dec-2021 -f ../submission/submission_{PROJECT_NAME}.csv -m "valid_1's multi_error: 0.0390737"

100%|███████████████████████████████████████| 9.54M/9.54M [00:14<00:00, 713kB/s]
Successfully submitted to Tabular Playground Series - Dec 2021