# import

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)
import numpy as np
from utils import *
from itertools import combinations

In [2]:
PROJECT_NAME = "v3"

# load

In [3]:
fix_seed(42)
logger = init_logger()

In [4]:
with timer("load", logger):
    df_train = pd.read_csv("../input/train.csv")
    df_test = pd.read_csv("../input/test.csv")

df = pd.concat([df_train, df_test])

2022/01/02 14:13:06 46 [INFO] [load] start.
2022/01/02 14:13:11 48 [INFO] [load] done in 5.313 seconds.


# processing

In [5]:
drop_columns = [
    "Soil_Type7",
    "Soil_Type15"
]

df.drop(drop_columns, axis=1, inplace=True)

In [6]:
# aspect should be in [0, 360), meanwhile it is in (-360, 720)
df.loc[df["Aspect"] < 0, "Aspect"] += 360
df.loc[df["Aspect"] >= 360, "Aspect"] -= 360

In [7]:
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,4873,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0
1,1,3026,182,5,280,29,3270,233,240,106,5423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0
2,2,3106,13,7,351,37,2914,208,234,137,5269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0
3,3,3022,276,13,192,16,3034,207,238,156,2866,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0
4,4,2906,186,13,266,22,2916,231,231,154,2642,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0


In [8]:
df["L1_Distance_To_Hydrology"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
df["L2_Distance_To_Hydrology"] = (df["Horizontal_Distance_To_Hydrology"]**2 + df["Vertical_Distance_To_Hydrology"]**2) ** .5
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,L1_Distance_To_Hydrology,L2_Distance_To_Hydrology
0,0,3189,40,8,30,13,3270,206,234,193,4873,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,43,32.695565
1,1,3026,182,5,280,29,3270,233,240,106,5423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,309,281.49778
2,2,3106,13,7,351,37,2914,208,234,137,5269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,388,352.944755
3,3,3022,276,13,192,16,3034,207,238,156,2866,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,208,192.665513
4,4,2906,186,13,266,22,2916,231,231,154,2642,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,288,266.908224


In [9]:
wilderness_columns = [column for column in df if column.startswith("Wilderness")]
df["Count_Wilderness"] = df[wilderness_columns].sum(axis=1)

In [10]:
soil_columns = [column for column in df if column.startswith("Soil_Type")]
df["Count_Soil"] = df[soil_columns].sum(axis=1)

In [11]:
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,L1_Distance_To_Hydrology,L2_Distance_To_Hydrology,Count_Wilderness,Count_Soil
0,0,3189,40,8,30,13,3270,206,234,193,4873,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,43,32.695565,1,1
1,1,3026,182,5,280,29,3270,233,240,106,5423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,309,281.49778,1,1
2,2,3106,13,7,351,37,2914,208,234,137,5269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,388,352.944755,1,1
3,3,3022,276,13,192,16,3034,207,238,156,2866,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,208,192.665513,1,1
4,4,2906,186,13,266,22,2916,231,231,154,2642,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,288,266.908224,1,1


In [12]:
# hillshade shoud be in [0, 255]

hillshade_columns = [column for column in df if column.startswith("Hillshade")]

for column in hillshade_columns:
    df[column] = df[column].clip(0, 255)

In [13]:
df["Sum_Hillshade"] = df[hillshade_columns].sum(axis=1)

In [14]:
for col0, col1 in combinations(hillshade_columns, 2):
    df[f"Diff_{col0}_{col1}"] = df[col0] - df[col1]

In [15]:
horizontal_columns = [*filter(lambda x: x.startswith("Horizontal"), df)]

for col0, col1 in combinations(horizontal_columns, 2):
    df[f"Diff_{col0}_{col1}"] = df[col0] - df[col1]

In [16]:
df_info(df)

Unnamed: 0,unique,missing,dtype,count,mean,std,min,25%,50%,75%,max
Id,5000000,0,int64,5000000.0,2500000.0,1443376.0,0.0,1250000.0,2500000.0,3749999.0,4999999.0
Elevation,2548,0,int64,5000000.0,2978.074,300.5701,1773.0,2754.0,2967.0,3222.0,4383.0
Aspect,360,0,int64,5000000.0,149.9568,108.8817,0.0,59.0,121.0,245.0,359.0
Slope,68,0,int64,5000000.0,15.06272,8.529145,-3.0,9.0,14.0,20.0,64.0
Horizontal_Distance_To_Hydrology,1645,0,int64,5000000.0,269.926,226.8518,-136.0,109.0,211.0,360.0,1602.0
Vertical_Distance_To_Hydrology,922,0,int64,5000000.0,51.62617,68.27695,-329.0,4.0,31.0,78.0,647.0
Horizontal_Distance_To_Roadways,7786,0,int64,5000000.0,1756.015,1327.854,-287.0,807.0,1411.0,2347.0,7666.0
Hillshade_9am,256,0,int64,5000000.0,211.7821,30.83571,0.0,197.0,218.0,234.0,255.0
Hillshade_Noon,204,0,int64,5000000.0,221.1222,22.14103,49.0,210.0,224.0,237.0,255.0
Hillshade_3pm,256,0,int64,5000000.0,140.6678,43.85597,0.0,115.0,142.0,169.0,255.0


In [17]:
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,L1_Distance_To_Hydrology,L2_Distance_To_Hydrology,Count_Wilderness,Count_Soil,Sum_Hillshade,Diff_Hillshade_9am_Hillshade_Noon,Diff_Hillshade_9am_Hillshade_3pm,Diff_Hillshade_Noon_Hillshade_3pm,Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Roadways,Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Fire_Points,Diff_Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points
0,0,3189,40,8,30,13,3270,206,234,193,4873,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,43,32.695565,1,1,633,-28,13,41,-3240,-4843,-1603
1,1,3026,182,5,280,29,3270,233,240,106,5423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,309,281.49778,1,1,579,-7,127,134,-2990,-5143,-2153
2,2,3106,13,7,351,37,2914,208,234,137,5269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,388,352.944755,1,1,579,-26,71,97,-2563,-4918,-2355
3,3,3022,276,13,192,16,3034,207,238,156,2866,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,208,192.665513,1,1,601,-31,51,82,-2842,-2674,168
4,4,2906,186,13,266,22,2916,231,231,154,2642,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,288,266.908224,1,1,616,0,77,77,-2650,-2376,274


In [18]:
scaling_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    "L1_Distance_To_Hydrology",
    "L2_Distance_To_Hydrology",
    "Count_Wilderness",
    "Count_Soil",
    "Sum_Hillshade",
    "Diff_Hillshade_9am_Hillshade_Noon",
    "Diff_Hillshade_9am_Hillshade_3pm",
    "Diff_Hillshade_Noon_Hillshade_3pm",
    "Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Roadways",
    "Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Fire_Points",
    "Diff_Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points",
]

In [19]:
from sklearn.preprocessing import RobustScaler

In [20]:
scaler = RobustScaler()
df[scaling_columns] = scaler.fit_transform(df[scaling_columns])

In [21]:
df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,L1_Distance_To_Hydrology,L2_Distance_To_Hydrology,Count_Wilderness,Count_Soil,Sum_Hillshade,Diff_Hillshade_9am_Hillshade_Noon,Diff_Hillshade_9am_Hillshade_3pm,Diff_Hillshade_Noon_Hillshade_3pm,Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Roadways,Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Fire_Points,Diff_Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points
0,0,0.474359,-0.435484,-0.545455,-0.721116,-0.243243,1.207143,-0.324324,0.37037,0.944444,2.675076,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,-0.850746,-0.828112,0.0,0.0,0.756757,-0.466667,-0.869565,-0.645161,-1.326544,-2.810994,-0.887226
1,1,0.126068,0.327957,-0.818182,0.2749,-0.027027,1.207143,0.405405,0.592593,-0.666667,3.095566,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,0.141791,0.205619,0.0,0.0,0.027027,0.0,0.782609,0.854839,-1.167409,-3.036898,-1.181187
2,2,0.297009,-0.580645,-0.636364,0.557769,0.081081,0.975974,-0.27027,0.37037,-0.092593,2.977829,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,0.436567,0.502469,0.0,0.0,0.027027,-0.422222,-0.028986,0.258065,-0.895608,-2.86747,-1.28915
3,3,0.117521,0.833333,-0.090909,-0.075697,-0.202703,1.053896,-0.297297,0.518519,0.259259,1.140673,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,-0.235075,-0.163464,0.0,0.0,0.324324,-0.533333,-0.318841,0.016129,-1.073202,-1.177711,0.059327
4,4,-0.130342,0.349462,-0.090909,0.219124,-0.121622,0.977273,0.351351,0.259259,0.222222,0.969419,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,0.063433,0.145002,0.0,0.0,0.527027,0.155556,0.057971,-0.064516,-0.950987,-0.953313,0.115981


In [22]:
df_train, df_test = df[:len(df_train)], df[len(df_train):]

In [23]:
drop_index = df_train[df_train["Cover_Type"] == 5].index
drop_index

Int64Index([3403875], dtype='int64')

In [24]:
df_train.drop(drop_index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [25]:
X_train = df_train.drop(["Id", "Cover_Type"], axis=1)
y_train = df_train["Cover_Type"].astype(np.int8)
X_test = df_test.drop(["Id", "Cover_Type"], axis=1)

y_train.value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
Name: Cover_Type, dtype: int64

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train))

# Training

In [28]:
from model_nn import ModelNN
from runner import Runner
from keras.callbacks import EarlyStopping

In [31]:
params = {
    "units_list": [300, 200, 100, 50],
    "dropout": 0.2,
    "num_classes": 6
}

train_params = {
    "epochs": 5000,
    "batch_size": 32,
    "verbose": 1,
    "callbacks": [EarlyStopping(monitor="val_loss", min_delta=0, patience=30, verbose=1)]
}

In [32]:
runner = Runner(PROJECT_NAME, ModelNN, cv=False)
runner.train(X_train, y_train, params, train_params)

Epoch 1/5000


2022-01-02 14:14:13.969756: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


  4113/100000 [>.............................] - ETA: 32:20 - loss: 0.3381 - accuracy: 0.8752

KeyboardInterrupt: 

In [76]:
runner.model.feature_importance_

Elevation                                                                   2.011613e+08
Count_Soil                                                                  1.057809e+07
Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Roadways       8.103541e+06
Horizontal_Distance_To_Roadways                                             7.867348e+06
Wilderness_Area3                                                            6.445394e+06
Diff_Horizontal_Distance_To_Hydrology_Horizontal_Distance_To_Fire_Points    5.388088e+06
Vertical_Distance_To_Hydrology                                              5.349658e+06
Wilderness_Area1                                                            4.701955e+06
Horizontal_Distance_To_Fire_Points                                          4.046873e+06
Diff_Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points     3.272390e+06
Wilderness_Area4                                                            2.151895e+06
Soil_Type39          

# inference

In [88]:
with timer("prediction", logger):
    prob = runner.predict(X_test)

prob

2022/01/02 13:32:02 46 [INFO] [prediction] start.
2022/01/02 13:32:02 46 [INFO] [prediction] start.
2022/01/02 13:33:16 48 [INFO] [prediction] done in 73.641 seconds.
2022/01/02 13:33:16 48 [INFO] [prediction] done in 73.641 seconds.


array([[3.36619157e-05, 9.99835530e-01, 1.28587610e-04, 4.89611023e-09,
        1.50325444e-06, 7.12450095e-07],
       [1.37330565e-02, 9.86153922e-01, 7.67985847e-05, 3.85585105e-08,
        2.93994590e-05, 6.78449268e-06],
       [2.85309869e-03, 9.97131825e-01, 1.29646858e-05, 1.00109336e-08,
        3.50051164e-07, 1.75168919e-06],
       ...,
       [1.10528499e-04, 9.99771648e-01, 1.13202218e-04, 1.18115499e-08,
        2.28970965e-06, 2.31973603e-06],
       [9.99341469e-01, 4.27270142e-04, 5.15522312e-06, 2.07411706e-08,
        6.16899976e-07, 2.25468020e-04],
       [1.93399716e-04, 1.48365812e-01, 8.51314837e-01, 4.55391854e-08,
        1.20103592e-04, 5.80226378e-06]])

In [89]:
pred = prob.argmax(axis=1)
pred = le.inverse_transform(pred)
pred

array([2, 2, 2, ..., 2, 1, 3], dtype=int8)

In [90]:
res = pd.DataFrame({
    "Id": df_test["Id"],
    "Cover_Type": pred
})
res.to_csv(f"../submission/submission_{PROJECT_NAME}.csv", index=False)
res

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
...,...,...
999995,4999995,2
999996,4999996,1
999997,4999997,2
999998,4999998,1


In [92]:
!kaggle competitions submit tabular-playground-series-dec-2021 -f ../submission/submission_{PROJECT_NAME}.csv -m "valid_1's multi_error: 0.03877"

100%|███████████████████████████████████████| 9.54M/9.54M [00:15<00:00, 660kB/s]
Successfully submitted to Tabular Playground Series - Dec 2021