In [1]:
import os
import random

import catboost
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Seed used for the Python/NumPy RNGs (see reseed) and as CatBoost's random_state.
SEED = 42
# Number of most recent months (by the 'ym' month index) held out for validation.
VAL_MONTHS = 6

# Number of boosting iterations passed to CatBoostClassifier.
ITERATIONS = 1000

# Directory the training CSV is read from.
DATA_PATH = '../data'
# Directory the trained model file ('catboost.cbm') is written to.
MODELS_PATH = './'

In [2]:
def reseed(seed=SEED):
    """Seed NumPy's and Python's global random number generators.

    Args:
        seed: value handed to both ``np.random.seed`` and ``random.seed``.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

def evaluate(y_true, y_pred):
    """Compute micro-averaged and per-class one-vs-rest ROC AUC scores.

    Args:
        y_true: 1-D array-like of integer class labels numbered from 1, so
            that label ``k`` corresponds to column ``k - 1`` of ``y_pred``.
        y_pred: 2-D array-like of per-class scores with one row per sample
            and one column per class.

    Returns:
        dict with the key ``'roc_auc_micro'`` plus one ``'roc_auc_<k>'`` entry
        for every column of ``y_pred``. A per-class entry is 0 when that class
        is absent from ``y_true`` or is the only class present, because ROC AUC
        is undefined without both positives and negatives.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred)
    # Take the class count from the predictions instead of hard-coding 11, so
    # the per-class loop always matches the one-hot matrix built below.
    n_classes = scores.shape[1]

    # One-hot encode the labels (label k -> column k - 1) for the micro average.
    gt = np.zeros_like(scores, dtype=np.int8)
    gt[np.arange(labels.shape[0]), labels - 1] = 1
    result = {'roc_auc_micro': roc_auc_score(gt, scores, average='micro')}

    for ft in range(1, n_classes + 1):
        is_ft = labels == ft
        if is_ft.all() or not is_ft.any():
            # Only one outcome present for this class: score it as 0.
            roc_auc = 0
        else:
            roc_auc = roc_auc_score(is_ft, scores[:, ft - 1])
        result[f'roc_auc_{ft}'] = roc_auc
    return result

In [3]:
def preprocess(df):
    """Cast columns, add calendar features and re-index ``df`` in place.

    The frame is modified in place and nothing is returned. It must contain
    the columns ``longitude``, ``latitude``, ``date`` (datetime), ``fire_type``,
    ``point_id`` and ``fire_type_name``.

    Effects on ``df``:
        * ``longitude`` / ``latitude`` are cast to float32;
        * ``weekday`` and ``month`` (int8) are derived from ``date``;
        * ``ym`` (int16) is a running month index: month + 12 * (year - 2000);
        * ``fire_type`` is cast to uint8;
        * ``point_id`` becomes the index and ``fire_type_name`` is dropped.
    """
    for coord in ('longitude', 'latitude'):
        df[coord] = df[coord].astype(np.float32)

    when = df.date.dt
    df['weekday'] = when.weekday.astype(np.int8)
    df['month'] = when.month.astype(np.int8)
    months_since_2000 = when.month + (when.year - 2000) * 12
    df['ym'] = months_since_2000.astype(np.int16)

    df['fire_type'] = df.fire_type.astype(np.uint8)
    df.set_index('point_id', inplace=True)
    df.drop(columns=['fire_type_name'], inplace=True)


def prepare_dataset(filename):
    """Read a CSV file and return it as a preprocessed DataFrame.

    Args:
        filename: path of the CSV file; its ``date`` column is parsed as
            datetimes while reading.

    Returns:
        The loaded DataFrame after ``preprocess`` has modified it in place.
    """
    frame = pd.read_csv(filename, parse_dates=['date'])
    preprocess(frame)  # mutates `frame` in place and returns nothing
    return frame

In [4]:
def train_model(df_train):
    """Fit a CatBoost multiclass model with a time-based hold-out and save it.

    Rows whose ``ym`` month index falls within the last ``VAL_MONTHS`` months
    form the validation set; all earlier rows are used for training. The
    ``fire_type`` column is the target, and ``fire_type``, ``ym`` and ``date``
    are excluded from the features. Train and validation scores from
    ``evaluate`` are printed, and the fitted model is written to
    ``catboost.cbm`` under ``MODELS_PATH``.

    Args:
        df_train: preprocessed DataFrame containing ``ym``, ``date`` and
            ``fire_type`` in addition to the feature columns.
    """
    cutoff = df_train.ym.max() - VAL_MONTHS
    non_features = ['fire_type', 'ym', 'date']

    train_part = df_train[df_train.ym <= cutoff]
    val_part = df_train[df_train.ym > cutoff]
    X_train, Y_train = train_part.drop(columns=non_features), train_part.fire_type
    X_val, Y_val = val_part.drop(columns=non_features), val_part.fire_type

    clf = catboost.CatBoostClassifier(
        iterations=ITERATIONS,
        loss_function='MultiClass',
        random_state=SEED,
        verbose=10,
    )
    clf.fit(X_train, Y_train, eval_set=(X_val, Y_val))

    pred_train = clf.predict_proba(X_train)
    pred_val = clf.predict_proba(X_val)
    reports = [
        ("Train scores:", evaluate(Y_train, pred_train)),
        ("Validation scores:", evaluate(Y_val, pred_val)),
    ]
    for title, scores in reports:
        print(title)
        for metric, value in scores.items():
            print("%s\t%f" % (metric, value))

    clf.save_model(os.path.join(MODELS_PATH, 'catboost.cbm'))

In [5]:
# Seed the RNGs, load and preprocess the training CSV, then fit the model,
# print its train/validation scores and save it to disk.
reseed()
df_train = prepare_dataset(os.path.join(DATA_PATH, 'wildfires_train.csv'))
train_model(df_train)

0:	learn: -2.3262322	test: -2.3266224	best: -2.3266224 (0)	total: 372ms	remaining: 6m 12s
10:	learn: -1.9234626	test: -1.9125231	best: -1.9125231 (10)	total: 3.42s	remaining: 5m 7s
20:	learn: -1.7266773	test: -1.7160860	best: -1.7160860 (20)	total: 6.48s	remaining: 5m 2s
30:	learn: -1.6072554	test: -1.5982247	best: -1.5982247 (30)	total: 9.54s	remaining: 4m 58s
40:	learn: -1.5294895	test: -1.5235478	best: -1.5235478 (40)	total: 12.6s	remaining: 4m 54s
50:	learn: -1.4748060	test: -1.4701073	best: -1.4701073 (50)	total: 15.7s	remaining: 4m 51s
60:	learn: -1.4355296	test: -1.4310235	best: -1.4310235 (60)	total: 18.7s	remaining: 4m 47s
70:	learn: -1.4057404	test: -1.4019649	best: -1.4019649 (70)	total: 21.7s	remaining: 4m 43s
80:	learn: -1.3832953	test: -1.3815872	best: -1.3815872 (80)	total: 24.7s	remaining: 4m 40s
90:	learn: -1.3640343	test: -1.3649534	best: -1.3649534 (90)	total: 27.8s	remaining: 4m 37s
100:	learn: -1.3496644	test: -1.3540525	best: -1.3540525 (100)	total: 30.8s	remainin

880:	learn: -1.1702394	test: -1.3070924	best: -1.3063488 (694)	total: 4m 31s	remaining: 36.7s
890:	learn: -1.1692371	test: -1.3071430	best: -1.3063488 (694)	total: 4m 34s	remaining: 33.6s
900:	learn: -1.1686270	test: -1.3072945	best: -1.3063488 (694)	total: 4m 37s	remaining: 30.5s
910:	learn: -1.1679344	test: -1.3074048	best: -1.3063488 (694)	total: 4m 40s	remaining: 27.4s
920:	learn: -1.1671854	test: -1.3072732	best: -1.3063488 (694)	total: 4m 43s	remaining: 24.3s
930:	learn: -1.1664967	test: -1.3073291	best: -1.3063488 (694)	total: 4m 46s	remaining: 21.2s
940:	learn: -1.1657707	test: -1.3074690	best: -1.3063488 (694)	total: 4m 49s	remaining: 18.2s
950:	learn: -1.1652310	test: -1.3075252	best: -1.3063488 (694)	total: 4m 52s	remaining: 15.1s
960:	learn: -1.1643502	test: -1.3076194	best: -1.3063488 (694)	total: 4m 55s	remaining: 12s
970:	learn: -1.1637731	test: -1.3077637	best: -1.3063488 (694)	total: 4m 58s	remaining: 8.92s
980:	learn: -1.1632415	test: -1.3078433	best: -1.3063488 (694)