In [1]:
import os
import random

import catboost
import numpy as np
import pandas as pd
import xarray
from sklearn.metrics import roc_auc_score

import warnings
# Suppress every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Seed used as the default for reseed() and passed to CatBoost as random_state.
SEED = 42
# Number of most recent months (by the `ym` month counter) that train_model()
# holds out as the validation split.
VAL_MONTHS = 6

# Value passed as `iterations` to CatBoostClassifier in train_model().
ITERATIONS = 1000

# Directory that contains wildfires_train.csv and the ncep/ netCDF files.
DATA_PATH = '../data'
# Directory where train_model() writes catboost.cbm.
MODELS_PATH = './'

In [2]:
def reseed(seed=SEED):
    """Seed the global NumPy and stdlib ``random`` generators with ``seed``.

    Parameters
    ----------
    seed : int, optional
        Seed value; defaults to the module-level ``SEED``.
    """
    # NumPy first, then the stdlib generator.
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)


def evaluate(y_true, y_pred):
    """Compute micro-averaged and per-class one-vs-rest ROC AUC.

    Parameters
    ----------
    y_true : numpy.ndarray or pandas.Series of int, shape (n_samples,)
        1-based class labels: label ``k`` is matched with column ``k - 1``
        of ``y_pred``.
    y_pred : numpy.ndarray, shape (n_samples, n_classes)
        Per-class scores.

    Returns
    -------
    dict
        ``'roc_auc_micro'`` plus one ``'roc_auc_<k>'`` entry for every class
        ``k`` in ``1..n_classes``. A class whose indicator is constant in
        ``y_true`` (all present or all absent) has an undefined AUC and is
        reported as ``0``.
    """
    # Take the class count from the prediction matrix instead of hard-coding
    # it, so the per-class loop always agrees with the one-hot target below.
    n_classes = y_pred.shape[1]

    # One-hot encode the labels for the micro-averaged score.
    gt = np.zeros_like(y_pred, dtype=np.int8)
    gt[np.arange(y_true.shape[0]), y_true - 1] = 1
    result = {'roc_auc_micro': roc_auc_score(gt, y_pred, average='micro')}

    for ft in range(1, n_classes + 1):
        is_class = (y_true == ft)
        if is_class.max() == is_class.min():
            # Only one outcome present: ROC AUC is undefined for this class.
            roc_auc = 0
        else:
            roc_auc = roc_auc_score(is_class, y_pred[:, ft - 1])
        result[f'roc_auc_{ft}'] = roc_auc
    return result

In [3]:
def preprocess(df):
    """Normalise dtypes and add calendar features to ``df`` in place.

    The frame is modified in place and nothing is returned:

    * ``longitude`` / ``latitude`` are cast to float32;
    * ``weekday`` and ``month`` (int8) are derived from ``date``;
    * ``ym`` (int16) is a running month counter, ``month + 12 * (year - 2000)``;
    * ``fire_type`` is cast to uint8;
    * ``point_id`` becomes the index and ``fire_type_name`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``point_id``, ``date`` (datetime), ``longitude``,
        ``latitude``, ``fire_type`` and ``fire_type_name`` columns.
    """
    for coord in ('longitude', 'latitude'):
        df[coord] = df[coord].astype(np.float32)

    when = df['date'].dt
    df['weekday'] = when.weekday.astype(np.int8)
    df['month'] = when.month.astype(np.int8)
    months_since_2000 = (when.year - 2000) * 12 + when.month
    df['ym'] = months_since_2000.astype(np.int16)

    df['fire_type'] = df['fire_type'].astype(np.uint8)
    df.set_index('point_id', inplace=True)
    df.drop(columns=['fire_type_name'], inplace=True)


def load_ncep_var(var, press_level, years=range(2012, 2020)):
    """Load one NCEP variable at one pressure level and add rolling means.

    Reads ``<DATA_PATH>/ncep/<var>.<year>.nc`` for every year in ``years``,
    keeps the requested ``level`` and a fixed latitude/longitude window,
    and flattens the result to a data frame together with trailing rolling
    means over 7, 14 and 30 time steps.

    Parameters
    ----------
    var : str
        Variable name; used both in the file name and as the data variable
        read from each file.
    press_level : scalar
        Value of the ``level`` coordinate to select.
    years : iterable of int, optional
        Years whose files are loaded. Defaults to 2012..2019 inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per (time, lat, lon) cell with columns ``time``, ``lat``,
        ``lon``, ``<var>``, ``<var>_7d``, ``<var>_14d`` and ``<var>_30d``.
        ``lat`` / ``lon`` are int8 grid indices (coordinate / 2.5, rounded).
    """
    yearly = []
    for year in years:
        dataset_filename = os.path.join(DATA_PATH, 'ncep', f'{var}.{year}.nc')
        # open_dataset reads lazily and keeps the file handle open. Load just
        # the slice we need and close the file before moving on, so handles
        # do not accumulate across years and variables.
        with xarray.open_dataset(dataset_filename) as year_ds:
            da = year_ds.sel(drop=True, level=press_level)[var]
            # Keep grid indices 15..29 (lat) and 6..71 (lon) in units of 2.5;
            # the +/-0.1 slack presumably guards against float round-off in
            # the stored coordinates.
            da = da[:, (da.lat >= 15 * 2.5 - 0.1) & (da.lat <= 29 * 2.5 + 0.1),
                    (da.lon >= 6 * 2.5 - 0.1) & (da.lon <= 71 * 2.5 + 0.1)]
            yearly.append(da.load())
    ds = xarray.merge(yearly)
    df = ds.to_dataframe()[[var]].reset_index()

    # Rolling means are computed on the merged multi-year series so windows
    # span year boundaries. The suffixes must stay '_7d', '_14d', '_30d'
    # because add_ncep_features() looks the columns up by these names.
    for window in (7, 14, 30):
        rolled = ds.rolling(time=window).mean().to_dataframe()[[var]].reset_index()
        df = df.merge(rolled, on=['lon', 'lat', 'time'],
                      suffixes=('', f'_{window}d'), how='left')

    # Replace coordinates by integer grid indices used as join keys.
    df['lat'] = np.round(df.lat / 2.5).astype(np.int8)
    df['lon'] = np.round(df.lon / 2.5).astype(np.int8)
    return df.copy()


def add_ncep_features(df):
    """Attach NCEP weather columns to ``df`` in place.

    For each of ``air``, ``uwnd`` and ``rhum`` at level 1000 the frame gains
    the raw value plus its ``_7d``, ``_14d`` and ``_30d`` variants, joined on
    the rounded 2.5-unit grid cell and the ``date`` column. The temporary
    ``lon`` / ``lat`` join keys are removed again before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to be indexed by ``point_id`` (as left by ``preprocess``)
        and to contain ``longitude``, ``latitude`` and ``date``.
    """
    grid_step = 2.5
    df['lon'] = np.round(df['longitude'] / grid_step).astype(np.int8)
    df['lat'] = np.round(df['latitude'] / grid_step).astype(np.int8)

    ncep_sources = (('air', 1000), ('uwnd', 1000), ('rhum', 1000))
    column_suffixes = ('', '_7d', '_14d', '_30d')

    for var, press_level in ncep_sources:
        weather = load_ncep_var(var, press_level)
        joined = df.reset_index().merge(
            weather,
            how='left',
            left_on=['lon', 'lat', 'date'],
            right_on=['lon', 'lat', 'time'],
        )
        joined = joined.set_index('point_id')
        # Assignment aligns on the point_id index.
        for suffix in column_suffixes:
            column = var + suffix
            df[column] = joined[column]

    df.drop(columns=['lon', 'lat'], inplace=True)


def prepare_dataset(filename):
    """Read a CSV file and return the fully prepared feature frame.

    Parameters
    ----------
    filename : str
        Path to the CSV file; its ``date`` column is parsed as datetime.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``preprocess`` and ``add_ncep_features`` have
        been applied to it in place, in that order.
    """
    dataset = pd.read_csv(filename, parse_dates=['date'])
    # Both steps mutate the frame in place and return nothing.
    for step in (preprocess, add_ncep_features):
        step(dataset)
    return dataset

In [4]:
def train_model(df_train):
    """Fit a CatBoost multiclass model with a time-based validation split.

    Rows from the last ``VAL_MONTHS`` months (by the ``ym`` counter) form the
    validation set and everything earlier is used for fitting. All columns
    except ``fire_type``, ``ym`` and ``date`` are used as features. Train and
    validation scores from ``evaluate`` are printed, and the fitted model is
    saved as ``catboost.cbm`` under ``MODELS_PATH``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Prepared frame containing ``fire_type``, ``ym`` and ``date`` columns.
    """
    non_features = ['fire_type', 'ym', 'date']
    cutoff = df_train['ym'].max() - VAL_MONTHS

    fit_part = df_train[df_train['ym'] <= cutoff]
    holdout = df_train[df_train['ym'] > cutoff]

    X_fit = fit_part.drop(columns=non_features)
    y_fit = fit_part['fire_type']
    X_hold = holdout.drop(columns=non_features)
    y_hold = holdout['fire_type']

    model = catboost.CatBoostClassifier(
        iterations=ITERATIONS,
        loss_function='MultiClass',
        random_state=SEED,
        verbose=10,
    )
    model.fit(X_fit, y_fit, eval_set=(X_hold, y_hold))

    proba_fit = model.predict_proba(X_fit)
    proba_hold = model.predict_proba(X_hold)
    reports = (
        ("Train scores:", evaluate(y_fit, proba_fit)),
        ("Validation scores:", evaluate(y_hold, proba_hold)),
    )
    for header, scores in reports:
        print(header)
        for metric, value in scores.items():
            print("%s\t%f" % (metric, value))

    model.save_model(os.path.join(MODELS_PATH, 'catboost.cbm'))

In [5]:
# Seed the NumPy and stdlib RNGs before loading data and training.
reseed()
# Build the training frame: CSV columns plus the features added by prepare_dataset().
df_train = prepare_dataset(os.path.join(DATA_PATH, 'wildfires_train.csv'))
# Fit the model, print train/validation scores and save it under MODELS_PATH
# (the CatBoost iteration log below is produced by this call).
train_model(df_train)

0:	learn: -2.3238845	test: -2.3252709	best: -2.3252709 (0)	total: 498ms	remaining: 8m 17s
10:	learn: -1.9080767	test: -1.9101533	best: -1.9101533 (10)	total: 4.93s	remaining: 7m 23s
20:	learn: -1.7034182	test: -1.7094338	best: -1.7094338 (20)	total: 9.47s	remaining: 7m 21s
30:	learn: -1.5806662	test: -1.5896671	best: -1.5896671 (30)	total: 13.9s	remaining: 7m 16s
40:	learn: -1.4978668	test: -1.5104050	best: -1.5104050 (40)	total: 18.5s	remaining: 7m 12s
50:	learn: -1.4411849	test: -1.4552593	best: -1.4552593 (50)	total: 22.9s	remaining: 7m 5s
60:	learn: -1.4002119	test: -1.4139887	best: -1.4139887 (60)	total: 27s	remaining: 6m 56s
70:	learn: -1.3693466	test: -1.3858163	best: -1.3858163 (70)	total: 31.3s	remaining: 6m 48s
80:	learn: -1.3457633	test: -1.3637098	best: -1.3637098 (80)	total: 35.4s	remaining: 6m 41s
90:	learn: -1.3262349	test: -1.3484557	best: -1.3484557 (90)	total: 39.4s	remaining: 6m 33s
100:	learn: -1.3109866	test: -1.3360279	best: -1.3360279 (100)	total: 43.5s	remaining

880:	learn: -1.0706994	test: -1.2831025	best: -1.2807532 (613)	total: 6m 6s	remaining: 49.5s
890:	learn: -1.0692151	test: -1.2834898	best: -1.2807532 (613)	total: 6m 10s	remaining: 45.4s
900:	learn: -1.0680805	test: -1.2838537	best: -1.2807532 (613)	total: 6m 14s	remaining: 41.2s
910:	learn: -1.0668607	test: -1.2843404	best: -1.2807532 (613)	total: 6m 18s	remaining: 37s
920:	learn: -1.0656807	test: -1.2843885	best: -1.2807532 (613)	total: 6m 23s	remaining: 32.9s
930:	learn: -1.0646256	test: -1.2846452	best: -1.2807532 (613)	total: 6m 27s	remaining: 28.7s
940:	learn: -1.0633917	test: -1.2844535	best: -1.2807532 (613)	total: 6m 31s	remaining: 24.5s
950:	learn: -1.0620970	test: -1.2842736	best: -1.2807532 (613)	total: 6m 35s	remaining: 20.4s
960:	learn: -1.0608761	test: -1.2846385	best: -1.2807532 (613)	total: 6m 39s	remaining: 16.2s
970:	learn: -1.0594621	test: -1.2848528	best: -1.2807532 (613)	total: 6m 43s	remaining: 12.1s
980:	learn: -1.0580934	test: -1.2850119	best: -1.2807532 (613)	