In [1]:
import os
import random
import gzip

import catboost
import numpy as np
import pandas as pd
import xarray
from sklearn.metrics import roc_auc_score

import geopandas
import gdal, ogr
from shapely import wkb
from shapely.geometry import box
from collections import defaultdict
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Seed used by `reseed()` and passed to CatBoost as `random_state`.
SEED = 42
# Number of most recent months (by the `ym` column) held out for validation in `train_model`.
VAL_MONTHS = 6

# Number of boosting iterations passed to CatBoostClassifier.
ITERATIONS = 1000

# Input directory: the training CSV and the `ncep/` NetCDF files are read from here.
DATA_PATH = '../data'
# Directory the trained model is written to; the OSM archive path is also built from it.
MODELS_PATH = './'
# Gzip-compressed file read by `add_osm_features` through geopandas.
OSM_GEO_DATA = os.path.join(MODELS_PATH, 'russia.osm.gpkg.gz')

In [2]:
def reseed(seed=SEED):
    """Seed NumPy's global RNG and the stdlib ``random`` module with ``seed``."""
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)


def evaluate(y_true, y_pred):
    """Compute ROC AUC scores for multiclass predictions.

    Args:
        y_true: 1-based integer class labels, one per sample.
        y_pred: 2-D array of per-class scores; column ``k - 1`` holds the
            score for label ``k``.

    Returns:
        dict with ``roc_auc_micro`` (micro-averaged against a one-hot encoding
        of ``y_true``) followed by ``roc_auc_<k>`` for each label 1..11.
        A label whose indicator is constant across ``y_true`` is scored 0.
    """
    n_samples = y_true.shape[0]
    one_hot = np.zeros_like(y_pred, dtype=np.int8)
    one_hot[np.arange(n_samples), y_true - 1] = 1

    scores = {'roc_auc_micro': roc_auc_score(one_hot, y_pred, average='micro')}
    for label in range(1, 12):
        is_label = (y_true == label)
        # ROC AUC is undefined when only one outcome is present; report 0 instead.
        single_outcome = is_label.max() == is_label.min()
        scores[f'roc_auc_{label}'] = (
            0 if single_outcome else roc_auc_score(is_label, y_pred[:, label - 1])
        )
    return scores

In [3]:
def preprocess(df):
    """Downcast dtypes and derive calendar columns on ``df`` in place.

    Expects the columns ``longitude``, ``latitude``, ``date`` (datetime),
    ``fire_type``, ``fire_id`` and ``fire_type_name``.  After the call:

    * ``longitude`` / ``latitude`` are float32 and ``fire_type`` is uint8;
    * ``weekday`` and ``month`` (int8) and ``ym`` (int16, month number plus
      12 * (year - 2000)) are added;
    * ``fire_id`` is the index and ``fire_type_name`` is removed.

    Returns None; the caller's frame is modified.
    """
    for coord in ('longitude', 'latitude'):
        df[coord] = df[coord].astype(np.float32)

    when = df['date'].dt
    df['weekday'] = when.weekday.astype(np.int8)
    df['month'] = when.month.astype(np.int8)
    month_index = when.month + (when.year - 2000) * 12
    df['ym'] = month_index.astype(np.int16)

    df['fire_type'] = df['fire_type'].astype(np.uint8)
    df.set_index('fire_id', inplace=True)
    df.drop(columns=['fire_type_name'], inplace=True)


def load_ncep_var(var, press_level):
    """Load one NCEP variable for 2012-2019 together with rolling means.

    For every year the file ``<DATA_PATH>/ncep/<var>.<year>.nc`` is opened,
    restricted to the requested pressure level and to a fixed latitude /
    longitude window, loaded into memory and closed again.  The yearly pieces
    are merged and flattened into a long DataFrame.

    Args:
        var: NCEP variable name; it is the NetCDF file prefix, the variable
            read from each file and the name of the value column.
        press_level: value of the ``level`` coordinate to select.

    Returns:
        DataFrame with ``time``, ``lat``, ``lon``, ``<var>`` and the columns
        ``<var>_7d``, ``<var>_14d``, ``<var>_30d`` holding rolling means over
        7, 14 and 30 steps of the ``time`` dimension.  ``lat`` and ``lon`` are
        converted to int8 grid indices (coordinate / 2.5, rounded), matching
        the keys built in ``add_ncep_features``.
    """
    # Window bounds, expressed as multiples of the 2.5-degree step with a small tolerance.
    lat_lo, lat_hi = 15 * 2.5 - 0.1, 29 * 2.5 + 0.1
    lon_lo, lon_hi = 6 * 2.5 - 0.1, 71 * 2.5 + 0.1

    yearly = []
    for year in range(2012, 2020):
        dataset_filename = os.path.join(DATA_PATH, 'ncep', f'{var}.{year}.nc')
        # open_dataset is lazy and keeps the file handle open, so pull the
        # subset into memory and let the context manager close the file.
        with xarray.open_dataset(dataset_filename) as dataset:
            field = dataset.sel(drop=True, level=press_level)[var]
            field = field[:, (field.lat >= lat_lo) & (field.lat <= lat_hi),
                          (field.lon >= lon_lo) & (field.lon <= lon_hi)]
            yearly.append(field.load())
    merged = xarray.merge(yearly)
    df = merged.to_dataframe()[[var]].reset_index()

    # Left-join one rolling-mean column per window; the raw column keeps its
    # name (empty left suffix) and the joined one gets the `_<N>d` suffix.
    for window in (7, 14, 30):
        rolled = merged.rolling(time=window).mean().to_dataframe()[[var]].reset_index()
        df = df.merge(rolled, on=['lon', 'lat', 'time'],
                      suffixes=('', f'_{window}d'), how='left')

    df['lat'] = np.round(df.lat / 2.5).astype(np.int8)
    df['lon'] = np.round(df.lon / 2.5).astype(np.int8)
    return df


def add_ncep_features(df):
    """Attach NCEP weather columns to ``df`` in place.

    Each row is mapped to a grid cell by dividing ``longitude`` / ``latitude``
    by 2.5 and rounding, then left-joined on (cell, ``date``) with the frames
    returned by ``load_ncep_var`` for ``air``, ``uwnd`` and ``rhum`` at level
    1000.  For every variable the raw value and its ``_7d``, ``_14d`` and
    ``_30d`` columns are copied onto ``df``; rows without a match get NaN.
    The temporary ``lon`` / ``lat`` key columns are removed before returning.

    ``df`` must be indexed by ``fire_id``.  Returns None.
    """
    grid_step = 2.5
    df['lon'] = np.round(df.longitude / grid_step).astype(np.int8)
    df['lat'] = np.round(df.latitude / grid_step).astype(np.int8)

    ncep_sources = (('air', 1000), ('uwnd', 1000), ('rhum', 1000))
    window_suffixes = ('', '_7d', '_14d', '_30d')
    for var, press_level in ncep_sources:
        var_df = load_ncep_var(var, press_level)
        fires = df.reset_index()
        joined = fires.merge(
            var_df,
            how='left',
            left_on=['lon', 'lat', 'date'],
            right_on=['lon', 'lat', 'time'],
        )
        # Re-index by fire_id so the assignment below aligns with ``df``.
        joined = joined.set_index('fire_id')
        for column in (var + suffix for suffix in window_suffixes):
            df[column] = joined[column]

    df.drop(columns=['lon', 'lat'], inplace=True)


def add_osm_features(df):
    """Attach OpenStreetMap-derived columns to ``df`` in place.

    Reads the gzip-compressed file at ``OSM_GEO_DATA`` with geopandas, builds
    a 0.1 x 0.1 box centred on each row's (``longitude``, ``latitude``), and
    left-joins the boxes with the OSM geometries they intersect.  The OSM
    ``ids`` and ``names`` columns are discarded; the remaining columns are
    averaged per ``fire_id`` (missing values become 0) and copied onto ``df``.

    ``df`` must be indexed by ``fire_id``.  Returns None.
    """
    with gzip.open(OSM_GEO_DATA, 'rb') as archive:
        osm_df = geopandas.read_file(archive, crs="epsg:4326")

    POINT_SIZE_X = 0.1
    POINT_SIZE_Y = 0.1
    half_width = POINT_SIZE_X / 2
    half_height = POINT_SIZE_Y / 2

    def cell_around(row):
        # Axis-aligned rectangle centred on the fire location.
        return box(
            row.longitude - half_width, row.latitude - half_height,
            row.longitude + half_width, row.latitude + half_height
        )

    fires = df.reset_index()
    fire_cells = geopandas.GeoDataFrame(
        fires[['fire_id']],
        geometry=fires.apply(cell_around, axis=1),
        crs="+init=epsg:4326",
    )

    osm_tags = osm_df.drop(['ids', 'names'], axis=1)
    joined = geopandas.sjoin(fire_cells, osm_tags, how='left', op='intersects')
    joined = joined.drop(['geometry', 'index_right'], axis=1)
    # One row per fire: mean of each column over all intersecting OSM objects.
    geo_features = joined.groupby('fire_id').mean().fillna(0)

    for col in geo_features.columns:
        df[col] = geo_features[col]


def prepare_dataset(filename):
    """Read a fires CSV and run the full feature pipeline on it.

    Args:
        filename: path (or file-like object) handed to ``pandas.read_csv``;
            the ``date`` column is parsed as datetimes.

    Returns:
        The DataFrame after ``preprocess``, ``add_ncep_features`` and
        ``add_osm_features`` have each modified it in place, in that order.
    """
    df = pd.read_csv(filename, parse_dates=['date'])
    # Stages mutate ``df`` in place; preprocess must run first because the
    # later stages rely on the ``fire_id`` index it sets.
    for stage in (preprocess, add_ncep_features, add_osm_features):
        stage(df)
    return df

In [4]:
def train_model(df_train):
    """Fit a CatBoost multiclass model with a time-based hold-out and save it.

    Rows whose ``ym`` falls in the last ``VAL_MONTHS`` months form the
    validation set; everything earlier is used for training.  ``fire_type`` is
    the target and ``fire_type``, ``ym`` and ``date`` are excluded from the
    features.  ROC AUC scores from ``evaluate`` are printed for both splits
    and the model is written to ``<MODELS_PATH>/catboost.cbm``.

    Returns None.
    """
    cutoff = df_train.ym.max() - VAL_MONTHS
    non_features = ['fire_type', 'ym', 'date']

    train = df_train[df_train.ym <= cutoff]
    val = df_train[df_train.ym > cutoff]
    X_train, Y_train = train.drop(non_features, axis=1), train.fire_type
    X_val, Y_val = val.drop(non_features, axis=1), val.fire_type

    clf = catboost.CatBoostClassifier(
        loss_function='MultiClass',
        verbose=10,
        random_state=SEED,
        iterations=ITERATIONS,
    )
    clf.fit(X_train, Y_train, eval_set=(X_val, Y_val))

    pred_train = clf.predict_proba(X_train)
    pred_val = clf.predict_proba(X_val)
    reports = (
        ("Train scores:", evaluate(Y_train, pred_train)),
        ("Validation scores:", evaluate(Y_val, pred_val)),
    )
    for heading, scores in reports:
        print(heading)
        for metric, value in scores.items():
            print("%s\t%f" % (metric, value))

    clf.save_model(os.path.join(MODELS_PATH, 'catboost.cbm'))

In [5]:
# Seed the global RNGs before any data preparation.
reseed()
# Read the training CSV and attach the NCEP and OSM feature columns.
df_train = prepare_dataset(os.path.join(DATA_PATH, 'wildfires_train.csv'))
# Last expression of the cell: display the first rows of the prepared frame.
df_train.head()

Unnamed: 0_level_0,date,latitude,longitude,fire_type,weekday,month,ym,air,air_7d,air_14d,air_30d,uwnd,uwnd_7d,uwnd_14d,uwnd_30d,rhum,rhum_7d,rhum_14d,rhum_30d,city,town,village,neighbourhood,hamlet,locality,continent,suburb,isolated_dwelling,allotments,island,region,sea,county,mountain_range,peninsula,quarter,islet,country,state,farm,archipelago,islands,allotments_set,historic,subdistrict,square,wall,дом Малькова,plot,yard,neighbouhood,unknown,wood,school,water,yes,residential,wetland,forest,commercial,apartments,scrub,public,university,stadium,grassland,hospital,reservoir,office,college,kindergarten,clinic,grass,meadow,hotel,peat_cutting,farmland,industrial,construction,education,garages,heath,quarry,village_green,fell,spit,municipality,policlinic,civic,store,recreation_ground,landfill,cemetery,train_station,national_reserve,orchard,farmyard,sand,retail,beach,castle,offices,railway,bay,natural_reserve,lava,station,military,greenfield,cathedral,mud,dormitory,brownfield,service,grandstand,building:part,house,garage,roof,church,goverment,greenhouse_horticulture,basin,depot
fire_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1
0,2012-01-01,42.913441,133.887375,4,6,1,145,267.690002,,,,-0.919998,,,,73.0,,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2012-01-01,43.378616,131.772263,3,6,1,145,263.070007,,,,0.979996,,,,73.5,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2012-01-01,42.634132,130.479111,4,6,1,145,259.190002,,,,2.020004,,,,70.5,,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2012-01-02,43.108372,132.001053,11,0,1,145,261.649994,,,,1.789993,,,,74.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2012-01-02,42.890823,131.337418,4,0,1,145,261.649994,,,,1.789993,,,,74.0,,,,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Fit the CatBoost model, print train/validation ROC AUC and save it under MODELS_PATH.
train_model(df_train)

0:	learn: -2.3238845	test: -2.3252709	best: -2.3252709 (0)	total: 1.45s	remaining: 24m 10s
10:	learn: -1.9087596	test: -1.9114929	best: -1.9114929 (10)	total: 15.4s	remaining: 23m 2s
20:	learn: -1.7033276	test: -1.7079372	best: -1.7079372 (20)	total: 29.8s	remaining: 23m 9s
30:	learn: -1.5798891	test: -1.5898738	best: -1.5898738 (30)	total: 44.4s	remaining: 23m 6s
40:	learn: -1.4987338	test: -1.5119071	best: -1.5119071 (40)	total: 58.1s	remaining: 22m 39s
50:	learn: -1.4423772	test: -1.4580228	best: -1.4580228 (50)	total: 1m 12s	remaining: 22m 23s
60:	learn: -1.4015301	test: -1.4168775	best: -1.4168775 (60)	total: 1m 25s	remaining: 21m 55s
70:	learn: -1.3706961	test: -1.3888371	best: -1.3888371 (70)	total: 1m 44s	remaining: 22m 51s
80:	learn: -1.3456407	test: -1.3664200	best: -1.3664200 (80)	total: 2m 4s	remaining: 23m 35s
90:	learn: -1.3257957	test: -1.3490645	best: -1.3490645 (90)	total: 2m 26s	remaining: 24m 26s
100:	learn: -1.3090453	test: -1.3357869	best: -1.3357869 (100)	total: 2

860:	learn: -1.0796545	test: -1.2742185	best: -1.2725547 (741)	total: 19m 31s	remaining: 3m 9s
870:	learn: -1.0784935	test: -1.2742754	best: -1.2725547 (741)	total: 19m 42s	remaining: 2m 55s
880:	learn: -1.0771118	test: -1.2742547	best: -1.2725547 (741)	total: 19m 53s	remaining: 2m 41s
890:	learn: -1.0758738	test: -1.2744624	best: -1.2725547 (741)	total: 20m 5s	remaining: 2m 27s
900:	learn: -1.0746544	test: -1.2744697	best: -1.2725547 (741)	total: 20m 16s	remaining: 2m 13s
910:	learn: -1.0734512	test: -1.2742781	best: -1.2725547 (741)	total: 20m 27s	remaining: 1m 59s
920:	learn: -1.0723904	test: -1.2740988	best: -1.2725547 (741)	total: 20m 39s	remaining: 1m 46s
930:	learn: -1.0715045	test: -1.2740738	best: -1.2725547 (741)	total: 20m 49s	remaining: 1m 32s
940:	learn: -1.0706211	test: -1.2740541	best: -1.2725547 (741)	total: 20m 59s	remaining: 1m 18s
950:	learn: -1.0695847	test: -1.2746465	best: -1.2725547 (741)	total: 21m 10s	remaining: 1m 5s
960:	learn: -1.0686575	test: -1.2748889	bes