In [1]:
import pickle
from sklearn.preprocessing import LabelEncoder

import random
import os
import pandas as pd
import xarray
import requests
import numpy as np
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
from sklearn.neighbors import KDTree
from tqdm import tqdm_notebook
from sklearn.cluster import DBSCAN

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import catboost

SEED = 42
VAL_MONTHS = 6

ITERATIONS = 1000

DATA_PATH = '../data'
MODELS_PATH = './'

df_train = pd.read_csv('data/wildfires_train.csv')

def data_preproc(df_output): 
    df_output['longitude'] = df_output['longitude'].astype(np.float32)
    df_output['latitude'] = df_output['latitude'].astype(np.float32)
    df_output.date=pd.to_datetime(df_output.date)
    #df_output['weekday'] = df_output.date.dt.weekday.astype(np.int8)
    df_output['month'] = df_output.date.dt.month.astype(np.int8)
    #df_output['year'] = df_output.date.dt.year.astype(np.int16)
    df_output['ym'] = (df_output.date.dt.month + (df_output.date.dt.year - 2000) * 12).astype(np.int16)

    df_output.set_index('fire_id', inplace=True)
    df_output.drop(['fire_type_name'], axis=1, inplace=True, errors='ignore')
    
    #df_output['season'] = df_output.month.apply(lambda x: (x%12 + 3)//3)
    df_output['day_number'] = df_output.date.apply(lambda x: x.strftime('%j')) 
    df_output.day_number = pd.to_numeric(df_output.day_number)
    df_output['sin_day'] = np.sin(2*np.pi*df_output.day_number/365)
    df_output['cos_day'] = np.cos(2*np.pi*df_output.day_number/365)

def make_geo_features(df):
    coords =[np.array(df.latitude), np.array(df.longitude)]
    coords = np.array(coords).T

    radiuses = {
        "1km":0.02,
        "2km":0.03,
        "3km":0.05,
    }

    for r in tqdm_notebook(radiuses.keys()):
        overall_tree = KDTree(coords)
        df["{}_overall".format(r)] = overall_tree.query_radius(coords[:,:], r=radiuses[r], count_only=True)
        del overall_tree
        
    dbsc = DBSCAN(eps=.02)
    df["dbscan_label"] = dbsc.fit_predict(coords)
    
    city_dict = df.dbscan_label.value_counts().to_dict()
    df["dbscan_label_count"] = df["dbscan_label"].replace(city_dict)
    df.loc[df.dbscan_label == -1, "dbscan_label_count"] = 1
    
def encode_cities(df, columns):
    for column in columns:
        df[column]=LabelEncoder().fit_transform(df[column])
    
def add_cities_encode(df):
    coords =[np.array(df.latitude), np.array(df.longitude)]
    coords = np.array(coords).T

    cdf = pd.read_csv("data/cities.csv")
    cdf = cdf[["Регион", "Район", "Город", "Признак центра района или региона", "Широта", "Долгота", "Федеральный округ"]]
    cdf.columns = ["region", "subregion", "city", "is_reg_center", "lat", "lon", "federal_distr"]
    cdf.loc[cdf.region.isin(["Москва", "Санкт-Петербург"]), "is_reg_center"] = 2
    cdf.loc[pd.isna(cdf.subregion), "subregion"] = cdf.loc[pd.isna(cdf.subregion), "region"]
    cdf.loc[pd.isna(cdf.city), "city"] = cdf.loc[pd.isna(cdf.city), "subregion"]
    #cdf["full_geo"] = cdf.apply(lambda x: " / ".join([x.region, x.subregion, x.city]), axis=1)
    city_tree = KDTree(cdf[["lat", "lon"]].values)
    df["city_distance"], df["city_id"] = city_tree.query(coords[:,:])
    df = df.merge(cdf, left_on="city_id", right_index=True)
    df = df.drop(['lat', 'lon', 'city_id'], axis=1)

    encode_cities(df, ["region", "subregion", "city", "federal_distr"])
    #df=pd.get_dummies(df, prefix=['reg'], columns=['federal_distr'])
    #cat_enc = ce.cat_boost.CatBoostEncoder(
    #cols = ["region", "subregion", "city", "federal_distr"], handle_missing = "value", handle_unknown = "value", return_df = True,)
    #df[["region", "subregion", "city", "federal_distr"]]=cat_enc.fit_transform(df[["region", "subregion", "city", "federal_distr"]], df.fire_type)
    
    #with open('encoder.pickle', 'wb') as fout:
        #pickle.dump(cat_enc, fout, protocol=pickle.HIGHEST_PROTOCOL)       
    return df
    
def evaluate(y_true, y_pred):
    gt = np.zeros_like(y_pred, dtype=np.int8)
    gt[np.arange(y_true.shape[0]), y_true - 1] = 1
    result = {'roc_auc_micro': roc_auc_score(gt, y_pred, average='micro')}
    for ft in range(1, 12):
        gt = (y_true == ft)
        if gt.max() == gt.min():
            roc_auc = 0
        else:
            roc_auc = roc_auc_score(gt, y_pred[:, ft - 1])
        result[f'roc_auc_{ft}'] = roc_auc
    return result

In [2]:
data_preproc(df_train)
make_geo_features(df_train)
df_train=add_cities_encode(df_train)
df_train

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




Unnamed: 0_level_0,date,latitude,longitude,fire_type,month,ym,day_number,sin_day,cos_day,1km_overall,2km_overall,3km_overall,dbscan_label,dbscan_label_count,city_distance,region,subregion,city,is_reg_center,federal_distr
fire_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,2012-01-01,42.913441,133.887375,4,1,145,1,0.017213,0.999852,9,17,25,0,31,0.790567,52,428,697,0,0
1225,2012-05-29,42.938869,133.166107,8,5,149,150,0.530730,-0.847541,6,11,27,48,392,0.193222,52,428,697,0,0
9157,2012-11-03,43.077335,133.070526,10,11,155,308,-0.831171,0.556017,6,14,30,48,392,0.075469,52,428,697,0,0
9164,2012-11-03,43.387684,133.350159,10,11,155,308,-0.831171,0.556017,11,17,30,340,29,0.342752,52,428,697,0,0
9185,2012-11-03,43.359482,133.343323,10,11,155,308,-0.831171,0.556017,8,17,34,340,29,0.317198,52,428,697,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142784,2018-04-19,58.344387,59.834019,9,4,220,109,0.953681,-0.300820,1,1,1,-1,1,0.021299,61,462,147,0,5
151888,2018-08-19,44.551430,34.143875,11,8,224,231,-0.741222,-0.671260,1,1,1,-1,1,0.060573,33,267,1061,0,7
164897,2019-04-05,58.085464,59.755451,4,4,232,95,0.997917,-0.064508,1,1,1,-1,1,0.197289,61,462,479,0,5
173816,2019-04-25,57.425827,41.977325,9,4,232,115,0.917584,-0.397543,1,1,2,-1,1,0.045172,17,221,584,0,6


In [3]:
X = df_train.drop(['date', 'fire_type'], axis=1).fillna(0)
y = df_train['fire_type']

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = catboost.CatBoostClassifier(loss_function='MultiClass',
                                      verbose=10, random_state=SEED, iterations=ITERATIONS)
clf.fit(X, y)

0:	learn: 2.3246463	total: 459ms	remaining: 7m 38s
10:	learn: 1.9044461	total: 3.88s	remaining: 5m 48s
20:	learn: 1.6968130	total: 7.01s	remaining: 5m 27s
30:	learn: 1.5615544	total: 10.2s	remaining: 5m 18s
40:	learn: 1.4722605	total: 14s	remaining: 5m 28s
50:	learn: 1.4084989	total: 21.6s	remaining: 6m 41s
60:	learn: 1.3617502	total: 36.3s	remaining: 9m 18s
70:	learn: 1.3265868	total: 43.9s	remaining: 9m 34s
80:	learn: 1.2987592	total: 47.6s	remaining: 9m
90:	learn: 1.2763307	total: 51s	remaining: 8m 29s
100:	learn: 1.2587100	total: 54.5s	remaining: 8m 5s
110:	learn: 1.2426790	total: 57.8s	remaining: 7m 42s
120:	learn: 1.2286366	total: 1m 1s	remaining: 7m 23s
130:	learn: 1.2169704	total: 1m 4s	remaining: 7m 8s
140:	learn: 1.2067556	total: 1m 8s	remaining: 6m 59s
150:	learn: 1.1964181	total: 1m 13s	remaining: 6m 53s
160:	learn: 1.1866651	total: 1m 16s	remaining: 6m 39s
170:	learn: 1.1796454	total: 1m 20s	remaining: 6m 27s
180:	learn: 1.1720155	total: 1m 23s	remaining: 6m 17s
190:	learn

<catboost.core.CatBoostClassifier at 0x10c859b70>

In [4]:
with open('model_clf_cities.pickle', 'wb') as fout:
    pickle.dump(clf, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
X.columns

Index(['latitude', 'longitude', 'month', 'ym', 'day_number', 'sin_day',
       'cos_day', '1km_overall', '2km_overall', '3km_overall', 'dbscan_label',
       'dbscan_label_count', 'city_distance', 'region', 'subregion', 'city',
       'is_reg_center', 'federal_distr'],
      dtype='object')

In [6]:
df_test = pd.read_csv('data/wildfires_check.csv')
data_preproc(df_test)
make_geo_features(df_test)
df_test=add_cities_encode(df_test.drop(['fire_type'], axis=1))

with open('model_clf_cities.pickle', 'rb') as fin:
        classify = pickle.load(fin)

df_test.drop(['date'], axis=1, inplace=True)
classify.predict_proba(df_test)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




array([[5.02604515e-04, 2.13972134e-03, 2.06811900e-02, 3.01755194e-03,
        1.95295515e-02, 1.57444786e-01, 5.96273824e-05, 3.61641367e-01,
        3.14412077e-01, 6.70230234e-02, 5.35484999e-02],
       [2.94915199e-04, 1.40175487e-02, 3.16638675e-02, 2.36048155e-03,
        1.95641924e-02, 6.28381264e-02, 5.91838899e-05, 3.46090139e-01,
        4.14960353e-01, 5.87981453e-02, 4.93530473e-02],
       [1.35611397e-03, 9.87422029e-04, 6.21091008e-03, 3.65116469e-02,
        7.68839045e-03, 6.73726191e-03, 8.78164542e-05, 1.16303784e-01,
        2.20795252e-01, 5.55880270e-01, 4.74411329e-02],
       [1.42416555e-03, 2.80406636e-03, 2.05529932e-02, 1.99778253e-02,
        1.42522987e-02, 1.11293413e-02, 7.81364922e-05, 1.93192576e-01,
        3.35651937e-01, 3.50289056e-01, 5.06476037e-02],
       [1.87226879e-04, 4.74234749e-03, 2.31055421e-02, 3.44084647e-03,
        1.34348927e-02, 2.29828984e-01, 2.26284495e-05, 3.81172140e-03,
        2.68504232e-01, 2.88043526e-02, 4.24117227e-

In [7]:
print("With encoding")
for i in range(len(clf.feature_names_)):
    print(clf.feature_names_[i], clf.feature_importances_[i])

With encoding
latitude 15.058756778895225
longitude 21.90747515620583
month 1.0223803361687755
ym 12.454382471106316
day_number 3.33049118194143
sin_day 5.682708942584788
cos_day 10.748237736184986
1km_overall 0.9471717580081009
2km_overall 0.4419231803316369
3km_overall 1.949848913951407
dbscan_label 0.18788390933626994
dbscan_label_count 0.26862741454057554
city_distance 3.2130232721799357
region 9.020019392413294
subregion 2.0813921912248676
city 2.3037170420478863
is_reg_center 0.9935071452929399
federal_distr 8.388453177585749


### сохранить модель в файл

In [8]:
import pickle

with open('model_clf_cities.pickle', 'wb') as fout:
    pickle.dump(clf, fout, protocol=pickle.HIGHEST_PROTOCOL)