In [9]:
import numpy as np
import gc
import pandas as pd
import xgboost as xgb
from tqdm.notebook import tqdm

In [2]:
# Read the three competition CSVs (training history, rows to predict,
# submission template) and display their shapes as a quick sanity check.
train, test, sub = map(
    pd.read_csv,
    (
        '../Dataset/train.csv',
        '../Dataset/test.csv',
        '../Dataset/sample_submission.csv',
    ),
)
(train.shape, test.shape, sub.shape)

((122265, 7), (25080, 3), (25080, 2))

In [3]:
# Preview the first rows of the training frame to check the column layout.
train.head()


Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


In [5]:
# Preview the first rows of the test frame (the rows that need predictions).
test.head()


Unnamed: 0,row_id,cfips,first_day_of_month
0,1001_2022-11-01,1001,2022-11-01
1,1003_2022-11-01,1003,2022-11-01
2,1005_2022-11-01,1005,2022-11-01
3,1007_2022-11-01,1007,2022-11-01
4,1009_2022-11-01,1009,2022-11-01


In [6]:
# Flag each row's origin, then stack both frames into a single table so that
# later per-county operations see training and test rows together.
train['istest'], test['istest'] = 0, 1
raw = (
    pd.concat([train, test])
    .sort_values(by='row_id')   # row_id is a string, so the order is lexicographic
    .reset_index(drop=True)
)
raw.tail(10)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,istest
147335,9015_2022-09-01,9015,Windham County,Connecticut,2022-09-01,3.567954,3337.0,0
147336,9015_2022-10-01,9015,Windham County,Connecticut,2022-10-01,3.566885,3336.0,0
147337,9015_2022-11-01,9015,,,2022-11-01,,,1
147338,9015_2022-12-01,9015,,,2022-12-01,,,1
147339,9015_2023-01-01,9015,,,2023-01-01,,,1
147340,9015_2023-02-01,9015,,,2023-02-01,,,1
147341,9015_2023-03-01,9015,,,2023-03-01,,,1
147342,9015_2023-04-01,9015,,,2023-04-01,,,1
147343,9015_2023-05-01,9015,,,2023-05-01,,,1
147344,9015_2023-06-01,9015,,,2023-06-01,,,1


In [7]:
# Derive calendar and identifier columns on the combined train+test frame.

# Parse the date strings so the .dt accessors below are available.
raw['first_day_of_month'] = pd.to_datetime(raw["first_day_of_month"])
# Test rows carry no county/state names (they show as missing after the
# concat), so forward-fill them within each cfips from the preceding rows.
raw['county'] = raw.groupby('cfips')['county'].ffill()
raw['state'] = raw.groupby('cfips')['state'].ffill()
raw["year"] = raw["first_day_of_month"].dt.year
raw["month"] = raw["first_day_of_month"].dt.month
# Running row counter inside each (cfips, istest) group; because istest is
# part of the key, the counter restarts at 0 on the test rows of a county.
# NOTE(review): confirm the restart is intended rather than one continuous
# per-county time index.
raw["dcount"] = raw.groupby(['cfips', 'istest'])['row_id'].cumcount()
# Integer codes: one per distinct county+state string, and one per state.
raw['county_i'] = (raw['county'] + raw['state']).factorize()[0]
raw['state_i'] = raw['state'].factorize()[0]
raw.tail(10)


Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,istest,year,month,dcount,county_i,state_i
147335,9015_2022-09-01,9015,Windham County,Connecticut,2022-09-01,3.567954,3337.0,0,2022,9,37,3134,50
147336,9015_2022-10-01,9015,Windham County,Connecticut,2022-10-01,3.566885,3336.0,0,2022,10,38,3134,50
147337,9015_2022-11-01,9015,Windham County,Connecticut,2022-11-01,,,1,2022,11,0,3134,50
147338,9015_2022-12-01,9015,Windham County,Connecticut,2022-12-01,,,1,2022,12,1,3134,50
147339,9015_2023-01-01,9015,Windham County,Connecticut,2023-01-01,,,1,2023,1,2,3134,50
147340,9015_2023-02-01,9015,Windham County,Connecticut,2023-02-01,,,1,2023,2,3,3134,50
147341,9015_2023-03-01,9015,Windham County,Connecticut,2023-03-01,,,1,2023,3,4,3134,50
147342,9015_2023-04-01,9015,Windham County,Connecticut,2023-04-01,,,1,2023,4,5,3134,50
147343,9015_2023-05-01,9015,Windham County,Connecticut,2023-05-01,,,1,2023,5,6,3134,50
147344,9015_2023-06-01,9015,Windham County,Connecticut,2023-06-01,,,1,2023,6,7,3134,50


In [10]:
def build_features(raw, max_lag=35):
    """Append per-county lag features for density and active counts.

    For every lag ``k`` in ``1..max_lag`` two columns are written, in this
    order: ``mbd_lag_k`` (from ``microbusiness_density``) and ``act_lag_k``
    (from ``active``). Each one is the source column shifted ``k`` rows
    within its ``cfips`` group and then back-filled within that same group.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain ``cfips``, ``microbusiness_density`` and ``active``.
        Rows of a county are expected to already be in chronological order,
        because the shift is positional.
    max_lag : int, default 35
        Largest lag to generate. The default reproduces the previously
        hard-coded ``range(1, 36)``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place,
        and re-running overwrites them rather than duplicating them.

    Notes
    -----
    The back-fill replaces the leading NaNs created by the shift with the
    county's earliest available value. As a consequence, the first row of
    each county receives its own current value as every lag.
    """
    # (column prefix, source column) pairs; the order fixes the column order.
    sources = (('mbd', 'microbusiness_density'), ('act', 'active'))

    # Grouping key taken once instead of rebuilding raw.groupby('cfips')
    # four times per lag.
    county = raw['cfips']

    for lag in range(1, max_lag + 1):
        for prefix, column in sources:
            shifted = raw[column].groupby(county).shift(lag)
            raw[f'{prefix}_lag_{lag}'] = shifted.groupby(county).bfill()

    return raw

# Add the lag feature columns to the combined frame.
raw = build_features(raw)
# Presumably here to release temporaries left over from building the lags.
gc.collect()
# Inspect the end of the frame; the wide display elides the middle columns.
raw.tail(20)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,istest,year,month,...,mbd_lag_31,act_lag_31,mbd_lag_32,act_lag_32,mbd_lag_33,act_lag_33,mbd_lag_34,act_lag_34,mbd_lag_35,act_lag_35
147325,9015_2021-11-01,9015,Windham County,Connecticut,2021-11-01,3.560204,3319.0,0,2021,11,...,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147326,9015_2021-12-01,9015,Windham County,Connecticut,2021-12-01,3.530169,3291.0,0,2021,12,...,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147327,9015_2022-01-01,9015,Windham County,Connecticut,2022-01-01,3.524116,3296.0,0,2022,1,...,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147328,9015_2022-02-01,9015,Windham County,Connecticut,2022-02-01,3.49204,3266.0,0,2022,2,...,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147329,9015_2022-03-01,9015,Windham County,Connecticut,2022-03-01,3.519839,3292.0,0,2022,3,...,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147330,9015_2022-04-01,9015,Windham County,Connecticut,2022-04-01,3.538016,3309.0,0,2022,4,...,3.754211,3488.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147331,9015_2022-05-01,9015,Windham County,Connecticut,2022-05-01,3.524116,3296.0,0,2022,5,...,3.74883,3483.0,3.754211,3488.0,3.702548,3440.0,3.702548,3440.0,3.702548,3440.0
147332,9015_2022-06-01,9015,Windham County,Connecticut,2022-06-01,3.521978,3294.0,0,2022,6,...,3.796188,3527.0,3.74883,3483.0,3.754211,3488.0,3.702548,3440.0,3.702548,3440.0
147333,9015_2022-07-01,9015,Windham County,Connecticut,2022-07-01,3.574369,3343.0,0,2022,7,...,3.887675,3612.0,3.796188,3527.0,3.74883,3483.0,3.754211,3488.0,3.702548,3440.0
147334,9015_2022-08-01,9015,Windham County,Connecticut,2022-08-01,3.552985,3323.0,0,2022,8,...,3.93253,3658.0,3.887675,3612.0,3.796188,3527.0,3.74883,3483.0,3.754211,3488.0


In [11]:
# Last 20 rows restricted to the first 20 columns, so the base columns and
# the lowest-order lag features can be read without column elision.
raw.iloc[-20:,:20]


Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,istest,year,month,dcount,county_i,state_i,mbd_lag_1,act_lag_1,mbd_lag_2,act_lag_2,mbd_lag_3,act_lag_3,mbd_lag_4
147325,9015_2021-11-01,9015,Windham County,Connecticut,2021-11-01,3.560204,3319.0,0,2021,11,27,3134,50,3.53875,3299.0,3.541968,3302.0,3.555913,3315.0,3.536605
147326,9015_2021-12-01,9015,Windham County,Connecticut,2021-12-01,3.530169,3291.0,0,2021,12,28,3134,50,3.560204,3319.0,3.53875,3299.0,3.541968,3302.0,3.555913
147327,9015_2022-01-01,9015,Windham County,Connecticut,2022-01-01,3.524116,3296.0,0,2022,1,29,3134,50,3.530169,3291.0,3.560204,3319.0,3.53875,3299.0,3.541968
147328,9015_2022-02-01,9015,Windham County,Connecticut,2022-02-01,3.49204,3266.0,0,2022,2,30,3134,50,3.524116,3296.0,3.530169,3291.0,3.560204,3319.0,3.53875
147329,9015_2022-03-01,9015,Windham County,Connecticut,2022-03-01,3.519839,3292.0,0,2022,3,31,3134,50,3.49204,3266.0,3.524116,3296.0,3.530169,3291.0,3.560204
147330,9015_2022-04-01,9015,Windham County,Connecticut,2022-04-01,3.538016,3309.0,0,2022,4,32,3134,50,3.519839,3292.0,3.49204,3266.0,3.524116,3296.0,3.530169
147331,9015_2022-05-01,9015,Windham County,Connecticut,2022-05-01,3.524116,3296.0,0,2022,5,33,3134,50,3.538016,3309.0,3.519839,3292.0,3.49204,3266.0,3.524116
147332,9015_2022-06-01,9015,Windham County,Connecticut,2022-06-01,3.521978,3294.0,0,2022,6,34,3134,50,3.524116,3296.0,3.538016,3309.0,3.519839,3292.0,3.49204
147333,9015_2022-07-01,9015,Windham County,Connecticut,2022-07-01,3.574369,3343.0,0,2022,7,35,3134,50,3.521978,3294.0,3.524116,3296.0,3.538016,3309.0,3.519839
147334,9015_2022-08-01,9015,Windham County,Connecticut,2022-08-01,3.552985,3323.0,0,2022,8,36,3134,50,3.574369,3343.0,3.521978,3294.0,3.524116,3296.0,3.538016


In [12]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    an element where both values are zero contributes 0 rather than 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, NumPy arrays, pandas Series, ...)
        of matching or broadcastable shape. Values are compared by position:
        pandas index labels are ignored, so two Series with different indexes
        are not label-aligned before subtraction.

    Returns
    -------
    float
        ``100 * mean`` of the per-element terms. Empty input yields ``nan``
        (NumPy's mean of an empty array).
    """
    # Coerce to plain arrays: lists become usable, and Series lose their
    # index so the arithmetic below is positional instead of label-aligned.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # dem is zero exactly when both values are zero; those slots keep the
    # 0.0 they were initialised with instead of being divided.
    terms = np.divide(num, dem, out=np.zeros(num.shape, dtype=float), where=dem != 0)

    return 100 * np.mean(terms)

# Sanity-check smape against all-zero targets: a (0, 0) pair contributes 0
# and a (0, non-zero) pair contributes the maximum of 200 to the mean.
print(
    *(
        smape(np.array([0, 0]), np.array(pred))
        for pred in ([0, 0], [0, 1], [1, 0], [1, 1])
    ),
    sep='\n',
)

0.0
100.0
100.0
200.0
