In [4]:
import gc
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
# Accuracy metrics: smape returns the mean SMAPE (in percent); vsmape returns the per-element SMAPE (in percent).
def smape(y_true, y_pred):
    """Return the mean Symmetric Mean Absolute Percentage Error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Positions where both values are exactly zero contribute 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal length. Both are converted to
        float NumPy arrays and compared positionally, so lists, tuples,
        arrays and pandas Series are all accepted (Series labels are ignored).

    Returns
    -------
    float
        ``100 * mean`` of the per-element SMAPE values.
    """
    # Coerce up front: plain lists do not support `-`, and pandas Series would
    # otherwise be aligned by label while `smap` below is sized positionally.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only divide where the denominator is non-zero; the rest stay at 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

def vsmape(y_true, y_pred):
    """Return the per-element Symmetric Absolute Percentage Error, in percent.

    Element ``i`` of the result is
    ``100 * |y_true[i] - y_pred[i]| / ((|y_true[i]| + |y_pred[i]|) / 2)``.
    Positions where both values are exactly zero yield 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of equal length. Both are converted to
        float NumPy arrays and compared positionally, so lists, tuples,
        arrays and pandas Series are all accepted (Series labels are ignored).

    Returns
    -------
    numpy.ndarray
        One value per input element, with length ``len(y_true)``.
    """
    # Coerce up front: plain lists do not support `-`, and pandas Series would
    # otherwise be aligned by label while `smap` below is sized positionally.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Only divide where the denominator is non-zero; the rest stay at 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * smap

In [4]:
# Load the preprocessed data set; every later cell reads from and writes to `raw`.
raw = pd.read_csv("../data/preprocessing.csv")

# Columns of `raw` that are passed to the model as inputs.
features = [
    'state_i',
    'mbd_lag_1',
    'act_lag_1',
    'mbd_lag_2',
    'act_lag_2',
    'mbd_lag_3',
    'act_lag_3',
    'mbd_rollmea2_1',
    'mbd_rollmea4_1',
    'mbd_rollmea6_1',
    'neighbor_average',
    'pct_bb',
    'pct_college',
    'pct_foreign_born',
    'pct_it_workers',
    'median_hh_inc',
]

# LightGBM training parameters.
# NOTE: the former "min_child_samples": 20 entry was dropped. It is an alias of
# `min_data_in_leaf`, and when both are given LightGBM keeps the primary name
# and ignores the alias, so the effective value was (and still is) 32.
lgb_params = {
    'num_leaves': 63,
    'min_data_in_leaf': 32,
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
    "bagging_seed": 11,
    "metric": 'rmse',
    "lambda_l1": 0.1,
    "verbosity": -1,
}

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Display the loaded frame for a quick visual check of its rows and columns.
raw

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,istest,year,month,...,lasttarget,act_lag_1,mbd_lag_2,act_lag_2,mbd_lag_3,act_lag_3,mbd_rollmea2_1,mbd_rollmea4_1,mbd_rollmea6_1,neighbor_average
0,1001_2020-01-01,1001,Autauga County,Alabama,2020-01-01,2.880232,1242.0,0,2020,1,...,3.313253,,,,,,,,,2214.0
1,1001_2020-02-01,1001,Autauga County,Alabama,2020-02-01,2.909326,1217.0,0,2020,2,...,3.313253,-25.0,,,,,0.010101,0.010101,0.010101,2159.4
2,1001_2020-03-01,1001,Autauga County,Alabama,2020-03-01,2.933231,1227.0,0,2020,3,...,3.313253,10.0,0.010101,-15.0,,,0.018318,0.018318,0.018318,2090.4
3,1001_2020-04-01,1001,Autauga County,Alabama,2020-04-01,3.000167,1255.0,0,2020,4,...,3.313253,28.0,0.008217,38.0,0.010101,13.0,0.031037,0.041138,0.041138,2146.8
4,1001_2020-05-01,1001,Autauga County,Alabama,2020-05-01,3.004948,1257.0,0,2020,5,...,3.313253,2.0,0.022820,30.0,0.008217,40.0,0.024413,0.042731,0.042731,2145.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131638,56045_2023-02-01,56045,Weston County,Wyoming,2023-02-01,,,1,2023,2,...,1.803249,,,,,,,,0.000000,
131639,56045_2023-03-01,56045,Weston County,Wyoming,2023-03-01,,,1,2023,3,...,1.803249,,,,,,,,0.000000,
131640,56045_2023-04-01,56045,Weston County,Wyoming,2023-04-01,,,1,2023,4,...,1.803249,,,,,,,,,
131641,56045_2023-05-01,56045,Weston County,Wyoming,2023-05-01,,,1,2023,5,...,1.803249,,,,,,,,,


In [5]:
# Walk-forward validation.
# For every step TS: fit LightGBM on rows with dcount < TS, early-stop on the
# rows with dcount == TS, turn the predicted `target` into a density forecast
# for dcount == TS + 1, and compare it with the "repeat the last value" baseline.
# NOTE(review): `target` is presumably the relative change of
# microbusiness_density to the next step -- confirm against the preprocessing.
ACT_THR = 1.8   # training needs lastactive > ACT_THR; at or below it the forecast falls back to the last value
ABS_THR = 1.00  # training needs lasttarget > ABS_THR; a last density at or below it also falls back
raw['ypred_last'] = np.nan
raw['ypred'] = np.nan
raw['k'] = 1.
VAL = []
BEST_ROUNDS = []
for TS in range(29, 38):
    print(TS)

    train_indices = (
        (raw.istest == 0)
        & (raw.dcount < TS)
        & (raw.dcount >= 1)
        & (raw.lastactive > ACT_THR)
        & (raw.lasttarget > ABS_THR)
    )
    valid_indices = (raw.istest == 0) & (raw.dcount == TS)

    # The training target is clipped; the validation target is left as is.
    train_data = lgb.Dataset(
        raw.loc[train_indices, features],
        raw.loc[train_indices, 'target'].clip(-0.0043, 0.0045),
    )
    val_data = lgb.Dataset(
        raw.loc[valid_indices, features],
        raw.loc[valid_indices, 'target'],
    )

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
    # `lgb.train` were removed in LightGBM 4.0.
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=20000,
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=300),
            lgb.log_evaluation(period=100),
        ],
    )
    BEST_ROUNDS.append(model.best_iteration)

    # k = (1 + predicted target) * current density, stored on the dcount == TS rows.
    ypred = model.predict(raw.loc[valid_indices, features])
    raw.loc[valid_indices, 'k'] = (ypred + 1) * raw.loc[valid_indices, 'microbusiness_density']

    # Validate
    current_step = raw.dcount == TS
    next_step = raw.dcount == (TS + 1)
    lastval = raw.loc[current_step, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']
    dt = raw.loc[current_step, ['cfips', 'k']].set_index('cfips').to_dict()['k']

    df = raw.loc[next_step, ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)
    df['pred'] = df['cfips'].map(dt)
    df['lastval'] = df['cfips'].map(lastval)

    # Rows excluded from training by the thresholds keep the last observed value.
    low_active = df['lastactive'] <= ACT_THR
    df.loc[low_active, 'pred'] = df.loc[low_active, 'lastval']
    low_density = df['lastval'] <= ABS_THR
    df.loc[low_density, 'pred'] = df.loc[low_density, 'lastval']

    raw.loc[next_step, 'ypred'] = df['pred'].values
    raw.loc[next_step, 'ypred_last'] = df['lastval'].values

    print(f'TS: {TS}')
    print('Last Value SMAPE:', smape(df['microbusiness_density'], df['lastval']) )
    print('LightGBM SMAPE:', smape(df['microbusiness_density'], df['pred']))
    print()


# Overall score across every step that received a forecast (dcount 30..38).
ind = (raw.dcount>=30)&(raw.dcount<=38)
print( 'LightGBM SMAPE:', smape( raw.loc[ind, 'microbusiness_density'],  raw.loc[ind, 'ypred'] ) )
print( 'Last Value SMAPE:', smape( raw.loc[ind, 'microbusiness_density'],  raw.loc[ind, 'ypred_last'] ) )


29




Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 0.00375852	valid_1's rmse: 0.0195105
[200]	training's rmse: 0.00372429	valid_1's rmse: 0.0195236
[300]	training's rmse: 0.00370211	valid_1's rmse: 0.0195317
Early stopping, best iteration is:
[3]	training's rmse: 0.00383504	valid_1's rmse: 0.0194991
TS: 29
Last Value SMAPE: 1.0868726017655663
LightGBM SMAPE: 1.0834595282726056

30
Training until validation scores don't improve for 300 rounds




[100]	training's rmse: 0.00375715	valid_1's rmse: 0.0214841
[200]	training's rmse: 0.00372466	valid_1's rmse: 0.0214761
[300]	training's rmse: 0.00370358	valid_1's rmse: 0.0214684
[400]	training's rmse: 0.00368662	valid_1's rmse: 0.0214622
[500]	training's rmse: 0.00367239	valid_1's rmse: 0.0214585
[600]	training's rmse: 0.00365987	valid_1's rmse: 0.0214554
[700]	training's rmse: 0.00364841	valid_1's rmse: 0.0214535
[800]	training's rmse: 0.00363741	valid_1's rmse: 0.0214522
[900]	training's rmse: 0.00362706	valid_1's rmse: 0.0214511
[1000]	training's rmse: 0.00361705	valid_1's rmse: 0.0214494
[1100]	training's rmse: 0.00360755	valid_1's rmse: 0.0214476
[1200]	training's rmse: 0.00359796	valid_1's rmse: 0.0214468
[1300]	training's rmse: 0.00358877	valid_1's rmse: 0.021446
[1400]	training's rmse: 0.00357987	valid_1's rmse: 0.0214449
[1500]	training's rmse: 0.00357098	valid_1's rmse: 0.0214442
[1600]	training's rmse: 0.00356255	valid_1's rmse: 0.0214437
[1700]	training's rmse: 0.00355418



[100]	training's rmse: 0.00375819	valid_1's rmse: 0.0206066
[200]	training's rmse: 0.00372471	valid_1's rmse: 0.0206001
[300]	training's rmse: 0.00370249	valid_1's rmse: 0.0205961
[400]	training's rmse: 0.0036855	valid_1's rmse: 0.020594
[500]	training's rmse: 0.00367105	valid_1's rmse: 0.020592
[600]	training's rmse: 0.00365848	valid_1's rmse: 0.0205905
[700]	training's rmse: 0.0036467	valid_1's rmse: 0.0205893
[800]	training's rmse: 0.00363594	valid_1's rmse: 0.0205882
[900]	training's rmse: 0.00362539	valid_1's rmse: 0.0205878
[1000]	training's rmse: 0.0036156	valid_1's rmse: 0.0205871
[1100]	training's rmse: 0.00360625	valid_1's rmse: 0.020587
[1200]	training's rmse: 0.00359698	valid_1's rmse: 0.0205866
[1300]	training's rmse: 0.00358802	valid_1's rmse: 0.0205865
[1400]	training's rmse: 0.00357917	valid_1's rmse: 0.0205864
[1500]	training's rmse: 0.00357064	valid_1's rmse: 0.0205865
[1600]	training's rmse: 0.00356215	valid_1's rmse: 0.0205866
Early stopping, best iteration is:
[135



[100]	training's rmse: 0.00375148	valid_1's rmse: 0.0153585
[200]	training's rmse: 0.00371923	valid_1's rmse: 0.015349
[300]	training's rmse: 0.00369759	valid_1's rmse: 0.0153402
[400]	training's rmse: 0.00368079	valid_1's rmse: 0.0153353
[500]	training's rmse: 0.00366668	valid_1's rmse: 0.0153327
[600]	training's rmse: 0.00365433	valid_1's rmse: 0.0153309
[700]	training's rmse: 0.00364291	valid_1's rmse: 0.0153297
[800]	training's rmse: 0.00363221	valid_1's rmse: 0.015328
[900]	training's rmse: 0.0036221	valid_1's rmse: 0.0153273
[1000]	training's rmse: 0.00361256	valid_1's rmse: 0.0153279
[1100]	training's rmse: 0.00360315	valid_1's rmse: 0.015327
[1200]	training's rmse: 0.00359414	valid_1's rmse: 0.0153267
[1300]	training's rmse: 0.0035853	valid_1's rmse: 0.0153263
[1400]	training's rmse: 0.00357678	valid_1's rmse: 0.0153254
[1500]	training's rmse: 0.00356829	valid_1's rmse: 0.015325
[1600]	training's rmse: 0.00356014	valid_1's rmse: 0.0153254
[1700]	training's rmse: 0.00355221	vali



[100]	training's rmse: 0.00375145	valid_1's rmse: 0.0240109
[200]	training's rmse: 0.00371843	valid_1's rmse: 0.0240229
[300]	training's rmse: 0.0036963	valid_1's rmse: 0.0240222
Early stopping, best iteration is:
[1]	training's rmse: 0.00382496	valid_1's rmse: 0.0239706
TS: 33
Last Value SMAPE: 1.3686285670946152
LightGBM SMAPE: 1.3618169719339643

34
Training until validation scores don't improve for 300 rounds




[100]	training's rmse: 0.00375354	valid_1's rmse: 0.0308983
[200]	training's rmse: 0.00372141	valid_1's rmse: 0.0308582
[300]	training's rmse: 0.00369962	valid_1's rmse: 0.0308288
[400]	training's rmse: 0.00368305	valid_1's rmse: 0.0308088
[500]	training's rmse: 0.00366932	valid_1's rmse: 0.0307951
[600]	training's rmse: 0.00365745	valid_1's rmse: 0.0307853
[700]	training's rmse: 0.00364648	valid_1's rmse: 0.0307779
[800]	training's rmse: 0.00363641	valid_1's rmse: 0.0307735
[900]	training's rmse: 0.00362686	valid_1's rmse: 0.0307696
[1000]	training's rmse: 0.0036176	valid_1's rmse: 0.0307665
[1100]	training's rmse: 0.00360883	valid_1's rmse: 0.0307645
[1200]	training's rmse: 0.0036001	valid_1's rmse: 0.0307618
[1300]	training's rmse: 0.00359171	valid_1's rmse: 0.0307595
[1400]	training's rmse: 0.00358359	valid_1's rmse: 0.0307572
[1500]	training's rmse: 0.00357552	valid_1's rmse: 0.0307552
[1600]	training's rmse: 0.00356752	valid_1's rmse: 0.0307537
[1700]	training's rmse: 0.00355995	



[100]	training's rmse: 0.003755	valid_1's rmse: 0.0212158
[200]	training's rmse: 0.003721	valid_1's rmse: 0.0212345
[300]	training's rmse: 0.00369831	valid_1's rmse: 0.0212427
Early stopping, best iteration is:
[1]	training's rmse: 0.00383021	valid_1's rmse: 0.0211796
TS: 35
Last Value SMAPE: 1.2797936949214384
LightGBM SMAPE: 1.2887352565122097

36
Training until validation scores don't improve for 300 rounds




[100]	training's rmse: 0.00376258	valid_1's rmse: 0.018404
[200]	training's rmse: 0.00372933	valid_1's rmse: 0.0184176
[300]	training's rmse: 0.00370741	valid_1's rmse: 0.0184273
Early stopping, best iteration is:
[1]	training's rmse: 0.00383475	valid_1's rmse: 0.0183886
TS: 36
Last Value SMAPE: 1.034314865882525
LightGBM SMAPE: 1.0271997884072392

37
Training until validation scores don't improve for 300 rounds




[100]	training's rmse: 0.00375872	valid_1's rmse: 0.140753
[200]	training's rmse: 0.00372691	valid_1's rmse: 0.140751
[300]	training's rmse: 0.00370602	valid_1's rmse: 0.140749
[400]	training's rmse: 0.0036902	valid_1's rmse: 0.140748
[500]	training's rmse: 0.00367672	valid_1's rmse: 0.140747
[600]	training's rmse: 0.00366488	valid_1's rmse: 0.140747
[700]	training's rmse: 0.00365432	valid_1's rmse: 0.140746
[800]	training's rmse: 0.00364449	valid_1's rmse: 0.140746
[900]	training's rmse: 0.00363504	valid_1's rmse: 0.140745
[1000]	training's rmse: 0.00362603	valid_1's rmse: 0.140745
[1100]	training's rmse: 0.00361759	valid_1's rmse: 0.140745
[1200]	training's rmse: 0.0036094	valid_1's rmse: 0.140744
[1300]	training's rmse: 0.00360147	valid_1's rmse: 0.140744
[1400]	training's rmse: 0.00359356	valid_1's rmse: 0.140743
[1500]	training's rmse: 0.00358592	valid_1's rmse: 0.140744
[1600]	training's rmse: 0.00357839	valid_1's rmse: 0.140743
[1700]	training's rmse: 0.00357114	valid_1's rmse: 

In [6]:
# Per-row SMAPE (in percent) of the model forecast (`ypred`) and of the
# last-value baseline (`ypred_last`), followed by a look at the dcount == 30 rows.
raw['error'] = vsmape(raw['microbusiness_density'], raw['ypred'])
raw['error_last'] = vsmape(raw['microbusiness_density'], raw['ypred_last'])
raw.loc[(raw.dcount==30), ['microbusiness_density', 'ypred', 'error', 'error_last'] ]

Unnamed: 0,microbusiness_density,ypred,error,error_last
30,3.334431,3.297477,1.114430,1.135557
77,7.823300,7.735059,1.134324,1.155810
124,1.206827,1.186877,1.666887,1.687769
171,1.236650,1.214426,1.813369,1.834867
218,1.777708,1.753268,1.384323,1.403959
...,...,...,...,...
147140,2.892446,2.926768,1.179620,1.179620
147187,25.438322,25.537658,0.389736,0.368550
147234,3.954258,3.755186,5.164358,5.183206
147281,3.027295,3.027295,0.000000,0.000000
