In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned training frame produced by the previous notebook.
# NOTE(review): read_pickle executes arbitrary code on load; only use it on
# files generated by this project.
df = pd.read_pickle("cleaned_train_dataset.pkl")

# Sanity check: peek at the first three rows and report the dimensions.
display(df.iloc[:3])
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wcovid_vaccinated,smoothed_wvaccine_likely_friends,smoothed_wrestaurant_1d,smoothed_wvaccine_likely_politicians,smoothed_wvaccine_likely_who,smoothed_wwearing_mask,smoothed_wlarge_event_1d,State,County Name,day_of_week
2,2021-01-09,10001,30.564677,31.67157,3.529032,62.536156,61.541969,26.712798,48.027841,36.15132,...,5.692845,29.742181,14.161936,12.41852,31.979435,91.884639,8.088518,Delaware,Kent,Saturday
3,2021-01-10,10001,28.771539,30.455099,3.38676,63.258488,59.657602,28.732035,48.997453,36.207953,...,5.72106,31.253956,12.912095,12.66799,32.232582,92.625535,7.548629,Delaware,Kent,Sunday
4,2021-01-11,10001,27.648192,32.652473,2.836477,64.020938,62.971697,28.824431,49.233749,34.277144,...,5.08141,34.085365,12.258112,14.202718,32.396997,94.077965,7.615596,Delaware,Kent,Monday


Shape:  (9349, 22)


In [3]:
# Missing-value count per column (isna is the canonical spelling of isnull).
df.isna().sum()

time_value                              0
geo_value                               0
smoothed_wspent_time_1d                 0
smoothed_wtested_14d                    0
smoothed_wpublic_transit_1d             0
smoothed_wcovid_vaccinated_or_accept    0
smoothed_wworried_become_ill            0
smoothed_wvaccine_likely_govt_health    0
smoothed_wshop_1d                       0
smoothed_wwork_outside_home_1d          0
smoothed_wothers_masked                 0
smoothed_wcli                           0
smoothed_wcovid_vaccinated              0
smoothed_wvaccine_likely_friends        0
smoothed_wrestaurant_1d                 0
smoothed_wvaccine_likely_politicians    0
smoothed_wvaccine_likely_who            0
smoothed_wwearing_mask                  0
smoothed_wlarge_event_1d                0
State                                   0
County Name                             0
day_of_week                             0
dtype: int64

In [4]:
# Columns that must NOT receive lag / rolling transforms: identifiers,
# categorical descriptors and the 14-day aggregated test signals.
non_transform_columns = {
    'time_value',
    'geo_value',
    'State',
    'County Name',
    'day_of_week',
    'smoothed_wtested_14d',
    'smoothed_wtested_positive_14d',
}

# Everything else, in original column order, is a candidate for transformation.
list_transform_features = [
    column for column in df.columns if column not in non_transform_columns
]
list_transform_features

['smoothed_wspent_time_1d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d']

##  Entity-Level Dynamic
Entity-level dynamics describe how past values of a variable for the same entity affect its current value, i.e. the dynamic behavior within an individual entity over time.

Is it OK if each geo_value has only 14 records?
- 7,000+ rows = enough to train LightGBM and avoid overfitting.
- Lag features are crucial for time-aware behavior (e.g., "cases rise 7 days after mobility increases").
- Losing a few days of data per geo is acceptable if lag features significantly improve model performance.

In [5]:
# Lag features for every column in list_transform_features (per county).
# Assumption: a county's recent level of a signal (e.g. vaccination) is
# informative about its current level.
# <col>_lag_3: value of <col> three rows earlier within the same geo_value
#   (short-run effect). Only the 3-day lag is generated here; the 7-day
#   (weekly) variant mentioned in earlier notes is not created by this cell.
# smoothed_wtested_14d is not lagged because it already covers the last 14 days.
# smoothed_wtested_positive_14d is not lagged because it already covers the last 14 days.

# shift() works on row position, not on dates, so make the chronological order
# within each county explicit instead of relying on how the pickle was written.
df = df.sort_values(['geo_value', 'time_value'])

# NOTE(review): if a county has missing dates, "3 rows back" is more than
# 3 calendar days back — confirm the series are gap-free or reindex by date.
for a_col in list_transform_features:
    df[f'{a_col}_lag_3'] = df.groupby('geo_value')[a_col].shift(3)

In [6]:
# Spot-check one lagged feature next to its source column.
lag_preview_columns = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_lag_3',
]
display(df.loc[:, lag_preview_columns].head(43))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_lag_3
2,2021-01-09,10001,5.692845,
3,2021-01-10,10001,5.72106,
4,2021-01-11,10001,5.08141,
5,2021-01-12,10001,4.789884,5.692845
6,2021-01-13,10001,5.048679,5.72106
7,2021-01-14,10001,5.813447,5.08141
8,2021-01-15,10001,5.937796,4.789884
9,2021-01-16,10001,6.138341,5.048679
10,2021-01-17,10001,7.787255,5.813447
11,2021-01-18,10001,13.644425,5.937796


Shape:  (9349, 38)


In [7]:
# Window lengths (in rows) for which rolling means are built.
list_select_period = [3]

# One rolling-mean Series per (feature, window) pair, computed within each
# geo_value. The group level added by groupby().rolling() is dropped so the
# result is indexed like df again.
rolling_features = {
    f'{feature}_rolling_mean_{window}': (
        df.groupby(['geo_value'])[feature]
        .rolling(window=window)
        .mean()
        .reset_index(level=0, drop=True)
    )
    for feature in list_transform_features
    for window in list_select_period
}

# Assemble all new columns at once and append them to the main frame.
df_rolling = pd.DataFrame(rolling_features)
df = pd.concat([df, df_rolling], axis=1)

In [8]:
# Spot-check one rolling-mean feature next to its source column.
rolling_preview_columns = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
]
display(df.loc[:, rolling_preview_columns].head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
2,2021-01-09,10001,5.692845,
3,2021-01-10,10001,5.72106,
4,2021-01-11,10001,5.08141,5.498438
5,2021-01-12,10001,4.789884,5.197451
6,2021-01-13,10001,5.048679,4.973324
7,2021-01-14,10001,5.813447,5.217337
8,2021-01-15,10001,5.937796,5.599974
9,2021-01-16,10001,6.138341,5.963195
10,2021-01-17,10001,7.787255,6.621131
11,2021-01-18,10001,13.644425,9.190007


Shape:  (9349, 54)


In [9]:
# Push every rolling mean one row down within its county so the feature on a
# given row is built only from earlier rows.
# NOTE: this overwrites the columns in place — running the cell twice shifts twice.
for window in list_select_period:
    for feature in list_transform_features:
        rolling_col = f'{feature}_rolling_mean_{window}'
        df[rolling_col] = df.groupby('geo_value')[rolling_col].shift(1)

In [10]:
# Re-inspect the rolling mean after it has been shifted by one row.
shifted_preview_columns = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
]
display(df.loc[:, shifted_preview_columns].head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
2,2021-01-09,10001,5.692845,
3,2021-01-10,10001,5.72106,
4,2021-01-11,10001,5.08141,
5,2021-01-12,10001,4.789884,5.498438
6,2021-01-13,10001,5.048679,5.197451
7,2021-01-14,10001,5.813447,4.973324
8,2021-01-15,10001,5.937796,5.217337
9,2021-01-16,10001,6.138341,5.599974
10,2021-01-17,10001,7.787255,5.963195
11,2021-01-18,10001,13.644425,6.621131


Shape:  (9349, 54)


In [11]:
# Missing values per column after adding lag / rolling features.
df.isna().sum()

time_value                                                0
geo_value                                                 0
smoothed_wspent_time_1d                                   0
smoothed_wtested_14d                                      0
smoothed_wpublic_transit_1d                               0
smoothed_wcovid_vaccinated_or_accept                      0
smoothed_wworried_become_ill                              0
smoothed_wvaccine_likely_govt_health                      0
smoothed_wshop_1d                                         0
smoothed_wwork_outside_home_1d                            0
smoothed_wothers_masked                                   0
smoothed_wcli                                             0
smoothed_wcovid_vaccinated                                0
smoothed_wvaccine_likely_friends                          0
smoothed_wrestaurant_1d                                   0
smoothed_wvaccine_likely_politicians                      0
smoothed_wvaccine_likely_who            

In [12]:
# Remove every row that still contains a missing value, then renumber the
# rows from zero so the index is contiguous again.
df = df.dropna()
df = df.reset_index(drop=True)

display(df.head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwork_outside_home_1d_rolling_mean_3,smoothed_wothers_masked_rolling_mean_3,smoothed_wcli_rolling_mean_3,smoothed_wcovid_vaccinated_rolling_mean_3,smoothed_wvaccine_likely_friends_rolling_mean_3,smoothed_wrestaurant_1d_rolling_mean_3,smoothed_wvaccine_likely_politicians_rolling_mean_3,smoothed_wvaccine_likely_who_rolling_mean_3,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3
0,2021-01-12,10001,27.626437,29.69581,2.245193,61.46046,59.611104,28.867446,45.176114,37.305956,...,35.545472,94.328659,1.391942,5.498438,31.693834,13.110715,13.09641,32.203005,92.862713,7.750914
1,2021-01-13,10001,29.008665,26.041461,2.35536,64.432266,56.863993,29.842478,45.55787,39.091765,...,35.930351,94.946933,1.221664,5.197451,32.464412,12.015379,13.925718,32.280096,94.570246,6.675872
2,2021-01-14,10001,29.180034,26.490986,1.905282,68.381496,64.845904,34.533212,47.707745,37.870228,...,36.891622,95.130651,1.198439,4.973324,33.678367,11.211267,15.048809,31.694703,95.905057,5.862543
3,2021-01-15,10001,26.513712,23.276687,0.759386,66.774871,66.419856,36.235658,48.965335,36.056104,...,38.089316,95.904936,1.111375,5.217337,34.525848,9.834074,15.988707,31.925641,97.110419,4.354745
4,2021-01-16,10001,27.418453,22.906015,0.788315,68.348608,64.755157,34.039471,44.205572,29.356264,...,37.672699,96.03658,1.246195,5.599974,35.986088,9.117032,16.267727,32.05251,97.528467,3.806274
5,2021-01-17,10001,32.24711,21.815456,0.759912,67.651157,66.835197,33.103631,44.885952,31.238444,...,34.427532,95.609006,1.314425,5.963195,36.097834,8.270968,14.910377,31.642311,98.0244,3.257064
6,2021-01-18,10001,33.856236,18.614158,0.719359,69.65192,64.043703,33.968587,47.532909,34.266392,...,32.216937,94.598677,1.627025,6.621131,35.73104,9.071278,13.08543,29.600714,97.911018,3.856685
7,2021-01-19,10001,30.792506,19.385734,0.739362,71.540206,71.38857,32.834977,52.183012,30.707266,...,31.620367,93.937466,1.832079,9.190007,36.069901,9.807243,11.071319,28.151385,97.264389,3.903802
8,2021-01-20,10001,29.503575,20.220984,0.970777,71.677764,68.349596,30.958924,51.19951,29.811271,...,32.070701,94.286434,2.029164,12.65562,36.592055,11.123981,9.219451,27.4804,96.795294,4.003178
9,2021-01-21,10001,30.498854,18.669842,1.192087,72.025205,67.445976,27.952021,51.215558,31.277973,...,31.594976,94.677102,2.083598,16.837775,36.459593,11.484834,7.251286,27.575348,96.241531,3.713578


Shape:  (8038, 54)


In [13]:
# Number of distinct dates remaining for each county.
df.groupby('geo_value')[['time_value']].nunique()

Unnamed: 0_level_0,time_value
geo_value,Unnamed: 1_level_1
10001,18
10003,20
10005,19
11001,19
12001,19
...,...
55087,18
55105,19
55133,18
55139,18


# Interaction

In [14]:
# Names of the first 20 columns, as a plain Python list.
df.columns[:20].tolist()

['time_value',
 'geo_value',
 'smoothed_wspent_time_1d',
 'smoothed_wtested_14d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d',
 'State']

In [15]:
# Pairwise interaction terms. Each new column is named "<left>_x_<right>" and
# holds left * right / 100 (the inputs are percentages, so dividing by 100
# keeps the product on a percentage-like scale).
interaction_pairs = [
    ('smoothed_wworried_become_ill', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wworried_become_ill', 'smoothed_wspent_time_1d'),
    ('smoothed_wcli_lag_3', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wothers_masked', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wtested_14d', 'smoothed_wvaccine_likely_govt_health'),
    ('smoothed_wcli', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wcovid_vaccinated_or_accept', 'smoothed_wvaccine_likely_friends'),
]

for left_col, right_col in interaction_pairs:
    df[f'{left_col}_x_{right_col}'] = df[left_col] * df[right_col] / 100

In [16]:
# Summary statistics, one row per column (first 50 columns shown).
df.describe().T.head(50)

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_value,8038.0,2021-01-20 06:03:40.353321728,2021-01-10 00:00:00,2021-01-16 00:00:00,2021-01-20 00:00:00,2021-01-25 00:00:00,2021-01-29 00:00:00,
smoothed_wspent_time_1d,8038.0,30.243075,14.410554,26.873611,30.119953,33.567895,50.074606,5.103771
smoothed_wtested_14d,8038.0,13.50212,3.055736,10.254036,12.774248,15.915495,36.044534,4.689877
smoothed_wpublic_transit_1d,8038.0,2.907582,0.095057,1.602539,2.382377,3.365666,30.806952,2.845063
smoothed_wcovid_vaccinated_or_accept,8038.0,75.300769,53.251975,70.395629,75.706581,80.346372,96.517667,7.288931
smoothed_wworried_become_ill,8038.0,69.079999,45.306264,65.826249,69.337391,72.635411,93.900646,5.322295
smoothed_wvaccine_likely_govt_health,8038.0,31.698358,13.090878,27.47782,31.512647,35.748352,54.544048,6.341361
smoothed_wshop_1d,8038.0,51.466333,35.749073,48.58216,51.215127,54.134505,66.970956,4.277135
smoothed_wwork_outside_home_1d,8038.0,32.671938,14.582339,29.210722,32.41301,35.934703,53.549109,5.209661
smoothed_wothers_masked,8038.0,82.666207,23.828027,77.636939,86.423526,91.2487,98.920178,11.790088


## Time Trend
A time trend is a systematic change over time that affects all entities in the same way; it captures the common evolution of the data over time.

In [17]:
# Day of week
# One 0/1 indicator column per weekday label found in df['day_of_week'].
day_of_week_dummies = pd.get_dummies(df['day_of_week'], prefix='week_no').astype(int)

# Weekend flag, derived from the raw labels rather than by adding the
# Saturday and Sunday dummy columns: the sum raises KeyError whenever one of
# those days is absent from the data (e.g. a short date window), whereas isin()
# simply yields 0 for every row in that case.
day_of_week_dummies['dummy_weekend'] = (
    df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
)
day_of_week_dummies

Unnamed: 0,week_no_Monday,week_no_Tuesday,week_no_Wednesday,week_no_Thursday,week_no_Friday,week_no_Saturday,week_no_Sunday,dummy_weekend
0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
8033,1,0,0,0,0,0,0,0
8034,0,1,0,0,0,0,0,0
8035,0,0,1,0,0,0,0,0
8036,0,0,0,1,0,0,0,0


In [18]:
# Only the weekend flag is kept as a model feature; the individual weekday
# indicators are not appended.
weekend_flag = day_of_week_dummies['dummy_weekend']
df = pd.concat([df, weekend_flag], axis=1)

display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3,smoothed_wworried_become_ill_x_smoothed_wwork_outside_home_1d,smoothed_wworried_become_ill_x_smoothed_wspent_time_1d,smoothed_wcli_lag_3_x_smoothed_wwork_outside_home_1d,smoothed_wothers_masked_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wtested_14d_x_smoothed_wvaccine_likely_govt_health,smoothed_wcli_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wcovid_vaccinated_or_accept_x_smoothed_wvaccine_likely_friends,dummy_weekend
0,2021-01-12,10001,27.626437,29.69581,2.245193,61.46046,59.611104,28.867446,45.176114,37.305956,...,92.862713,7.750914,22.238492,16.468424,0.606301,58.900212,8.572422,0.684904,19.700484,0
1,2021-01-13,10001,29.008665,26.041461,2.35536,64.432266,56.863993,29.842478,45.55787,39.091765,...,94.570246,6.675872,22.229138,16.495485,0.540161,61.461767,7.771417,0.845417,22.484168,0
2,2021-01-14,10001,29.180034,26.490986,1.905282,68.381496,64.845904,34.533212,47.707745,37.870228,...,95.905057,5.862543,24.557292,18.922057,0.44264,65.981823,9.148188,0.620658,25.046643,0
3,2021-01-15,10001,26.513712,23.276687,0.759386,66.774871,66.419856,36.235658,48.965335,36.056104,...,97.110419,4.354745,23.948412,17.610369,0.401803,64.256957,8.43446,1.014205,24.32918,0
4,2021-01-16,10001,27.418453,22.906015,0.788315,68.348608,64.755157,34.039471,44.205572,29.356264,...,97.528467,3.806274,19.009695,17.754862,0.385184,64.320834,7.797086,1.036708,24.079938,1


Shape:  (8038, 62)


## Fixed Effects
Fixed effects control for entity-specific characteristics that don’t change over time but may influence the dependent variable.

In [19]:
# State fixed effects
# One 0/1 indicator per state; the Hawaii column is dropped so that Hawaii
# acts as the reference category.
stage_dummies = (
    pd.get_dummies(df['State'], prefix='dummy')
    .drop(columns=['dummy_Hawaii'])
    .astype(int)
)

df = pd.concat([df, stage_dummies], axis=1)

display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_South Dakota,dummy_Tennessee,dummy_Texas,dummy_Utah,dummy_Vermont,dummy_Virginia,dummy_Washington,dummy_West Virginia,dummy_Wisconsin,dummy_Wyoming
0,2021-01-12,10001,27.626437,29.69581,2.245193,61.46046,59.611104,28.867446,45.176114,37.305956,...,0,0,0,0,0,0,0,0,0,0
1,2021-01-13,10001,29.008665,26.041461,2.35536,64.432266,56.863993,29.842478,45.55787,39.091765,...,0,0,0,0,0,0,0,0,0,0
2,2021-01-14,10001,29.180034,26.490986,1.905282,68.381496,64.845904,34.533212,47.707745,37.870228,...,0,0,0,0,0,0,0,0,0,0
3,2021-01-15,10001,26.513712,23.276687,0.759386,66.774871,66.419856,36.235658,48.965335,36.056104,...,0,0,0,0,0,0,0,0,0,0
4,2021-01-16,10001,27.418453,22.906015,0.788315,68.348608,64.755157,34.039471,44.205572,29.356264,...,0,0,0,0,0,0,0,0,0,0


Shape:  (8038, 105)


In [20]:
# Normalise column names: trim surrounding whitespace, replace spaces with
# underscores, and lower-case everything (e.g. "County Name" -> "county_name").
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_')
    .str.lower()
)

display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_south_dakota,dummy_tennessee,dummy_texas,dummy_utah,dummy_vermont,dummy_virginia,dummy_washington,dummy_west_virginia,dummy_wisconsin,dummy_wyoming
0,2021-01-12,10001,27.626437,29.69581,2.245193,61.46046,59.611104,28.867446,45.176114,37.305956,...,0,0,0,0,0,0,0,0,0,0
1,2021-01-13,10001,29.008665,26.041461,2.35536,64.432266,56.863993,29.842478,45.55787,39.091765,...,0,0,0,0,0,0,0,0,0,0
2,2021-01-14,10001,29.180034,26.490986,1.905282,68.381496,64.845904,34.533212,47.707745,37.870228,...,0,0,0,0,0,0,0,0,0,0
3,2021-01-15,10001,26.513712,23.276687,0.759386,66.774871,66.419856,36.235658,48.965335,36.056104,...,0,0,0,0,0,0,0,0,0,0
4,2021-01-16,10001,27.418453,22.906015,0.788315,68.348608,64.755157,34.039471,44.205572,29.356264,...,0,0,0,0,0,0,0,0,0,0


Shape:  (8038, 105)


In [21]:
# Final check: list any column that still has missing values
# (an empty table means the frame is complete).
df_null = df.isnull().sum().to_frame('NULL')
df_null.loc[df_null['NULL'] > 0]

Unnamed: 0,NULL


In [22]:
# Persist the engineered feature set for the modelling notebook.
df.to_pickle(path="feature_engineering_train_dataset.pkl")