In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned test dataset (presumably written by an earlier cleaning step).
# NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself.
df = pd.read_pickle("cleaned_test_dataset.pkl")

# Sanity check: the first rows and the overall dimensions.
preview_rows = df.head(3)
display(preview_rows)
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wcovid_vaccinated,smoothed_wvaccine_likely_friends,smoothed_wrestaurant_1d,smoothed_wvaccine_likely_politicians,smoothed_wvaccine_likely_who,smoothed_wwearing_mask,smoothed_wlarge_event_1d,State,County Name,day_of_week
0,2021-02-06,10001,32.699199,19.187633,8.791025,68.827417,65.663218,26.680991,65.330731,48.18301,...,20.02608,37.648337,27.671691,4.305266,30.862734,94.674122,7.642234,Delaware,Kent,Saturday
1,2021-02-07,10001,36.819275,15.249906,8.246346,68.527244,64.95592,26.667758,62.730589,46.288597,...,23.522186,37.956024,27.388525,5.361903,30.822502,94.978346,6.125545,Delaware,Kent,Sunday
2,2021-02-08,10001,37.701923,16.060492,7.767191,68.869323,63.241518,27.913755,63.037248,46.398122,...,22.507475,37.308655,27.576218,4.921152,30.08284,95.138547,5.769898,Delaware,Kent,Monday


Shape:  (2750, 22)


In [3]:
# Missing values per column, before any feature is derived.
df.isna().sum()

time_value                              0
geo_value                               0
smoothed_wspent_time_1d                 0
smoothed_wtested_14d                    0
smoothed_wpublic_transit_1d             0
smoothed_wcovid_vaccinated_or_accept    0
smoothed_wworried_become_ill            0
smoothed_wvaccine_likely_govt_health    0
smoothed_wshop_1d                       0
smoothed_wwork_outside_home_1d          0
smoothed_wothers_masked                 0
smoothed_wcli                           0
smoothed_wcovid_vaccinated              0
smoothed_wvaccine_likely_friends        0
smoothed_wrestaurant_1d                 0
smoothed_wvaccine_likely_politicians    0
smoothed_wvaccine_likely_who            0
smoothed_wwearing_mask                  0
smoothed_wlarge_event_1d                0
State                                   0
County Name                             0
day_of_week                             0
dtype: int64

In [4]:
# Columns that are kept out of the transform list: identifiers, calendar/location
# labels and the two 14-day-window signals.
non_transform_columns = (
    'time_value',
    'geo_value',
    'State',
    'County Name',
    'day_of_week',
    'smoothed_wtested_14d',
    'smoothed_wtested_positive_14d',
)

# Every other column, in its original order.
list_transform_features = [
    column for column in df.columns if column not in non_transform_columns
]
list_transform_features

['smoothed_wspent_time_1d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d']

##  Entity-Level Dynamic
Entity-level dynamics describe how past values of a variable for the same entity affect its current value, i.e. the dynamic behavior within an individual entity over time.

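Is it OK if each geo_value has only 14 records?
- 7,000+ rows are enough to train LightGBM and avoid overfitting.
- Lag features are crucial for time-aware behavior (e.g., "cases rise 7 days after mobility increases").
- Losing a few days of data per geo_value is acceptable if lag features significantly improve model performance.
- Note: the figures above do not match this file (2,750 rows are loaded, and the preview shows 7 daily rows for the first geo_value); they appear to be carried over from another dataset and should be verified.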

In [5]:
# Lagged features, e.g. smoothed_wcovid_vaccinated_lag_3.
# Assumption: a county's recent level of a signal is informative about its current level.
# <col>_lag_3 holds the value of <col> three rows earlier within the same geo_value
# (three days earlier, assuming one row per geo_value per day -- TODO confirm).
# Only this 3-step lag is created here; no 7-day (weekly) lag is built in this cell.
# smoothed_wtested_14d and smoothed_wtested_positive_14d are not lagged because they
# already cover the last 14 days (both are excluded from list_transform_features).
LAG_DAYS = 3

# shift() works on row position, so each geo_value must be in chronological order.
# Sorting a copy keeps the original index labels on the shifted values, so the
# assignment below aligns them back onto df without changing df's row order.
# (This relies on df having a unique index.)
df_time_ordered = df.sort_values(['geo_value', 'time_value'])
df_lagged = df_time_ordered.groupby('geo_value')[list_transform_features].shift(LAG_DAYS)

for a_col in list_transform_features:
    df['%s_lag_%d' % (a_col, LAG_DAYS)] = df_lagged[a_col]

In [6]:
# Spot-check one lagged signal next to its source column.
lag_preview_columns = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_lag_3',
]
display(df.loc[:, lag_preview_columns].head(43))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_lag_3
0,2021-02-06,10001,20.02608,
1,2021-02-07,10001,23.522186,
2,2021-02-08,10001,22.507475,
3,2021-02-09,10001,23.062367,20.02608
4,2021-02-10,10001,24.843072,23.522186
5,2021-02-11,10001,22.970572,22.507475
6,2021-02-12,10001,24.430396,23.062367
7,2021-02-06,10003,19.640517,
8,2021-02-07,10003,20.368653,
9,2021-02-08,10003,21.293021,


Shape:  (2750, 38)


In [7]:
# Rolling means per geo_value, stored as <col>_rolling_mean_<window>.
# At this point the window still includes the current row.
rolling_features = {}
list_select_period = [3]  # rolling window sizes, in rows

# rolling() works on row position, so put each geo_value in chronological order first.
df_time_ordered = df.sort_values(['geo_value', 'time_value'])

for a_col in list_transform_features:
    for a_lag in list_select_period:
        # groupby().rolling() prepends geo_value as an extra index level; dropping
        # it restores the original row labels so the result can be aligned with df.
        roll_mean = (
            df_time_ordered.groupby(['geo_value'])[a_col]
            .rolling(window=a_lag)
            .mean()
            .reset_index(level=0, drop=True)
        )
        mean_col_name = f'{a_col}_rolling_mean_{a_lag}'
        rolling_features[mean_col_name] = roll_mean

# Put the new columns in df's row order, and replace any same-named columns left
# over from a previous run so re-executing this cell cannot create duplicates.
df_rolling = pd.DataFrame(rolling_features).reindex(df.index)
df = pd.concat([df.drop(columns=df_rolling.columns, errors='ignore'), df_rolling], axis=1)

In [8]:
# Spot-check one rolling mean next to its source column.
rolling_preview_columns = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
]
display(df.loc[:, rolling_preview_columns].head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
0,2021-02-06,10001,20.02608,
1,2021-02-07,10001,23.522186,
2,2021-02-08,10001,22.507475,22.01858
3,2021-02-09,10001,23.062367,23.030676
4,2021-02-10,10001,24.843072,23.470971
5,2021-02-11,10001,22.970572,23.625337
6,2021-02-12,10001,24.430396,24.081347
7,2021-02-06,10003,19.640517,
8,2021-02-07,10003,20.368653,
9,2021-02-08,10003,21.293021,20.434064


Shape:  (2750, 54)


In [9]:
# Make every rolling mean use only *previous* rows of the same geo_value, so a
# row's feature never contains that row's own value.
#
# The feature is recomputed from the raw column (shift by one row, then roll)
# instead of shifting the existing rolling column in place. Both give the same
# values, but an in-place shift moves the feature one more row every time the
# cell is re-run, whereas this version always produces the same result.
df_time_ordered = df.sort_values(['geo_value', 'time_value'])

for a_col in list_transform_features:
    # Value of the previous row within each geo_value (NaN on each first row).
    prior_values = df_time_ordered.groupby('geo_value')[a_col].shift(1)
    for a_lag in list_select_period:
        # Dropping the geo_value level added by groupby().rolling() restores the
        # original row labels, so the assignment aligns with df by index.
        df[f'{a_col}_rolling_mean_{a_lag}'] = (
            prior_values.groupby(df_time_ordered['geo_value'])
            .rolling(window=a_lag)
            .mean()
            .reset_index(level=0, drop=True)
        )

In [10]:
# Same spot-check as before, now that the rolling mean has been moved down one row.
shifted_preview = df.filter(items=[
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
])
display(shifted_preview.head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
0,2021-02-06,10001,20.02608,
1,2021-02-07,10001,23.522186,
2,2021-02-08,10001,22.507475,
3,2021-02-09,10001,23.062367,22.01858
4,2021-02-10,10001,24.843072,23.030676
5,2021-02-11,10001,22.970572,23.470971
6,2021-02-12,10001,24.430396,23.625337
7,2021-02-06,10003,19.640517,
8,2021-02-07,10003,20.368653,
9,2021-02-08,10003,21.293021,


Shape:  (2750, 54)


In [11]:
# Missing values per column after the lag and rolling features were added.
df.isna().sum()

time_value                                                0
geo_value                                                 0
smoothed_wspent_time_1d                                   0
smoothed_wtested_14d                                      0
smoothed_wpublic_transit_1d                               0
smoothed_wcovid_vaccinated_or_accept                      0
smoothed_wworried_become_ill                              0
smoothed_wvaccine_likely_govt_health                      0
smoothed_wshop_1d                                         0
smoothed_wwork_outside_home_1d                            0
smoothed_wothers_masked                                   0
smoothed_wcli                                             0
smoothed_wcovid_vaccinated                                0
smoothed_wvaccine_likely_friends                          0
smoothed_wrestaurant_1d                                   0
smoothed_wvaccine_likely_politicians                      0
smoothed_wvaccine_likely_who            

In [12]:
# Remove every row that contains at least one NaN (this includes the leading rows
# of each geo_value, where the lag / rolling features are undefined), then
# renumber the rows from 0.
df = df.dropna()
df = df.reset_index(drop=True)

display(df.head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwork_outside_home_1d_rolling_mean_3,smoothed_wothers_masked_rolling_mean_3,smoothed_wcli_rolling_mean_3,smoothed_wcovid_vaccinated_rolling_mean_3,smoothed_wvaccine_likely_friends_rolling_mean_3,smoothed_wrestaurant_1d_rolling_mean_3,smoothed_wvaccine_likely_politicians_rolling_mean_3,smoothed_wvaccine_likely_who_rolling_mean_3,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3
0,2021-02-09,10001,36.782852,15.77906,7.452131,71.744309,63.479295,31.768168,60.557307,40.272843,...,46.956576,86.967849,0.532273,22.01858,37.637672,27.545478,4.862773,30.589359,94.930338,6.512559
1,2021-02-10,10001,33.319952,12.837715,2.041177,72.171535,65.314007,28.52832,63.845535,38.435611,...,44.319854,86.282273,0.522967,23.030676,38.034292,26.741454,5.278243,31.758634,94.914101,5.810141
2,2021-02-11,10001,36.710308,14.203521,2.125673,68.555617,64.170146,31.0201,64.243291,39.718812,...,41.702192,87.285539,0.51815,23.470971,36.915963,24.467085,5.702121,32.396922,94.546291,5.424958
3,2021-02-12,10001,36.219647,15.232744,1.477105,72.127871,60.310761,32.0248,66.073791,36.354908,...,39.475755,88.322404,0.671922,23.625337,36.372293,22.358503,6.8442,33.456327,93.910664,5.030087
4,2021-02-09,10003,35.4718,17.469862,2.019208,82.620697,70.510884,33.113208,57.594607,31.909814,...,33.474113,90.675422,0.753434,20.434064,38.595478,14.556031,11.610791,37.814411,95.003618,6.782084
5,2021-02-10,10003,39.295765,19.214168,2.129242,82.647912,70.127041,32.242405,59.024216,33.449583,...,32.706773,91.573524,0.779824,20.233233,40.751533,13.640178,11.205921,38.152691,95.629209,6.089325
6,2021-02-11,10003,38.01859,19.465982,2.806468,82.43269,68.694883,31.822838,59.431513,32.916682,...,32.708331,92.412977,0.670868,19.214703,41.726311,13.208282,10.899104,38.52779,96.297901,5.684877
7,2021-02-12,10003,35.744458,18.085223,2.8387,81.036502,68.254607,31.920761,56.85028,31.162847,...,32.758693,93.274988,0.599445,18.108489,41.269198,13.595529,10.412245,38.603329,96.694705,5.687314
8,2021-02-09,10005,28.302986,18.279526,2.053015,76.800535,69.657325,35.448268,59.476823,33.500262,...,35.191288,90.506277,0.284185,22.391103,28.737792,21.877523,11.838485,31.948337,93.177636,8.326559
9,2021-02-10,10005,29.053794,19.201394,1.589194,77.967082,66.807383,35.643209,58.385247,32.232714,...,34.733073,91.169318,0.421764,23.970946,29.997531,22.584374,13.328537,33.288258,93.52056,7.784731


Shape:  (1560, 54)


In [13]:
# Number of distinct dates remaining for each geo_value.
df.groupby('geo_value').agg(time_value=('time_value', 'nunique'))

Unnamed: 0_level_0,time_value
geo_value,Unnamed: 1_level_1
10001,4
10003,4
10005,4
11001,4
12001,4
...,...
55087,4
55101,4
55105,4
55133,4


# Interaction

In [14]:
# Names of the first 20 columns.
df.columns[:20].tolist()

['time_value',
 'geo_value',
 'smoothed_wspent_time_1d',
 'smoothed_wtested_14d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d',
 'State']

In [15]:
# (left, right) column pairs. Each pair produces a feature named
# "<left>_x_<right>" whose value is left * right / 100.
interaction_pairs = [
    ('smoothed_wworried_become_ill', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wworried_become_ill', 'smoothed_wspent_time_1d'),
    ('smoothed_wcli_lag_3', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wothers_masked', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wtested_14d', 'smoothed_wvaccine_likely_govt_health'),
    ('smoothed_wcli', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wcovid_vaccinated_or_accept', 'smoothed_wvaccine_likely_friends'),
]

for left_col, right_col in interaction_pairs:
    df[f'{left_col}_x_{right_col}'] = df[left_col] * df[right_col] / 100

In [16]:
# Summary statistics, one row per column (first 50 columns only).
summary_stats = df.describe().T
summary_stats.head(50)

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_value,1560.0,2021-02-10 12:21:13.846153984,2021-02-09 00:00:00,2021-02-10 00:00:00,2021-02-11 00:00:00,2021-02-12 00:00:00,2021-02-12 00:00:00,
smoothed_wspent_time_1d,1560.0,34.784198,18.409056,30.932344,34.791486,38.578333,53.434564,5.811004
smoothed_wtested_14d,1560.0,11.717631,2.963201,8.710662,11.051046,13.748114,30.10774,4.386315
smoothed_wpublic_transit_1d,1560.0,3.516378,0.123266,1.803839,2.77625,4.080309,36.015469,3.704213
smoothed_wcovid_vaccinated_or_accept,1560.0,77.0659,55.836485,72.657391,77.579045,81.857238,95.95905,6.795029
smoothed_wworried_become_ill,1560.0,67.731834,47.633502,64.297448,67.882948,71.334547,87.267787,5.577445
smoothed_wvaccine_likely_govt_health,1560.0,30.339842,11.856773,25.464736,29.848554,34.948728,52.23676,6.649011
smoothed_wshop_1d,1560.0,58.145188,43.851219,54.878149,58.076137,61.446575,73.849685,4.848075
smoothed_wwork_outside_home_1d,1560.0,37.990156,19.651924,34.216912,37.985658,41.750466,58.76334,5.736786
smoothed_wothers_masked,1560.0,83.303633,22.668564,78.100391,87.046647,91.591692,98.570748,11.627496


## Time Trend
A time trend is a systematic change over time that affects all entities in the same way; it captures the evolution that is common to every entity.

In [17]:
# Day of week
# One 0/1 indicator column per weekday name, e.g. week_no_Monday.
day_of_week_dummies = pd.get_dummies(df['day_of_week'], prefix='week_no').astype(int)

# The weekend flag is derived straight from the weekday name rather than by adding
# the week_no_Saturday and week_no_Sunday columns. get_dummies only creates a
# column for a day that occurs in the data (or is a declared category), so those
# two columns are not guaranteed to exist and looking them up can raise KeyError.
day_of_week_dummies['dummy_weekend'] = (
    df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
)
day_of_week_dummies

Unnamed: 0,week_no_Monday,week_no_Tuesday,week_no_Wednesday,week_no_Thursday,week_no_Friday,week_no_Saturday,week_no_Sunday,dummy_weekend
0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1555,0,0,0,0,1,0,0,0
1556,0,1,0,0,0,0,0,0
1557,0,0,1,0,0,0,0,0
1558,0,0,0,1,0,0,0,0


In [18]:
# Attach the weekend flag by column assignment (aligned on the row index).
# Unlike pd.concat(axis=1), re-running this cell overwrites the existing column
# instead of appending a second column with the same name.
df['dummy_weekend'] = day_of_week_dummies['dummy_weekend']

display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3,smoothed_wworried_become_ill_x_smoothed_wwork_outside_home_1d,smoothed_wworried_become_ill_x_smoothed_wspent_time_1d,smoothed_wcli_lag_3_x_smoothed_wwork_outside_home_1d,smoothed_wothers_masked_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wtested_14d_x_smoothed_wvaccine_likely_govt_health,smoothed_wcli_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wcovid_vaccinated_or_accept_x_smoothed_wvaccine_likely_friends,dummy_weekend
0,2021-02-09,10001,36.782852,15.77906,7.452131,71.744309,63.479295,31.768168,60.557307,40.272843,...,94.930338,6.512559,25.564917,23.349495,0.212319,61.307108,5.012718,0.358206,27.864196,0
1,2021-02-10,10001,33.319952,12.837715,2.041177,72.171535,65.314007,28.52832,63.845535,38.435611,...,94.914101,5.810141,25.103838,21.762596,0.210653,65.203912,3.662384,0.38512,24.972099,0
2,2021-02-11,10001,36.710308,14.203521,2.125673,68.555617,64.170146,31.0201,64.243291,39.718812,...,94.546291,5.424958,25.487619,23.557058,0.207154,61.130525,4.405946,0.67381,24.459031,0
3,2021-02-12,10001,36.219647,15.232744,1.477105,72.127871,60.310761,32.0248,66.073791,36.354908,...,93.910664,5.030087,21.925922,21.844344,0.181513,65.709163,4.878256,0.719424,24.207993,0
4,2021-02-09,10003,35.4718,17.469862,2.019208,82.620697,70.510884,33.113208,57.594607,31.909814,...,95.003618,6.782084,22.499892,25.011479,0.190715,77.058248,5.784832,0.559209,35.154173,0


Shape:  (1560, 62)


## Fixed Effects
Fixed effects control for entity-specific characteristics that don’t change over time but may influence the dependent variable.

In [19]:
# State fixed effects
# One 0/1 indicator column per state, named dummy_<State>.
stage_dummies = pd.get_dummies(df['State'], prefix='dummy')

# Hawaii is left without an indicator (presumably as the reference category --
# TODO confirm). errors='ignore' keeps the cell working when Hawaii does not
# occur in this dataset.
stage_dummies = stage_dummies.drop(columns=['dummy_Hawaii'], errors='ignore')

# Guarantee that a dummy_Wyoming column exists (presumably so the column set
# matches another dataset -- TODO confirm). It is only added when missing, so
# genuine Wyoming rows are never overwritten with zeros.
if 'dummy_Wyoming' not in stage_dummies.columns:
    stage_dummies['dummy_Wyoming'] = 0

stage_dummies = stage_dummies.astype(int)
df = pd.concat([df, stage_dummies], axis=1)
display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_South Dakota,dummy_Tennessee,dummy_Texas,dummy_Utah,dummy_Vermont,dummy_Virginia,dummy_Washington,dummy_West Virginia,dummy_Wisconsin,dummy_Wyoming
0,2021-02-09,10001,36.782852,15.77906,7.452131,71.744309,63.479295,31.768168,60.557307,40.272843,...,0,0,0,0,0,0,0,0,0,0
1,2021-02-10,10001,33.319952,12.837715,2.041177,72.171535,65.314007,28.52832,63.845535,38.435611,...,0,0,0,0,0,0,0,0,0,0
2,2021-02-11,10001,36.710308,14.203521,2.125673,68.555617,64.170146,31.0201,64.243291,39.718812,...,0,0,0,0,0,0,0,0,0,0
3,2021-02-12,10001,36.219647,15.232744,1.477105,72.127871,60.310761,32.0248,66.073791,36.354908,...,0,0,0,0,0,0,0,0,0,0
4,2021-02-09,10003,35.4718,17.469862,2.019208,82.620697,70.510884,33.113208,57.594607,31.909814,...,0,0,0,0,0,0,0,0,0,0


Shape:  (1560, 105)


In [20]:
# Normalise column names in one pass: trim surrounding whitespace, turn spaces
# into underscores, then lower-case everything.
df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()

display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_south_dakota,dummy_tennessee,dummy_texas,dummy_utah,dummy_vermont,dummy_virginia,dummy_washington,dummy_west_virginia,dummy_wisconsin,dummy_wyoming
0,2021-02-09,10001,36.782852,15.77906,7.452131,71.744309,63.479295,31.768168,60.557307,40.272843,...,0,0,0,0,0,0,0,0,0,0
1,2021-02-10,10001,33.319952,12.837715,2.041177,72.171535,65.314007,28.52832,63.845535,38.435611,...,0,0,0,0,0,0,0,0,0,0
2,2021-02-11,10001,36.710308,14.203521,2.125673,68.555617,64.170146,31.0201,64.243291,39.718812,...,0,0,0,0,0,0,0,0,0,0
3,2021-02-12,10001,36.219647,15.232744,1.477105,72.127871,60.310761,32.0248,66.073791,36.354908,...,0,0,0,0,0,0,0,0,0,0
4,2021-02-09,10003,35.4718,17.469862,2.019208,82.620697,70.510884,33.113208,57.594607,31.909814,...,0,0,0,0,0,0,0,0,0,0


Shape:  (1560, 105)


In [21]:
# Final check: list only the columns that still contain missing values.
# An empty table means there are none.
df_null = df.isnull().sum().to_frame(name='NULL')
df_null.loc[df_null['NULL'] > 0]

Unnamed: 0,NULL


In [22]:
# Write the engineered feature table to disk.
df.to_pickle(path="feature_engineering_test_dataset.pkl")