In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned validation dataset and preview its first rows and dimensions.
# NOTE: read_pickle can execute arbitrary code; only load pickle files from a trusted source.
input_path = "cleaned_validation_dataset.pkl"
df = pd.read_pickle(input_path)

display(df.head(3))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wcovid_vaccinated,smoothed_wvaccine_likely_friends,smoothed_wrestaurant_1d,smoothed_wvaccine_likely_politicians,smoothed_wvaccine_likely_who,smoothed_wwearing_mask,smoothed_wlarge_event_1d,State,County Name,day_of_week
0,2021-01-30,10001,32.058222,16.388572,2.760288,71.190606,67.951268,27.628605,61.100831,39.865526,...,22.32368,29.24765,11.841485,8.860422,30.097123,96.330866,10.431381,Delaware,Kent,Saturday
1,2021-01-31,10001,33.567386,16.823052,3.810319,68.516498,65.308174,23.597192,66.3005,43.130462,...,22.165295,27.032093,13.656526,7.257635,27.500973,95.13535,11.917033,Delaware,Kent,Sunday
2,2021-02-01,10001,33.480379,15.760475,3.721566,68.230689,67.15467,22.022345,64.292646,41.93813,...,21.96303,28.21977,12.454998,5.64563,27.786762,94.550634,10.582422,Delaware,Kent,Monday


Shape:  (3010, 22)


In [3]:
# Number of missing values in each column of the freshly loaded frame.
df.isna().sum()

time_value                              0
geo_value                               0
smoothed_wspent_time_1d                 0
smoothed_wtested_14d                    0
smoothed_wpublic_transit_1d             0
smoothed_wcovid_vaccinated_or_accept    0
smoothed_wworried_become_ill            0
smoothed_wvaccine_likely_govt_health    0
smoothed_wshop_1d                       0
smoothed_wwork_outside_home_1d          0
smoothed_wothers_masked                 0
smoothed_wcli                           0
smoothed_wcovid_vaccinated              0
smoothed_wvaccine_likely_friends        0
smoothed_wrestaurant_1d                 0
smoothed_wvaccine_likely_politicians    0
smoothed_wvaccine_likely_who            0
smoothed_wwearing_mask                  0
smoothed_wlarge_event_1d                0
State                                   0
County Name                             0
day_of_week                             0
dtype: int64

In [4]:
# Columns excluded from the lag / rolling transformations: identifiers, labels,
# and the 14-day "tested" indicators.
non_transform_columns = [
    'time_value',
    'geo_value',
    'State',
    'County Name',
    'day_of_week',
    'smoothed_wtested_14d',
    'smoothed_wtested_positive_14d',
]
# Keep the remaining columns in their original frame order.
list_transform_features = df.columns[~df.columns.isin(non_transform_columns)].tolist()
list_transform_features

['smoothed_wspent_time_1d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d']

## Entity-Level Dynamics
Entity-level dynamics describe how past values of a variable for the same entity affect its current value, i.e. the dynamic behavior within an individual entity over time.

Is it OK if each `geo_value` has only 14 records?
- 7,000+ rows = enough to train LightGBM and avoid overfitting.
- Lag features are crucial for time-aware behavior (e.g., "cases rise 7 days after mobility increases").
- Losing a few days of data per geo is acceptable if lag features significantly improve model performance.

In [5]:
# Lagged copy of every feature in list_transform_features, computed per county (geo_value).
# Assumption: a county's recent level is informative about its current level
# (e.g. high vaccination a few days ago -> similar or increasing vaccination now).
# <feature>_lag_3: value of <feature> three rows earlier for the same geo_value
#   (short-run effect; three days earlier assuming one row per day — TODO confirm).
# Only the 3-row lag is built here; no 7-day lag column is created.
# smoothed_wtested_14d and smoothed_wtested_positive_14d are not lagged because
# they already cover the last 14 days (they are excluded from list_transform_features).
#
# shift() is positional, so rows are first ordered by date within each county.
# The shifted values are assigned back by index label (the index must be unique),
# so df keeps its current row order.
rows_by_geo_and_date = df.sort_values(['geo_value', 'time_value'], kind='stable')
features_by_geo = rows_by_geo_and_date.groupby('geo_value')
for a_col in list_transform_features:
    df['%s_lag_3' % a_col] = features_by_geo[a_col].shift(3)

In [6]:
# Spot-check one feature next to its 3-row lag across the first few counties.
lag_preview_cols = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_lag_3',
]
display(df.loc[:, lag_preview_cols].head(43))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_lag_3
0,2021-01-30,10001,22.32368,
1,2021-01-31,10001,22.165295,
2,2021-02-01,10001,21.96303,
3,2021-02-02,10001,19.062903,22.32368
4,2021-02-03,10001,19.25492,22.165295
5,2021-02-04,10001,20.664035,21.96303
6,2021-02-05,10001,20.781314,19.062903
7,2021-01-30,10003,17.100805,
8,2021-01-31,10003,17.166713,
9,2021-02-01,10003,17.027711,


Shape:  (3010, 38)


In [7]:
# Rolling means per county (geo_value) for every transformable feature.
# Each window of `a_lag` rows includes the current row; the first a_lag - 1
# rows of every county are NaN.
list_select_period = [3]
features_by_geo = df.groupby(['geo_value'])

rolling_features = {
    f'{a_col}_rolling_mean_{a_lag}': (
        features_by_geo[a_col]
        .rolling(window=a_lag)
        .mean()
        # Drop the geo_value level so the result aligns with df's own index.
        .reset_index(level=0, drop=True)
    )
    for a_col in list_transform_features
    for a_lag in list_select_period
}

# Build all new columns at once and append them, aligned on the index.
df_rolling = pd.DataFrame(rolling_features)
df = pd.concat([df, df_rolling], axis=1)

In [8]:
# Spot-check one feature next to its 3-row rolling mean.
rolling_preview_cols = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
]
display(df.loc[:, rolling_preview_cols].head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
0,2021-01-30,10001,22.32368,
1,2021-01-31,10001,22.165295,
2,2021-02-01,10001,21.96303,22.150669
3,2021-02-02,10001,19.062903,21.063743
4,2021-02-03,10001,19.25492,20.093618
5,2021-02-04,10001,20.664035,19.660619
6,2021-02-05,10001,20.781314,20.233423
7,2021-01-30,10003,17.100805,
8,2021-01-31,10003,17.166713,
9,2021-02-01,10003,17.027711,17.09841


Shape:  (3010, 54)


In [9]:
# Shift every rolling-mean column down by one row within each county so the
# value on a row is built from earlier rows only (excludes the current row).
# NOTE: this overwrites the columns in place; running the cell again shifts them
# by one more row.
rolling_mean_cols = [
    f'{a_col}_rolling_mean_{a_lag}'
    for a_col in list_transform_features
    for a_lag in list_select_period
]
df[rolling_mean_cols] = df.groupby('geo_value')[rolling_mean_cols].shift(1)

In [10]:
# Re-inspect the rolling mean after it has been shifted by one row.
shifted_preview_cols = [
    'time_value',
    'geo_value',
    'smoothed_wcovid_vaccinated',
    'smoothed_wcovid_vaccinated_rolling_mean_3',
]
display(df.loc[:, shifted_preview_cols].head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wcovid_vaccinated,smoothed_wcovid_vaccinated_rolling_mean_3
0,2021-01-30,10001,22.32368,
1,2021-01-31,10001,22.165295,
2,2021-02-01,10001,21.96303,
3,2021-02-02,10001,19.062903,22.150669
4,2021-02-03,10001,19.25492,21.063743
5,2021-02-04,10001,20.664035,20.093618
6,2021-02-05,10001,20.781314,19.660619
7,2021-01-30,10003,17.100805,
8,2021-01-31,10003,17.166713,
9,2021-02-01,10003,17.027711,


Shape:  (3010, 54)


In [11]:
# Missing values per column after adding the lag and rolling-mean features.
df.isna().sum()

time_value                                                0
geo_value                                                 0
smoothed_wspent_time_1d                                   0
smoothed_wtested_14d                                      0
smoothed_wpublic_transit_1d                               0
smoothed_wcovid_vaccinated_or_accept                      0
smoothed_wworried_become_ill                              0
smoothed_wvaccine_likely_govt_health                      0
smoothed_wshop_1d                                         0
smoothed_wwork_outside_home_1d                            0
smoothed_wothers_masked                                   0
smoothed_wcli                                             0
smoothed_wcovid_vaccinated                                0
smoothed_wvaccine_likely_friends                          0
smoothed_wrestaurant_1d                                   0
smoothed_wvaccine_likely_politicians                      0
smoothed_wvaccine_likely_who            

In [12]:
# Remove every row that contains at least one NaN, then renumber the index from 0.
rows_without_nan = df.dropna()
df = rows_without_nan.reset_index(drop=True)

display(df.head(50))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwork_outside_home_1d_rolling_mean_3,smoothed_wothers_masked_rolling_mean_3,smoothed_wcli_rolling_mean_3,smoothed_wcovid_vaccinated_rolling_mean_3,smoothed_wvaccine_likely_friends_rolling_mean_3,smoothed_wrestaurant_1d_rolling_mean_3,smoothed_wvaccine_likely_politicians_rolling_mean_3,smoothed_wvaccine_likely_who_rolling_mean_3,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3
0,2021-02-02,10001,32.242002,16.735176,3.85493,65.747872,67.494766,22.045415,66.438259,44.750377,...,41.644706,90.283782,0.255327,22.150669,28.166504,12.651003,7.254562,28.461619,95.33895,10.976945
1,2021-02-03,10001,34.991057,18.797872,9.298927,66.752089,64.805847,26.106294,63.25537,46.075447,...,43.27299,90.918403,0.264376,21.063743,27.672652,12.84969,5.985934,26.774895,94.687157,10.666794
2,2021-02-04,10001,31.996321,17.816678,8.826546,69.706913,64.414916,26.76527,63.628774,46.781418,...,44.254652,89.937486,0.270912,20.093618,29.989594,14.90279,5.063887,27.230884,94.722001,9.678267
3,2021-02-05,10001,30.638051,19.438635,7.901019,70.114019,66.218037,27.915116,62.510757,45.409562,...,45.869081,88.845165,0.183833,19.660619,32.522062,16.816306,4.958974,28.400366,94.947658,8.991073
4,2021-02-02,10003,29.210699,20.370378,2.895976,77.869953,67.696199,29.839486,52.248428,29.559756,...,27.061699,92.795266,0.506853,17.09841,37.159171,15.311501,11.152152,35.768835,94.39516,7.642562
5,2021-02-03,10003,28.726196,19.998905,2.87307,77.672475,68.476692,30.428855,52.6157,31.106152,...,28.472463,92.19408,0.530944,17.886934,36.206419,16.447282,11.214867,34.845212,93.781766,8.587432
6,2021-02-04,10003,30.24494,19.18783,1.899779,77.266148,69.203247,30.599886,54.884948,32.576998,...,29.568901,91.51985,0.643006,18.746576,35.035754,16.978131,11.482784,34.920981,93.510275,9.374668
7,2021-02-05,10003,31.193276,20.456386,1.589712,77.888796,68.696654,29.837502,57.337879,34.393358,...,31.080969,90.75408,0.782976,19.953477,34.359486,16.221229,11.463284,34.972436,93.451651,9.356756
8,2021-02-02,10005,33.282415,15.577872,0.39463,74.450649,67.639927,28.004007,57.262724,32.653862,...,31.589723,94.110479,0.0,18.228793,33.191993,16.954128,9.535512,31.513179,96.5084,4.899812
9,2021-02-03,10005,31.769023,15.421043,0.637937,72.36359,69.743896,26.800111,56.113851,33.732888,...,32.129438,93.228423,0.0,17.351146,31.909731,17.13065,9.317893,30.499886,96.086238,6.030369


Shape:  (1717, 54)


In [13]:
# Number of distinct dates left for each county after dropping incomplete rows.
df.groupby('geo_value')[['time_value']].nunique()

Unnamed: 0_level_0,time_value
geo_value,Unnamed: 1_level_1
10001,4
10003,4
10005,4
11001,4
12001,4
...,...
55087,4
55101,4
55105,4
55133,4


# Interaction

In [14]:
# Names of the first 20 columns, for picking interaction candidates.
df.columns[:20].tolist()

['time_value',
 'geo_value',
 'smoothed_wspent_time_1d',
 'smoothed_wtested_14d',
 'smoothed_wpublic_transit_1d',
 'smoothed_wcovid_vaccinated_or_accept',
 'smoothed_wworried_become_ill',
 'smoothed_wvaccine_likely_govt_health',
 'smoothed_wshop_1d',
 'smoothed_wwork_outside_home_1d',
 'smoothed_wothers_masked',
 'smoothed_wcli',
 'smoothed_wcovid_vaccinated',
 'smoothed_wvaccine_likely_friends',
 'smoothed_wrestaurant_1d',
 'smoothed_wvaccine_likely_politicians',
 'smoothed_wvaccine_likely_who',
 'smoothed_wwearing_mask',
 'smoothed_wlarge_event_1d',
 'State']

In [15]:
# Pairwise interaction features: each new column is named "<left>_x_<right>" and
# holds left * right / 100.
# NOTE(review): the /100 presumably rescales the product of two percentage
# columns back to a 0-100 range — confirm.
interaction_pairs = [
    ('smoothed_wworried_become_ill', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wworried_become_ill', 'smoothed_wspent_time_1d'),
    ('smoothed_wcli_lag_3', 'smoothed_wwork_outside_home_1d'),
    ('smoothed_wothers_masked', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wtested_14d', 'smoothed_wvaccine_likely_govt_health'),
    ('smoothed_wcli', 'smoothed_wcovid_vaccinated_or_accept'),
    ('smoothed_wcovid_vaccinated_or_accept', 'smoothed_wvaccine_likely_friends'),
]
for left_col, right_col in interaction_pairs:
    df[f'{left_col}_x_{right_col}'] = df[left_col] * df[right_col] / 100

In [16]:
# Summary statistics, one row per column (first 50 columns shown).
df.describe().T.head(50)

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_value,1717.0,2021-02-03 12:02:56.121141760,2021-02-02 00:00:00,2021-02-03 00:00:00,2021-02-04 00:00:00,2021-02-05 00:00:00,2021-02-05 00:00:00,
smoothed_wspent_time_1d,1717.0,34.156634,15.21429,30.224824,34.360044,38.106716,53.051133,6.255252
smoothed_wtested_14d,1717.0,12.556202,2.573159,9.094417,11.762334,15.115267,33.060047,4.72002
smoothed_wpublic_transit_1d,1717.0,3.364896,0.251187,1.883891,2.780641,3.851985,31.461403,3.243124
smoothed_wcovid_vaccinated_or_accept,1717.0,76.654972,52.33708,71.807178,76.750112,82.182663,98.239812,7.414277
smoothed_wworried_become_ill,1717.0,70.11341,44.594098,66.986829,70.607675,73.881839,88.718584,5.608826
smoothed_wvaccine_likely_govt_health,1717.0,31.220995,11.837245,26.371611,31.099432,35.645341,56.14228,6.925684
smoothed_wshop_1d,1717.0,57.548259,43.071511,53.959435,57.259993,61.097353,73.211214,5.08025
smoothed_wwork_outside_home_1d,1717.0,37.244307,14.668926,33.518848,37.270123,41.216937,54.813149,5.963809
smoothed_wothers_masked,1717.0,82.575609,28.627296,76.849522,86.246755,91.395972,98.572248,12.175381


## Time Trend
A time trend is a systematic change over time that affects all entities in the same way; it captures their common evolution over time.

In [17]:
# Day of week: one 0/1 indicator column per weekday, plus a weekend flag.
day_of_week_dummies = pd.get_dummies(df['day_of_week'], prefix='week_no').astype(int)
# Derive the weekend flag from day_of_week itself rather than from the
# week_no_Saturday / week_no_Sunday columns: those columns only exist when the
# data (or a categorical dtype) contains Saturday/Sunday, and looking them up
# raises KeyError otherwise.
day_of_week_dummies['dummy_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
day_of_week_dummies

Unnamed: 0,week_no_Monday,week_no_Tuesday,week_no_Wednesday,week_no_Thursday,week_no_Friday,week_no_Saturday,week_no_Sunday,dummy_weekend
0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1712,0,0,0,0,1,0,0,0
1713,0,1,0,0,0,0,0,0
1714,0,0,1,0,0,0,0,0
1715,0,0,0,1,0,0,0,0


In [18]:
# Only the weekend flag is kept from the day-of-week dummies; append it as a new column.
weekend_flag = day_of_week_dummies[['dummy_weekend']]
df = pd.concat([df, weekend_flag], axis=1)

display(df.head(5))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,smoothed_wwearing_mask_rolling_mean_3,smoothed_wlarge_event_1d_rolling_mean_3,smoothed_wworried_become_ill_x_smoothed_wwork_outside_home_1d,smoothed_wworried_become_ill_x_smoothed_wspent_time_1d,smoothed_wcli_lag_3_x_smoothed_wwork_outside_home_1d,smoothed_wothers_masked_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wtested_14d_x_smoothed_wvaccine_likely_govt_health,smoothed_wcli_x_smoothed_wcovid_vaccinated_or_accept,smoothed_wcovid_vaccinated_or_accept_x_smoothed_wvaccine_likely_friends,dummy_weekend
0,2021-02-02,10001,32.242002,16.735176,3.85493,65.747872,67.494766,22.045415,66.438259,44.750377,...,95.33895,10.976945,30.204162,21.761664,0.111862,60.773376,3.689339,0.182196,18.255616,0
1,2021-02-03,10001,34.991057,18.797872,9.298927,66.752089,64.805847,26.106294,63.25537,46.075447,...,94.687157,10.666794,29.859584,22.676251,0.117389,58.203751,4.907428,0.183158,22.684308,0
2,2021-02-04,10001,31.996321,17.816678,8.826546,69.706913,64.414916,26.76527,63.628774,46.781418,...,94.722001,9.678267,30.134211,20.610403,0.122211,60.580613,4.768682,0.0,24.967046,0
3,2021-02-05,10001,30.638051,19.438635,7.901019,70.114019,66.218037,27.915116,62.510757,45.409562,...,94.947658,8.991073,30.069321,20.287916,0.125836,59.903998,5.426317,0.0,26.873534,0
4,2021-02-02,10003,29.210699,20.370378,2.895976,77.869953,67.696199,29.839486,52.248428,29.559756,...,94.39516,7.642562,20.010831,19.774533,0.161123,70.684312,6.078416,0.480729,27.21241,0


Shape:  (1717, 62)


## Fixed Effects
Fixed effects control for entity-specific characteristics that don’t change over time but may influence the dependent variable.

In [19]:
# State fixed effects: one 0/1 indicator column per state, named "dummy_<State>".
stage_dummies = pd.get_dummies(df['State'], prefix='dummy')
# Hawaii's indicator is removed. errors='ignore' keeps the cell working when no
# Hawaii rows are present.
# NOTE(review): presumably Hawaii is dropped and Wyoming added so the columns match
# the feature set used elsewhere in the pipeline — confirm.
stage_dummies = stage_dummies.drop(columns=['dummy_Hawaii'], errors='ignore')
# Add an all-zero Wyoming indicator only when it is missing, so a real Wyoming
# indicator is never overwritten with zeros.
if 'dummy_Wyoming' not in stage_dummies.columns:
    stage_dummies['dummy_Wyoming'] = 0
stage_dummies = stage_dummies.astype(int)
df = pd.concat([df, stage_dummies], axis=1)
display(df.head())
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_South Dakota,dummy_Tennessee,dummy_Texas,dummy_Utah,dummy_Vermont,dummy_Virginia,dummy_Washington,dummy_West Virginia,dummy_Wisconsin,dummy_Wyoming
0,2021-02-02,10001,32.242002,16.735176,3.85493,65.747872,67.494766,22.045415,66.438259,44.750377,...,0,0,0,0,0,0,0,0,0,0
1,2021-02-03,10001,34.991057,18.797872,9.298927,66.752089,64.805847,26.106294,63.25537,46.075447,...,0,0,0,0,0,0,0,0,0,0
2,2021-02-04,10001,31.996321,17.816678,8.826546,69.706913,64.414916,26.76527,63.628774,46.781418,...,0,0,0,0,0,0,0,0,0,0
3,2021-02-05,10001,30.638051,19.438635,7.901019,70.114019,66.218037,27.915116,62.510757,45.409562,...,0,0,0,0,0,0,0,0,0,0
4,2021-02-02,10003,29.210699,20.370378,2.895976,77.869953,67.696199,29.839486,52.248428,29.559756,...,0,0,0,0,0,0,0,0,0,0


Shape:  (1717, 105)


In [20]:
# Normalize column names: trim surrounding whitespace, replace spaces with
# underscores, and lower-case everything.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_')
    .str.lower()
)

display(df.head(5))
print("Shape: ", df.shape)

Unnamed: 0,time_value,geo_value,smoothed_wspent_time_1d,smoothed_wtested_14d,smoothed_wpublic_transit_1d,smoothed_wcovid_vaccinated_or_accept,smoothed_wworried_become_ill,smoothed_wvaccine_likely_govt_health,smoothed_wshop_1d,smoothed_wwork_outside_home_1d,...,dummy_south_dakota,dummy_tennessee,dummy_texas,dummy_utah,dummy_vermont,dummy_virginia,dummy_washington,dummy_west_virginia,dummy_wisconsin,dummy_wyoming
0,2021-02-02,10001,32.242002,16.735176,3.85493,65.747872,67.494766,22.045415,66.438259,44.750377,...,0,0,0,0,0,0,0,0,0,0
1,2021-02-03,10001,34.991057,18.797872,9.298927,66.752089,64.805847,26.106294,63.25537,46.075447,...,0,0,0,0,0,0,0,0,0,0
2,2021-02-04,10001,31.996321,17.816678,8.826546,69.706913,64.414916,26.76527,63.628774,46.781418,...,0,0,0,0,0,0,0,0,0,0
3,2021-02-05,10001,30.638051,19.438635,7.901019,70.114019,66.218037,27.915116,62.510757,45.409562,...,0,0,0,0,0,0,0,0,0,0
4,2021-02-02,10003,29.210699,20.370378,2.895976,77.869953,67.696199,29.839486,52.248428,29.559756,...,0,0,0,0,0,0,0,0,0,0


Shape:  (1717, 105)


In [21]:
# Final check: list only the columns that still contain missing values
# (an empty table means there are none).
null_counts = df.isnull().sum()
df_null = null_counts.to_frame(name='NULL')
df_null.loc[df_null['NULL'] > 0]

Unnamed: 0,NULL


In [22]:
# Persist the engineered validation features to a pickle file.
output_path = "feature_engineering_validation_dataset.pkl"
df.to_pickle(output_path)