In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline



df = pd.read_pickle(r'pickles/df3.pkl')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10152 entries, 0 to 10151
Data columns (total 51 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Country/Region                  10152 non-null  object        
 1   Lat                             10152 non-null  float64       
 2   Long                            10152 non-null  float64       
 3   Date                            10152 non-null  datetime64[ns]
 4   Continent                       10152 non-null  object        
 5   Region                          10152 non-null  object        
 6   Continent_Asia                  10152 non-null  bool          
 7   Region_encoded                  10152 non-null  float64       
 8   Running_Days                    10152 non-null  int64         
 9   Active/100 Cases                10152 non-null  float64       
 10  Recovered/100 Cases             10152 non-null  float64       
 11  Ne

In [3]:
df.columns

Index(['Country/Region', 'Lat', 'Long', 'Date', 'Continent', 'Region',
       'Continent_Asia', 'Region_encoded', 'Running_Days', 'Active/100 Cases',
       'Recovered/100 Cases', 'New Deaths', 'New Active', 'New Recovered', 'r',
       'new_tests', 'Grocery and pharmacy', 'Parks', 'Residential',
       'Retail and recreation', 'Transit stations', 'Workplaces',
       'School_Closing', 'Workplace_Closing', 'Public_Events_Canceled',
       'Gathering_Restrictions', 'Public_Transport_Closed', 'Stay_at_Home',
       'Internal_Movement_Restrictions', 'International_Travel_Controls',
       'Income_Support', 'Debt_Relief', 'Public_Info_Campaigns',
       'Testing_Policy', 'Contact_Tracing', 'Mask_Mandate', 'Pop_Group',
       'Pop_Group_encoded', 'Pop.Density', 'Density_Group',
       'Density_Group_encoded', '% aging.pop', '% young.pop', '% healthcare',
       'GDP_per_capita', '% smoking', 'total_tests_per_100',
       'new_tests_per_100', 'Pandemic_Status_Worldwide',
       'Pandemic_Sta

In [4]:
lags = ['Workplace_Closing', 'Internal_Movement_Restrictions', 'Stay_at_Home',
        'School_Closing', 'Gathering_Restrictions', 'Public_Events_Canceled',
        'Public_Transport_Closed', 'Mask_Mandate', 'International_Travel_Controls']

for col in lags:
    df[f'{col}_lag30'] = df.groupby('Country/Region')[col].shift(30)

In [5]:
corr_now = df[lags].corrwith(df['New Deaths'])
corr_lag30 = df[[f'{col}_lag30' for col in lags]].corrwith(df['New Deaths'])

corr_comparison = pd.DataFrame({
    'Immediate': corr_now,
    'Lag30': corr_lag30
}).sort_values('Immediate', ascending=False)

print(corr_comparison.round(3))


                                      Immediate  Lag30
Workplace_Closing                         0.255    NaN
Internal_Movement_Restrictions            0.220    NaN
Stay_at_Home                              0.211    NaN
School_Closing                            0.192    NaN
Gathering_Restrictions                    0.186    NaN
Public_Events_Canceled                    0.172    NaN
Public_Transport_Closed                   0.093    NaN
Mask_Mandate                              0.042    NaN
International_Travel_Controls             0.040    NaN
Gathering_Restrictions_lag30                NaN  0.048
Internal_Movement_Restrictions_lag30        NaN  0.076
International_Travel_Controls_lag30         NaN -0.064
Mask_Mandate_lag30                          NaN -0.034
Public_Events_Canceled_lag30                NaN  0.058
Public_Transport_Closed_lag30               NaN -0.025
School_Closing_lag30                        NaN  0.080
Stay_at_Home_lag30                          NaN  0.074
Workplace_

In [6]:
#add 14 day lag to policies 

policy_cols = [
    'School_Closing', 'Workplace_Closing', 'Public_Events_Canceled',
    'Gathering_Restrictions', 'Public_Transport_Closed', 'Stay_at_Home',
    'Internal_Movement_Restrictions', 'International_Travel_Controls', 'Mask_Mandate'
]

# Apply lag and cap NaNs
lag_days = 14  # or 30 if you want a longer delay

for col in policy_cols:
    lag_col = f'{col}_lag{lag_days}'
    df[lag_col] = (
        df.groupby('Country/Region')[col]
          .shift(lag_days)
          .bfill()          # fills first NaNs (start of each country)
          .ffill()          # fills last NaNs (end of each country)
    )


In [7]:
#add 30 day lag to policies 

# Apply lag and cap NaNs
lag_days = 30 

for col in policy_cols:
    lag_col = f'{col}_lag{lag_days}'
    df[lag_col] = (
        df.groupby('Country/Region')[col]
          .shift(lag_days)
          .bfill()          # fills first NaNs (start of each country)
          .ffill()          # fills last NaNs (end of each country)
    )


In [8]:
#add 14 day lag to mobility 

mobility_cols = [
    'Grocery and pharmacy', 'Parks', 'Residential',
    'Retail and recreation', 'Transit stations', 'Workplaces'
]

lag_days = 14  # or 7 for short-term behavior

for col in mobility_cols:
    lag_col = f'{col}_lag{lag_days}'
    df[lag_col] = (
        df.groupby('Country/Region')[col]
          .shift(lag_days)
          .bfill()     # fill first NaNs (early period)
          .ffill()     # fill last NaNs (end period)
    )

In [9]:
#add 30 day lag to mobility 
# Apply lag and cap NaNs
lag_days = 30 

for col in mobility_cols:
    lag_col = f'{col}_lag{lag_days}'
    df[lag_col] = (
        df.groupby('Country/Region')[col]
          .shift(lag_days)
          .bfill()          # fills first NaNs (start of each country)
          .ffill()          # fills last NaNs (end of each country)
    )


In [10]:
#dropping original columns
df = df.drop(columns=[c for c in policy_cols if c in df.columns])
df = df.drop(columns=[c for c in mobility_cols if c in df.columns])

In [11]:
df.columns

Index(['Country/Region', 'Lat', 'Long', 'Date', 'Continent', 'Region',
       'Continent_Asia', 'Region_encoded', 'Running_Days', 'Active/100 Cases',
       'Recovered/100 Cases', 'New Deaths', 'New Active', 'New Recovered', 'r',
       'new_tests', 'Income_Support', 'Debt_Relief', 'Public_Info_Campaigns',
       'Testing_Policy', 'Contact_Tracing', 'Pop_Group', 'Pop_Group_encoded',
       'Pop.Density', 'Density_Group', 'Density_Group_encoded', '% aging.pop',
       '% young.pop', '% healthcare', 'GDP_per_capita', '% smoking',
       'total_tests_per_100', 'new_tests_per_100', 'Pandemic_Status_Worldwide',
       'Pandemic_Status_Code', 'log_GDP_billions', 'Workplace_Closing_lag30',
       'Internal_Movement_Restrictions_lag30', 'Stay_at_Home_lag30',
       'School_Closing_lag30', 'Gathering_Restrictions_lag30',
       'Public_Events_Canceled_lag30', 'Public_Transport_Closed_lag30',
       'Mask_Mandate_lag30', 'International_Travel_Controls_lag30',
       'School_Closing_lag14', 'Work

In [12]:
#interaction between features 

df['Econ_Age_Interaction'] = df['log_GDP_billions'] * df['% aging.pop']
df['Wealth_Health_Interaction'] = df['GDP_per_capita'] * df['% healthcare']


In [13]:
#create geo-cluster

from sklearn.cluster import KMeans
df['GeoCluster'] = KMeans(n_clusters=5, random_state=42).fit_predict(df[['Lat', 'Long']])

In [14]:
df = df.drop(columns=['Lat','Long','Running_Days'],errors='ignore')

df = df.select_dtypes(include=['number'])

In [15]:
df.columns

Index(['Region_encoded', 'Active/100 Cases', 'Recovered/100 Cases',
       'New Deaths', 'New Active', 'New Recovered', 'r', 'new_tests',
       'Income_Support', 'Debt_Relief', 'Public_Info_Campaigns',
       'Testing_Policy', 'Contact_Tracing', 'Pop_Group_encoded', 'Pop.Density',
       'Density_Group_encoded', '% aging.pop', '% young.pop', '% healthcare',
       'GDP_per_capita', '% smoking', 'total_tests_per_100',
       'new_tests_per_100', 'Pandemic_Status_Code', 'log_GDP_billions',
       'Workplace_Closing_lag30', 'Internal_Movement_Restrictions_lag30',
       'Stay_at_Home_lag30', 'School_Closing_lag30',
       'Gathering_Restrictions_lag30', 'Public_Events_Canceled_lag30',
       'Public_Transport_Closed_lag30', 'Mask_Mandate_lag30',
       'International_Travel_Controls_lag30', 'School_Closing_lag14',
       'Workplace_Closing_lag14', 'Public_Events_Canceled_lag14',
       'Gathering_Restrictions_lag14', 'Public_Transport_Closed_lag14',
       'Stay_at_Home_lag14', 'Internal

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge


y=df['New Deaths']
X = df.drop(columns=['New Deaths'])


# Fit models and determine if a feature is selected (1) or not (0)
lasso = Lasso(alpha=5).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Fit Ridge model
ridge = Ridge(alpha=5).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

gb = GradientBoostingRegressor().fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestRegressor().fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store results
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected, 
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature
selection_df['Sum'] = selection_df[['Lasso', 'GradientBoost', 'RandomForest','Ridge']].sum(axis=1)




In [17]:
selection_df

Unnamed: 0,Feature,Lasso,GradientBoost,RandomForest,Ridge,Sum
0,Region_encoded,1,1,1,1,4
1,Active/100 Cases,1,1,1,1,4
2,Recovered/100 Cases,1,1,1,1,4
3,New Active,1,1,1,1,4
4,New Recovered,1,1,1,1,4
5,r,0,1,1,1,3
6,new_tests,1,1,1,1,4
7,Income_Support,0,1,1,1,3
8,Debt_Relief,0,0,1,1,2
9,Public_Info_Campaigns,0,0,1,1,2


In [18]:
#drop less impactful lag cols 


cols_to_drop = [
    "Gathering_Restrictions_lag14",
    "Grocery_and_pharmacy_lag14",
    "Internal_Movement_Restrictions_lag14",
    "International_Travel_Controls_lag14",
    "Mask_Mandate_lag14",
    "Parks_lag14",
    "Public_Events_Canceled_lag14",
    "Public_Transport_Closed_lag14",
    "Residential_lag14",
    "School_Closing_lag14",
    "Stay_at_Home_lag14",
    "Transit_stations_lag14",
    "Workplace_Closing_lag14",
    "Workplaces_lag14",
    "Retail_and_recreation_lag30"
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

In [19]:
df.to_pickle(r'pickles/df4.pkl')