In [1]:
import pandas as pd

In [2]:
# Load the county-level table of cases, deaths and intervention flags.
# The path is relative to the notebook's working directory.
df_npi = pd.read_csv('covid_data/interventions_and_deaths.csv')
# Print every column name on one line. The loop variable is intentionally not
# named `id`: top-level loop variables stay in the kernel namespace, and `id`
# would shadow the builtin id() for every later cell.
for column_name in df_npi.columns:
    print(column_name, end=', ')
df_npi

date, county, state_name, fips, cases, deaths, county_code, state, locality, locality_original, CPV, CPV_2, CPV_3, CPV_4, CPV_5, CPV_50, CPV_6, GS_10, GS_10_2, GS_10_3, GS_100, GS_1000, GS_25, GS_25_2, GS_250, GS_250_2, GS_50, GS_50_2, GS_500, LD, LD_2, NESC, NESC_2, NESC_3, PC, PC_2, PC_3, PC_4, PC_5, PC_6, SD, SD_2, SD_3, SDO, SDO_2, SDO_3, SDO_4,

Unnamed: 0,date,county,state_name,fips,cases,deaths,county_code,state,locality,locality_original,...,PC_4,PC_5,PC_6,SD,SD_2,SD_3,SDO,SDO_2,SDO_3,SDO_4
0,2020-03-01,Alameda,California,6001.0,1,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-02,Alameda,California,6001.0,1,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
2,2020-03-03,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
3,2020-03-04,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
4,2020-03-05,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,2020-03-23,Snohomish,Washington,53061.0,518,10,61,WA,snohomish county,snohomish_county,...,1,1,0,1,0,0,1,1,1,0
896,2020-03-24,Snohomish,Washington,53061.0,613,15,61,WA,snohomish county,snohomish_county,...,1,1,0,1,0,0,1,1,1,0
897,2020-03-25,Snohomish,Washington,53061.0,633,15,61,WA,snohomish county,snohomish_county,...,1,1,0,1,0,0,1,1,1,0
898,2020-03-26,Snohomish,Washington,53061.0,778,18,61,WA,snohomish county,snohomish_county,...,1,1,0,1,0,0,1,1,1,0


## Clone this [link](https://github.com/khakieconomics/covid_data) in the same directory as this notebook
The NPIs defined are:
- **SDO** - Social Distancing of particularly vulnerable portions of the population
- **SD** - Social Distancing of the general population
- **GS_XX** - Gathering size limitation, with the digits indicating the ceiling of acceptable gatherings. A government order indicating that gatherings above that size are prohibited
- **CPV** - Closure of Public Venues. A government order closing gathering venues for in-person service, such as restaurants, bars, and theaters
- **PC** - Closure of schools and universities
- **NESC** - Non-Essential Services Closure, i.e. a government order closing non-essential services and shops
- **LD** – Lock Down (pending)
### In df, a single NPI can have multiple data sources, leading to columns named `<intervention_type>_<source_id>`. For example, `CPV`, `CPV_2`,...,`CPV_6` are the `CPV` status from `6` sources, but represent the same information. `CPV_50`, on the other hand, is an independent status. For more info, see [here](https://github.com/khakieconomics/covid_data/blob/master/data_munging.R). For simplicity, we treat each status as positive if at least one source affirms it. The columns and the information they carry are summarized as:

In [3]:
# Mapping from each merged NPI column to the per-source columns that feed it.
# The first source of an NPI carries the bare name; further sources are
# suffixed _2, _3, ... (there is no _1 suffix).
def _source_columns(base, n_sources):
    '''Return [base, base_2, ..., base_<n_sources>].'''
    return [base] + ['{}_{}'.format(base, k) for k in range(2, n_sources + 1)]

index_dict = {
    # SD also absorbs the SDO columns.
    'SD': _source_columns('SD', 3) + _source_columns('SDO', 4),
    'GS_10': _source_columns('GS_10', 3),
    'GS_25': _source_columns('GS_25', 2),
    'GS_50': _source_columns('GS_50', 2),
    'GS_100': _source_columns('GS_100', 1),
    'GS_250': _source_columns('GS_250', 2),
    'GS_500': _source_columns('GS_500', 1),
    'GS_1000': _source_columns('GS_1000', 1),
    'CPV': _source_columns('CPV', 6),
    # CPV_50 is its own status, not a seventh source of CPV.
    'CPV_50': _source_columns('CPV_50', 1),
    'LD': _source_columns('LD', 2),
    'NESC': _source_columns('NESC', 3),
    'PC': _source_columns('PC', 6),
}
NPI_types = [npi_name for npi_name in index_dict]
print(NPI_types)

['SD', 'GS_10', 'GS_25', 'GS_50', 'GS_100', 'GS_250', 'GS_500', 'GS_1000', 'CPV', 'CPV_50', 'LD', 'NESC', 'PC']


### Now remove redundancy

In [4]:
def merge_columns(df, name_final, names_to_merge):
    '''
    Collapse several indicator columns into a single 0/1 column, in place.

    Input
        df: DataFrame to modify
        name_final: name of result column (may equal one of names_to_merge)
        names_to_merge: list of columns
    Output
        None
    Inplace change:
        Columns names_to_merge are dropped and name_final is appended.
        Value is 1 if any of names_to_merge is > 0, 0 otherwise.
        A missing (NaN) entry counts as 0, so it cannot hide a positive
        entry from another source.
        If any of names_to_merge is absent from df, a message is printed
        and df is left completely untouched.
    Raises
        ValueError: if names_to_merge is empty.
    '''
    source_columns = list(names_to_merge)
    if not source_columns:
        raise ValueError('names_to_merge must contain at least one column name.')

    # Validate before touching df so a partially-present group can never
    # leave the frame half-modified.
    if any(col not in df.columns for col in source_columns):
        print('{} already merged.'.format(names_to_merge))
        return

    # Computed from a fresh selection, so no existing column is written to
    # before the drop below.
    merged = df[source_columns].gt(0).any(axis=1).astype(int)
    df.drop(columns=source_columns, inplace=True)
    df[name_final] = merged


#### 1. Merge columns

In [5]:
# Collapse each group of per-source columns into its single merged NPI column.
for npi in NPI_types:
    source_columns = index_dict[npi]
    merge_columns(df=df_npi, name_final=npi, names_to_merge=source_columns)

In [6]:
# Print the column names after merging, all on one line. The loop variable is
# intentionally not named `id`, which would shadow the builtin id() in the
# kernel namespace for every later cell.
for column_name in df_npi.columns:
    print(column_name, end=', ')

date, county, state_name, fips, cases, deaths, county_code, state, locality, locality_original, SD, GS_10, GS_25, GS_50, GS_100, GS_250, GS_500, GS_1000, CPV, CPV_50, LD, NESC, PC,

Unnamed: 0,date,county,state_name,fips,cases,deaths,county_code,state,locality,locality_original,...,GS_50,GS_100,GS_250,GS_500,GS_1000,CPV,CPV_50,LD,NESC,PC
0,2020-03-01,Alameda,California,6001.0,1,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-02,Alameda,California,6001.0,1,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
2,2020-03-03,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
3,2020-03-04,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
4,2020-03-05,Alameda,California,6001.0,2,0,1,CA,alameda county,alameda_county,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,2020-03-23,Snohomish,Washington,53061.0,518,10,61,WA,snohomish county,snohomish_county,...,1,0,1,0,0,1,0,0,0,1
896,2020-03-24,Snohomish,Washington,53061.0,613,15,61,WA,snohomish county,snohomish_county,...,1,0,1,0,0,1,0,0,0,1
897,2020-03-25,Snohomish,Washington,53061.0,633,15,61,WA,snohomish county,snohomish_county,...,1,0,1,0,0,1,0,0,0,1
898,2020-03-26,Snohomish,Washington,53061.0,778,18,61,WA,snohomish county,snohomish_county,...,1,0,1,0,0,1,0,0,0,1


#### 2. Assign levels

In [8]:
# Ordinal (non-zero) levels used for each NPI
# SD: 1

# Gathering-size limits, listed strictest first:
# GS_10 -> 7, GS_25 -> 6, GS_50 -> 5, GS_100 -> 4, GS_250 -> 3, GS_500 -> 2, GS_1000 -> 1
GS_levels = ['GS_10', 'GS_25', 'GS_50', 'GS_100', 'GS_250', 'GS_500', 'GS_1000']
def assign_GS(series):
    '''
    Return the gathering-size level of one row.

    series is indexed by the names in GS_levels. The result is the rank of
    the strictest limit whose value is > 0 (GS_1000 ranks 1, GS_10 ranks 7),
    or 0 when no gathering-size limit is active.
    '''
    active_ranks = (
        rank
        for rank, column in enumerate(reversed(GS_levels), start=1)
        if series[column] > 0
    )
    return max(active_ranks, default=0)

# Encode each row's gathering-size level, then retire the flag columns it came from.
gs_level = df_npi.loc[:, GS_levels].apply(assign_GS, axis='columns')
df_npi['GS'] = gs_level
df_npi.drop(GS_levels, axis='columns', inplace=True)

# Venue closures: CPV (full closure) -> 2, CPV_50 -> 1.
# The merged flag is moved aside to 'CPV_all' so that the name 'CPV' is free
# for the ordinal level.
df_npi.rename({"CPV": "CPV_all"}, axis='columns', inplace=True)
# Venue-closure flags, listed strictest first.
CPV_levels = ['CPV_all', 'CPV_50']
def assign_CPV(series):
    '''
    Return the venue-closure level of one row.

    series is indexed by the names in CPV_levels. The result is 2 when
    'CPV_all' is > 0, otherwise 1 when 'CPV_50' is > 0, otherwise 0.
    '''
    active_ranks = (
        rank
        for rank, column in enumerate(reversed(CPV_levels), start=1)
        if series[column] > 0
    )
    return max(active_ranks, default=0)

# Encode each row's venue-closure level, then retire the two flag columns it came from.
cpv_level = df_npi.loc[:, CPV_levels].apply(assign_CPV, axis='columns')
df_npi['CPV'] = cpv_level
df_npi.drop(CPV_levels, axis='columns', inplace=True)

# The remaining NPIs have a single non-zero level:
# LD: 1
# NESC: 1
# PC: 1

In [9]:
# Persist the table with ordinal policy levels (the row index is written as
# the first, unnamed CSV column).
df_npi.to_csv('npi_multi_policy_level.csv', index=True)

### Segment series for each county depending on no-policy / any-policy

In [10]:
def extract_policy_start_day(df):
    '''
    Return the earliest date on which any NPI is active in df.

    Input
        df: DataFrame with a 'date' column (anything pd.to_datetime accepts)
            and the policy columns 'SD', 'LD', 'NESC', 'PC', 'GS', 'CPV'.
    Output
        pandas Timestamp of the first day on which at least one policy column
        is > 0, or None if no policy is ever active.
        df itself is not modified.
    '''
    policy_columns = ['SD', 'LD', 'NESC', 'PC', 'GS', 'CPV']
    dates = pd.to_datetime(df['date'])
    # One vectorised mask instead of a Python-level scan over rows.
    any_policy_active = df[policy_columns].gt(0).any(axis=1)
    # Positional mask (.values) so a non-unique row index cannot affect selection.
    active_dates = dates[any_policy_active.values]
    if active_dates.empty:
        return None
    # The earliest active date is what a date-sorted scan would hit first.
    return active_dates.min()

In [11]:
# Group the rows by county once and derive both per-county summaries from it:
# the first date with any policy in force, and the state the county belongs to.
# NOTE(review): the key is the bare county name, so same-named counties from
# different states would be pooled into one group. Confirm names are unique in
# this data, or consider keying on `fips`.
rows_by_county = df_npi.groupby('county')
df_county_start = rows_by_county.apply(extract_policy_start_day)
df_county_state = rows_by_county.apply(lambda county_rows: county_rows['state_name'].iloc[0])

In [12]:
# Put the two per-county series side by side, labelled via `keys`, and turn
# the county index back into an ordinary column.
df_county = pd.concat(
    [df_county_state, df_county_start],
    axis=1,
    keys=['state_name', 'start_date'],
).reset_index()

In [14]:
# Persist the per-county start dates (the row index is written as the first,
# unnamed CSV column).
df_county.to_csv('county_start_date.csv', index=True)

### Segment series for each state depending on no-policy / any-policy

In [15]:
# A state's start date is the earliest start date among its counties.
df_state = (
    df_county
    .groupby('state_name')
    .apply(lambda county_rows: county_rows['start_date'].min())
)

In [17]:
# Persist the per-state start dates, keyed by the state_name index.
df_state.to_csv('state_start_date.csv', index=True)

### When counties in the same state have different policies on the same day, take the most severe one

In [35]:
def pick_highest_severity(df):
    '''
    Return the row of df whose summed policy levels are largest.

    Severity is SD + LD + NESC + PC + GS + CPV. When several rows tie,
    the first of them (in df's row order) is returned.
    '''
    severity = None
    for column in ('SD', 'LD', 'NESC', 'PC', 'GS', 'CPV'):
        severity = df[column] if severity is None else severity + df[column]
    strictest_label = severity.idxmax(axis=0)
    return df.loc[strictest_label, :]

In [36]:
# For every (state, day) keep the single county row with the highest summed policy level.
rows_by_state_day = df_npi.groupby(['state_name', 'date'])
df_npi_state = rows_by_state_day.apply(pick_highest_severity)

In [44]:
# Discard the index left by the groupby and number the rows 0..n-1, in place.
df_npi_state.index = pd.RangeIndex(len(df_npi_state))

In [45]:
# Persist the one-row-per-state-per-day table (the row index is written as
# the first, unnamed CSV column).
df_npi_state.to_csv('npi_multi_policy_level_unique_daily.csv', index=True)

### Start date of every policy combination

In [47]:
# Earliest date on which each state shows each distinct combination of policy levels.
policy_columns = ['SD', 'LD', 'NESC', 'PC', 'GS', 'CPV']
all_comb_start = (
    df_npi_state
    .groupby(['state_name'] + policy_columns)
    .apply(lambda combo_rows: pd.to_datetime(combo_rows['date']).sort_values().iloc[0])
)

In [48]:
# Persist the combination start dates, keyed by the state / policy-level index.
all_comb_start.to_csv('state_policy_start_date.csv', index=True)

In [49]:
# Display the earliest date of each (state, policy-level combination) for inspection.
all_comb_start

state_name  SD  LD  NESC  PC  GS  CPV
California  0   0   0     0   0   0     2020-01-26
            1   0   0     0   0   0     2020-03-05
                              1   0     2020-03-11
                              3   0     2020-03-12
                          1   4   0     2020-03-13
                                           ...    
Washington  0   0   0     0   0   0     2020-01-21
                          1   0   0     2020-03-02
            1   0   0     1   0   0     2020-03-05
                              3   0     2020-03-11
                    1     1   5   2     2020-03-16
Length: 65, dtype: datetime64[ns]