# DATA PREPARATION

### IMPORT LIBRARIES

In [6]:
import pandas as pd
import numpy as np
import psycopg2 as pg
import matplotlib.pyplot as plt

### POSTGRES CONNECTION

In [7]:
# Open the PostgreSQL connection used by every query in this notebook.
# The password is deliberately NOT written here: when no password is passed,
# libpq (the client library behind psycopg2) reads it from the PGPASSWORD
# environment variable or from the ~/.pgpass password file.
conn = pg.connect(dbname="postgres", user="postgres")

### IMPORT DATA

In [8]:
## Doses
# Doses summed per (date, state), restricted to rows whose vaccine_dose is
# 'D1' or 'D_unica' and whose state is neither 'XX' nor empty.
# NOTE(review): the excluded age groups are '0-4', '5-9' and '15-19';
# '10-14' is not in the list - confirm this is intentional.
doses_sql = '''
select 
date,
state, 
sum(doses) as doses 
from view_vacc
where (vaccine_dose = 'D1' or vaccine_dose = 'D_unica') 
and state!= 'XX' and state != '' and age_group not in ('0-4','5-9','15-19')
group by date, state
'''
sql_query_doses = pd.read_sql_query(sql=doses_sql, con=conn).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

## Deaths
# Every column of view_deaths_old; the 'date' column is parsed to datetimes
# so it can be used as a merge key together with 'state' later on.
deaths_sql = '''
select * from view_deaths_old
'''
sql_query_deaths = pd.read_sql_query(sql=deaths_sql, con=conn).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

## Population
# Population per state, obtained by summing the rows of population_city_2020.
population_sql = '''
select 
state,
sum(population) as pop from population_city_2020  
group by state
'''
sql_query_pop = pd.read_sql_query(sql=population_sql, con=conn)

## Oxford Measures
# Stringency index per (state, date), keeping only rows with
# jurisdiction = 'STATE_GOV' and non-null state and index values.
stringency_sql = '''
select 
state,
date, 
stringency_index 
from oxford_measures
where state is not null and stringency_index is not null and jurisdiction = 'STATE_GOV'
order by state,date
'''
sql_query_str_index = pd.read_sql_query(sql=stringency_sql, con=conn).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

## Google Mobility Report
# Mobility indicators for Brazil, keeping rows that have a state but no
# sub_region; 'date' is parsed to datetimes for the later merge.
mobility_sql = '''
select 
date,
state, 
retail_and_recreation,
grocery_and_pharmacy,
parks,
workplaces,
residential
from google_mobility_report
where country = 'Brazil' and
sub_region is null and
state is not null
'''
sql_query_goog_mob = pd.read_sql_query(sql=mobility_sql, con=conn).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
# Names of the five mobility indicator columns; reused further down when the
# 14-row rolling means are computed.
goog_cols = ['retail_and_recreation','grocery_and_pharmacy','parks','workplaces','residential']
# Cast the indicators to float by column NAME. The previous positional slice
# iloc[:, 3:8] started one column too late: with the select order
# (date, state, retail_and_recreation, ...) it skipped 'retail_and_recreation'
# at position 2. astype(float) also replaces the deprecated applymap(float).
sql_query_goog_mob[goog_cols] = sql_query_goog_mob[goog_cols].astype(float)

## Socioeconomic
# One row per state with hdi_2010 (renamed to 'hdi') and demographic_density.
socioeconomic_sql = '''
select 
state,
hdi_2010 as hdi,
demographic_density 
from socioeconomic_states
'''
sql_query_socioeconomic = pd.read_sql_query(sql=socioeconomic_sql, con=conn)

## States
# The 'state' column of brazil_states; used later to build the state x date grid.
states_sql = '''
select state from brazil_states
'''
states = pd.read_sql_query(sql=states_sql, con=conn)

## States Dictionary
# Maps each full state name to its two-letter abbreviation.
dict_states_socioeconomic = dict([
    ('Acre', 'AC'),
    ('Amazonas', 'AM'),
    ('Alagoas', 'AL'),
    ('Amapá', 'AP'),
    ('Bahia', 'BA'),
    ('Ceará', 'CE'),
    ('Distrito Federal', 'DF'),
    ('Espírito Santo', 'ES'),
    ('Goiás', 'GO'),
    ('Maranhão', 'MA'),
    ('Mato Grosso', 'MT'),
    ('Mato Grosso do Sul', 'MS'),
    ('Minas Gerais', 'MG'),
    ('Pará', 'PA'),
    ('Paraíba', 'PB'),
    ('Paraná', 'PR'),
    ('Pernambuco', 'PE'),
    ('Piauí', 'PI'),
    ('Rio de Janeiro', 'RJ'),
    ('Rio Grande do Norte', 'RN'),
    ('Rio Grande do Sul', 'RS'),
    ('Rondônia', 'RO'),
    ('Roraima', 'RR'),
    ('Santa Catarina', 'SC'),
    ('São Paulo', 'SP'),
    ('Sergipe', 'SE'),
    ('Tocantins', 'TO'),
])
# Replace the full state names with their two-letter abbreviations; names that
# are missing from dict_states_socioeconomic become NaN (Series.map semantics).
full_state_names = sql_query_socioeconomic['state']
sql_query_socioeconomic['state'] = full_state_names.map(dict_states_socioeconomic)


# Cluster label (0, 1 or 2) assigned to each state abbreviation.
# NOTE(review): the labels are hard-coded; presumably they come from a
# clustering analysis done elsewhere - confirm the source.
_states_by_cluster = {
    0: ['AL', 'BA', 'CE', 'DF', 'ES', 'GO', 'MG', 'MT', 'PB', 'PE',
        'PI', 'PR', 'RJ', 'RN', 'RO', 'RS', 'SC', 'SE', 'TO'],
    1: ['AC', 'AM', 'AP', 'MA', 'PA', 'RR'],
    2: ['MS', 'SP'],
}
# Flatten to {abbreviation: label}, with keys in alphabetical order.
cluster_state = dict(sorted(
    (abbreviation, label)
    for label, abbreviations in _states_by_cluster.items()
    for abbreviation in abbreviations
))


### JOIN DATA

In [9]:
# Calendar with one row per day of 2021.
datas = pd.DataFrame({'date': pd.date_range(start='2021-01-01', end='2021-12-31')})

# Base grid with one row per (state, date): the state list is repeated once per
# day and every date is repeated once per state.
n_days, n_states = len(datas['date']), len(states['state'])
df = pd.DataFrame({
    'state': np.tile(states['state'].to_numpy(), n_days),
    'date': np.repeat(datas['date'].to_numpy(), n_states),
})

# Left-join every source onto the grid so that all (state, date) rows survive.
# Daily tables are keyed on date + state, static tables on state only.
daily_keys = ['date', 'state']
static_keys = ['state']
sources_in_merge_order = [
    (sql_query_doses, daily_keys),
    (sql_query_deaths, daily_keys),
    (sql_query_pop, static_keys),
    (sql_query_str_index, daily_keys),
    # NOTE(review): the original author flagged the mobility merge below as a
    # problem ("Problema!") without giving details - confirm what was meant.
    (sql_query_goog_mob, daily_keys),
    (sql_query_socioeconomic, static_keys),
]
for source, keys in sources_in_merge_order:
    df = df.merge(source, how='left', on=keys)

# Cluster label per row; a state missing from cluster_state raises KeyError.
df['cluster'] = [cluster_state[abbreviation] for abbreviation in df['state']]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=['state', 'date']).reset_index(drop=True)

# Days without a matching doses/deaths row count as zero.
count_cols = ['doses', 'deaths']
df[count_cols] = df[count_cols].fillna(0)
df


Unnamed: 0,state,date,doses,deaths,pop,stringency_index,retail_and_recreation,grocery_and_pharmacy,parks,workplaces,residential,hdi,demographic_density,cluster
0,AC,2021-01-01,0.0,2.0,894470,65.74,-60.0,-39.0,11.0,-63.0,16.0,0.663,4.5,1
1,AC,2021-01-02,0.0,4.0,894470,65.74,-25.0,19.0,12.0,-1.0,11.0,0.663,4.5,1
2,AC,2021-01-03,0.0,3.0,894470,65.74,-26.0,13.0,15.0,5.0,10.0,0.663,4.5,1
3,AC,2021-01-04,0.0,1.0,894470,65.74,-11.0,33.0,4.0,-3.0,9.0,0.663,4.5,1
4,AC,2021-01-05,0.0,6.0,894470,65.74,-9.0,31.0,-2.0,-2.0,9.0,0.663,4.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9850,TO,2021-12-27,265.0,0.0,1590248,19.44,21.0,55.0,92.0,-7.0,2.0,0.699,5.0,0
9851,TO,2021-12-28,332.0,0.0,1590248,19.44,11.0,43.0,39.0,-8.0,4.0,0.699,5.0,0
9852,TO,2021-12-29,379.0,0.0,1590248,19.44,24.0,57.0,75.0,-5.0,3.0,0.699,5.0,0
9853,TO,2021-12-30,244.0,0.0,1590248,19.44,30.0,65.0,87.0,-7.0,2.0,0.699,5.0,0


### MISSING VALUES ADJUSTMENTS

In [10]:
# Fill gaps in the stringency index separately for each state.
# `df` stacks every state in a single column (sorted by state, then date), so
# interpolating the whole column at once would blend the last value of one
# state into the missing days of the next one.
# The result is assigned back to the column instead of calling
# interpolate(inplace=True) on the df['stringency_index'] selection, which does
# not update `df` when pandas Copy-on-Write is enabled.
# Missing values at the very start of a state's series are left as NaN.
df['stringency_index'] = (
    df.groupby('state')['stringency_index']
      .transform(lambda state_series: state_series.interpolate())
)

### DATA MANAGEMENT

In [11]:
def _build_state_panel(state_rows):
    """Return the date-indexed feature table for the rows of a single state.

    The input rows are re-indexed by 'date', the 'state' column is dropped and
    the derived columns are appended in a fixed order: per-100k deaths and
    doses with their 7- and 14-row rolling means, cumulative doses divided by
    population, and 14-row rolling means of the stringency index and of every
    column listed in goog_cols.
    """
    panel = state_rows.set_index('date', drop=True).drop('state', axis=1)

    # Deaths
    panel['deaths_100k'] = 100000 * panel['deaths'] / panel['pop']
    for window, target in ((7, 'deaths_100k_mm7'), (14, 'deaths_100k_mm14')):
        panel[target] = panel['deaths_100k'].rolling(window).mean()

    # Doses: cumulative doses over population, then the per-100k series
    panel['cob_vacinal'] = panel['doses'].cumsum() / panel['pop']
    panel['doses_100k'] = 100000 * panel['doses'] / panel['pop']
    for window, target in ((7, 'doses_100k_mm7'), (14, 'doses_100k_mm14')):
        panel[target] = panel['doses_100k'].rolling(window).mean()

    # Stringency index first, then the Google mobility indicators
    for source in ['stringency_index'] + list(goog_cols):
        panel[source + '_mm14'] = panel[source].rolling(14).mean()

    return panel


# One feature table per state, keyed by state in order of first appearance in df.
df_dict_state = {
    state_code: _build_state_panel(df[df['state'] == state_code])
    for state_code in df['state'].unique()
}

### EXPORT DATA

In [12]:
# Write one CSV per state into the data_preparation_csv directory (the
# directory must already exist).
# A forward slash is used as the separator: it works on Windows and POSIX
# alike, whereas the previous '\%' was an invalid escape sequence and, on
# POSIX, produced file names containing a literal backslash.
for state in states['state']:
    df_dict_state[state].to_csv('data_preparation_csv/%s.csv' % state)