In [None]:
import pandas as pd
import numpy as np
import re   #for additional splitting methods. Used to remove the numbering on cities names
from simpledbf import Dbf5    #used to extract cities' positions from a dbf table at https://www.ibge.gov.br/geociencias/organizacao-do-territorio/estrutura-territorial/27385-localidades.html?=&t=acesso-ao-produto
import os
import glob

import matplotlib.pyplot as plt
import itertools

# --- Dengue, Zika, Influenza ---

In [None]:
# Notebook configuration: the disease to process, the year of the data and the
# folder holding the raw input files for that disease.
disease = 'zika'
year = 2018

#path = 'C:/Users/your_path/' + disease + '/'   # template for other machines
data_root = 'E:/Doutorado/Trabalhos/Projeto-dengue/data/'
path = data_root + disease + '/raw/'

## Data for each city

### If using dengue or zika data:

In [None]:
# Rows of the raw file that are skipped when it is read: rows 0, 1 and 2, plus
# every row from 1143 up to 4999.
# Change the 1143 according to the file provided in "[Dengue/Zika] Line skips" below.
lines_to_skip = [0, 1, 2] + list(range(1143, 5000))

In [None]:
# Load the weekly notification table for the configured disease and year.
# The file is ';'-separated and latin-1 encoded; the rows listed in
# `lines_to_skip` are ignored.
# The file name is derived from `disease`, so the same cell serves both dengue
# and zika (it used to be hard-coded to 'SINAN_zika_').
df = pd.read_csv(path + 'SINAN_' + disease + '_' + str(year) + '.csv',
                 sep=';', encoding='latin1', skiprows=lines_to_skip)

# Drop the columns that are not part of the 52 weekly counts. Which of them
# exist depends on the year of the file, so only the ones present are removed:
#   'Em Branco/ign', 'Total', 'Semana 53' -> 2014
#   'Em Branco/ign', 'Total'              -> 2015, 2020, 2021
#   'Total'                               -> 2016, 2017, 2018, 2019, 2022
non_week_columns = ['Em Branco/ign', 'Total', 'Semana 53']
df = df.drop(columns=[col for col in non_week_columns if col in df.columns])

In [None]:
# Reworking city names to move the unique codes to another column.
# Each entry of the city column is split into its numeric code and its name.

renamed = []
city_codes = []
names = list(df['Município de notificação'])

for entry in names:
  # Raw string: '\d' in a plain string literal is an invalid escape sequence.
  # maxsplit=1: split only at the first run of digits (the code), so a name
  # that contains digits further on is kept whole.
  swp = re.split(r'(\d+)', entry, maxsplit=1)  # -> [text before, code, name]
  city_codes.append(int(swp[1]))   # save each city code for later use
  renamed.append(swp[2].strip())   # name without the surrounding whitespace

# Drop the original column and replace it with the cleaned names (the column
# ends up at the end of the frame), then append the codes as `cod_city`.
n = df.columns[0]   # label of the first column (the city list)
df = df.drop(columns=n)
df[n] = renamed
df['cod_city'] = city_codes

In [None]:
# Move the last two columns of the frame (the city names and `cod_city`
# appended by the previous cell) to the beginning, in a single step.
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols].copy()

# Replace values listed as "-" with NaN, excluding the cities column (some
# have "-" in their names).
# - The result is assigned back to the column: calling
#   `df[col].replace(..., inplace=True)` modifies a temporary selection and is
#   not guaranteed to update `df`.
# - `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
for col in df.columns:
  if col != 'Município de notificação':
    df[col] = df[col].replace({'-': np.nan}, regex=True)

# Show the first rows of the resulting dataframe.
df.head()

### If using influenza data:

Original download from https://opendatasus.saude.gov.br/dataset/srag-2013-2018

In [None]:
# Read the raw influenza file of the configured year
# (';'-separated, read with the 'latin' encoding).
influenza_csv = path + disease + str(year) + '.csv'
df = pd.read_csv(influenza_csv, sep=';', encoding='latin')

  df = pd.read_csv(str(path + disease + str(year) + '.csv'), delimiter=';', encoding='latin')


In [None]:
# Build the weekly case count of every municipality from the individual
# notifications (one row per notification).

# Parse the notification date and take BOTH the year and the week from the ISO
# calendar. Combining the calendar year with the ISO week number (as before)
# placed dates such as 2018-12-31 in "week 1 of 2018", mixing them with the
# first week of January. Requires pandas >= 1.1 (`Series.dt.isocalendar`).
df['DT_NOTIFIC'] = pd.to_datetime(df['DT_NOTIFIC'], format="%d/%m/%Y")
iso = df['DT_NOTIFIC'].dt.isocalendar()

# Somehow there is more than one year in this dataframe: keep only the rows of
# the configured year. Rows without a notification date are dropped as well.
in_year = (iso['year'] == int(year)).fillna(False).astype(bool)
df = df.loc[in_year].copy()
df['Year'] = iso.loc[in_year, 'year'].astype(int)
df['Week'] = iso.loc[in_year, 'week'].astype(int)

# Every row is one notification, so the size of each (week, municipality)
# group is its number of cases. This replaces overwriting CS_SEXO with 1 and
# summing it, which relied on chained assignment and only counted the rows
# whose CS_SEXO was 'M', 'F' or 'I'.
df['CONTAGEM'] = df.groupby(by=['Week', 'ID_MUNICIP'])['DT_NOTIFIC'].transform('size')

df = df.sort_values(by=['DT_NOTIFIC'])
df = df[['DT_NOTIFIC', 'ID_MUNICIP', 'Year', 'Week', 'CONTAGEM']]

In [None]:
# Display the influenza dataframe produced by the previous cell.
df

Unnamed: 0,DT_NOTIFIC,ID_MUNICIP,Year,Week,CONTAGEM
25394,2018-01-01,355030,2018,1,7
33623,2018-01-01,410830,2018,1,6
36610,2018-01-01,410830,2018,1,6
30500,2018-01-02,355630,2018,1,1
13747,2018-01-02,320530,2018,1,1
...,...,...,...,...,...
38176,2018-12-31,420540,2018,1,2
42823,2018-12-31,431680,2018,1,1
44132,2018-12-31,500270,2018,1,5
29038,2018-12-31,355030,2018,1,7


## Adding geographic distances

**Important**: Sadly, you can't mix the city codes from the IBGE dataset with those from SINAN. The former are seven-digit codes (the latter have six digits) that do not overlap with the latter.

Proof:



```
test = []

for number in list(df_loc['MUN_ORIGEM'].unique()):
  operationTwo = number
  test.append( operationTwo // 10 )

len(set(test) & set(df['cod_city'].unique())) #3710 vs 3774
```



In [None]:
# Read the IBGE localities table (a .dbf file) into the dataframe `df_loc`.
# NOTE(review): this rebinds the global `path` to the IBGE folder, so the
# cells below that use `path` (e.g. the one saving the results) point to this
# folder - confirm this is intended.
#path = 'C:/Users/your_path/'   # template for other machines
path = 'E:/Doutorado/Trabalhos/Projeto-dengue/data/IBGE/'

dbf = Dbf5(path + 'BR_Localidades_2010_v1.dbf', codec='latin')
df_loc = dbf.to_dataframe()

In [None]:
# Convert the full-length UF names stored in `NM_UF` to their two-letter codes.
# -> for some reason, the original dataframe, df, does not have data for Espirito Santo
# NOTE(review): names and codes are paired by position, i.e. this assumes that
# `unique()` lists the UFs in the same order as `uf_code` - confirm.
uf_code = [
    'RO', 'AC', 'AM', 'RR', 'PA', 'AP', 'TO',
    'MA', 'PI', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE', 'BA',
    'MG', 'ES', 'RJ', 'SP',
    'PR', 'SC', 'RS',
    'MS', 'MT', 'GO', 'DF',
]
uf_names = list(df_loc['NM_UF'].unique())

for position, code in enumerate(uf_code):
    df_loc['NM_UF'] = df_loc['NM_UF'].replace(uf_names[position], code)

In [None]:
# Remove accents from every object-dtype (text) column of `df_loc`.
def _strip_accents(column):
    """Return `column` NFKD-normalized with every non-ASCII character dropped."""
    decomposed = column.str.normalize('NFKD')
    ascii_only = decomposed.str.encode('ascii', errors='ignore')
    return ascii_only.str.decode('utf-8')

cols = df_loc.select_dtypes(include=[object]).columns
df_loc[cols] = df_loc[cols].apply(_strip_accents)

In [None]:
# Keep only the localities whose category is 'CIDADE' and derive the numeric
# `ID_MUNICIP` from `CD_GEOCODM` by dropping its last character.
# `.copy()` makes the filtered frame independent of the original one, so that
# adding a column to it does not trigger pandas' SettingWithCopyWarning.
df_loc = df_loc.loc[df_loc['NM_CATEGOR'] == 'CIDADE'].copy()
df_loc['ID_MUNICIP'] = pd.to_numeric(df_loc['CD_GEOCODM'].str[:-1])

df_loc = df_loc.sort_values(by=['ID_MUNICIP'])

### Run for Influenza

In [None]:
# Attach to every influenza row the coordinates of its municipality (taken
# from `df_loc`) and drop the rows of municipalities missing from `df_loc`.
#
# The coordinates are looked up row by row. Building one list grouped by city
# and assigning it positionally (as before) only lines up with the rows when
# `df` is itself grouped by city, which is not the case after it has been
# sorted by notification date.

# First LAT/LONG listed in `df_loc` for each municipality code.
coords_by_city = {}
for code, lat, lon in zip(df_loc['ID_MUNICIP'], df_loc['LAT'], df_loc['LONG']):
  coords_by_city.setdefault(code, (lat, lon))

# True for the rows whose municipality has coordinates.
has_coords = np.array([code in coords_by_city for code in df['ID_MUNICIP']], dtype=bool)

# Municipalities without coordinates (kept for inspection) and their removal.
city_to_remove = list(df.loc[~has_coords, 'ID_MUNICIP'].unique())
df = df.loc[has_coords].copy()

#Add the positions found (x_coord holds LAT and y_coord holds LONG)
df['x_coord'] = [coords_by_city[code][0] for code in df['ID_MUNICIP']]
df['y_coord'] = [coords_by_city[code][1] for code in df['ID_MUNICIP']]

df = df[['ID_MUNICIP', 'Week', 'x_coord', 'y_coord', 'CONTAGEM']]

In [None]:
# Expand the data to one entry per (city, week) for weeks 1..52, using 0 cases
# for the weeks in which a city has no rows. The result is left in the `swp_*`
# lists (one element per city and week, cities in order of first appearance).
weeks_full_year = list(range(1,53))
n_weeks = len(weeks_full_year)

# Cases of every (city, week) pair. Looking the value up by week number keeps
# it correct when a pair appears in several rows or when the rows are not in
# week order; indexing the city's rows with a running counter (as before)
# picked the wrong row in both situations.
cases_by_city_week = dict(zip(zip(df['ID_MUNICIP'], df['Week']), df['CONTAGEM']))

# Coordinates of the first row of every city.
city_coords = {}
for city, x, y in zip(df['ID_MUNICIP'], df['x_coord'], df['y_coord']):
  city_coords.setdefault(city, (x, y))

swp_id = []
swp_xcoord = []
swp_ycoord = []
swp_contagem = []

for city, (x, y) in city_coords.items():
  swp_id.extend([city] * n_weeks)
  swp_xcoord.extend([x] * n_weeks)
  swp_ycoord.extend([y] * n_weeks)
  swp_contagem.extend(cases_by_city_week.get((city, week), 0) for week in weeks_full_year)

# `year` is configured with four digits, so it is used as is: prefixing it
# with '20' turned 2018 into 202018. Two-digit years are still accepted.
full_year = int(year)
if full_year < 100:
  full_year += 2000
swp_year = np.repeat(full_year, len(swp_contagem))

In [None]:
# Assemble the expanded influenza table: one row per city and week.
n_cities = len(df['ID_MUNICIP'].unique())

data = {
    'Year': swp_year,
    'cod_city': swp_id,
    'week': np.tile(weeks_full_year, n_cities),   # 1..52 repeated for every city
    'x_coord': swp_xcoord,
    'y_coord': swp_ycoord,
    'cases': swp_contagem,
}

df_final = pd.DataFrame(data)

### Run for Dengue/Zika

In [None]:
# Look up in `df_loc` the coordinates of every city code present in `df`.
# x_coord / y_coord receive the LAT / LONG of each city found, repeated once
# per row that the city has in `df`; codes without a match in `df_loc` are
# collected in `city_to_remove`.
# NOTE(review): the lists are ordered city by city (order of `unique()`), so
# assigning them to `df` by position is only right if the rows of `df` follow
# that same order - confirm each city has a single row.
x_coord = []
y_coord = []
city_to_remove = []

for city in df['cod_city'].unique():
  matches = df_loc[df_loc['ID_MUNICIP'] == city]
  if matches.empty:
    city_to_remove.append(city)
    continue
  n_rows = int((df['cod_city'] == city).sum())
  x_coord.extend(np.repeat(matches['LAT'].values[0], n_rows))
  y_coord.extend(np.repeat(matches['LONG'].values[0], n_rows))

In [None]:
# Drop the rows of the cities listed in `city_to_remove`.
codes_to_drop = set(city_to_remove)
keep_row = np.array([code not in codes_to_drop for code in df['cod_city']], dtype=bool)
df = df[keep_row]

In [None]:
#rename weeks to an easier to access format: w_1, w_2, ... (in column order).
#-> The other variables (and those from the INMET dataframes) will be translated in another notebook.
#
# Every column that is not an identifier or coordinate column is treated as a
# week column. Selecting the week columns by object dtype (as before) skipped
# any week column that had been parsed as numeric, which shifted the numbering
# of all the following weeks.
not_week_columns = ['Município de notificação', 'cod_city', 'x_coord', 'y_coord']
cols = [col for col in df.columns if col not in not_week_columns]

# Single rename that returns a new frame, instead of one in-place rename per column.
df = df.rename(columns={old: 'w_' + str(position + 1) for position, old in enumerate(cols)})

In [None]:
#Add the positions found in the previous cell
df['x_coord'] = x_coord
df['y_coord'] = y_coord

df = df[['cod_city', 'x_coord', 'y_coord', 'w_1', 'w_2', 'w_3', 'w_4', 'w_5', 'w_6',
       'w_7', 'w_8', 'w_9', 'w_10', 'w_11', 'w_12', 'w_13', 'w_14', 'w_15',
       'w_16', 'w_17', 'w_18', 'w_19', 'w_20', 'w_21', 'w_22', 'w_23', 'w_24',
       'w_25', 'w_26', 'w_27', 'w_28', 'w_29', 'w_30', 'w_31', 'w_32', 'w_33',
       'w_34', 'w_35', 'w_36', 'w_37', 'w_38', 'w_39', 'w_40', 'w_41', 'w_42',
       'w_43', 'w_44', 'w_45', 'w_46', 'w_47', 'w_48', 'w_49', 'w_50', 'w_51',
       'w_52']]

## Saving results

In [None]:
#Done! Let's save the resulting data.
df.to_csv(str(path + '/SINAN_' + disease + 'weekly_' + str(year) + '.csv'))

## [Dengue/Zika] Line skips

### Dengue

In [None]:
# Reference table for dengue. For every year it gives the first row of the
# final range of rows to skip; the rows to skip are always 0, 1, 2 plus that
# row up to 4999. Use the entry of the year being processed in the cell that
# defines `lines_to_skip` at the top of the notebook.
dengue_skip_from = {
    2007: 3559,
    2008: 3363,
    2009: 3041,
    2010: 3939,
    2011: 3841,
    2012: 3567,
    2013: 4221,
    2014: 3694,
    2015: 4365,
    2016: 4550,
    2017: 3268,
    2018: 3168,
    2019: 4339,
    2020: 4080,
    2021: 3778,
}

# Running this cell leaves `lines_to_skip` set to the last year listed (2021).
lines_to_skip = [0, 1, 2] + list(range(dengue_skip_from[2021], 5000))

### Zika

In [None]:
# Reference table for zika. For every year it gives the first row of the
# final range of rows to skip; the rows to skip are always 0, 1, 2 plus that
# row up to 4999. Use the entry of the year being processed in the cell that
# defines `lines_to_skip` at the top of the notebook.
zika_skip_from = {
    2016: 2375,
    2017: 1258,
    2018: 1143,
    2019: 1463,
    2020: 1094,
    2021: 941,
    2022: 1159,
    2023: 1057,
}

# Running this cell leaves `lines_to_skip` set to the last year listed (2023).
lines_to_skip = [0, 1, 2] + list(range(zika_skip_from[2023], 5000))

# --- COVID19 ----

In [None]:
# Load every ';'-separated CSV file found in `path` and stack them, one below
# the other, into `main_dataframe`.
path = "C:/Users/your_path/"

# Sorted so that the order of the rows does not depend on the order in which
# the file system happens to list the files.
file_list = sorted(os.path.normpath(i) for i in glob.glob(path + "/*.csv"))
if not file_list:
    # Fail with an explicit message instead of an IndexError on file_list[0].
    raise FileNotFoundError("No CSV files found in " + path)

# Read all the files first and concatenate once: concatenating inside the loop
# copies the accumulated frame again on every iteration. This also stops the
# loop from overwriting the global `df` used by the other sections.
main_dataframe = pd.concat(
    [pd.read_csv(file_name, delimiter=';') for file_name in file_list],
    axis=0,
)

In [None]:
# List the columns available in `main_dataframe`.
main_dataframe.columns

Index(['regiao', 'estado', 'municipio', 'coduf', 'codmun', 'codRegiaoSaude',
       'nomeRegiaoSaude', 'data', 'semanaEpi', 'populacaoTCU2019',
       'casosAcumulado', 'casosNovos', 'obitosAcumulado', 'obitosNovos',
       'Recuperadosnovos', 'emAcompanhamentoNovos', 'interior/metropolitana'],
      dtype='object')

In [None]:
# Keep only the columns used below, drop the rows with a missing 'estado',
# 'municipio' or 'codmun', and index the frame by the parsed 'data' column.
kept_columns = ['estado', 'municipio', 'codmun', 'data',
                'semanaEpi', 'casosAcumulado', 'casosNovos']

main_dataframe = (
    main_dataframe[kept_columns]
    .dropna(subset=['estado', 'municipio', 'codmun'])
    .set_index('data')
)
main_dataframe.index = pd.to_datetime(main_dataframe.index)

In [None]:
# Aggregate the records into weekly sums for every 'codmun'.
# The columns from position 4 onwards are summed one at a time (with the
# column selection made in the previous cell these are 'casosAcumulado' and
# 'casosNovos') and the results are joined side by side.
# NOTE(review): 'casosAcumulado' is presumably a running total; its weekly sum
# may not be the quantity wanted - confirm.
data_to_concatenate = [
    main_dataframe.groupby('codmun').resample('1W')[column].sum().to_frame()
    for column in main_dataframe.columns[4:]
]

res = pd.concat(data_to_concatenate, axis=1).reset_index(level=[0, 1])

# Year of each weekly bin and its week number ("%U" format) as a number.
res['year'] = pd.DatetimeIndex(res['data']).year
res['week'] = pd.to_numeric(res['data'].dt.strftime("%U"))

In [None]:
# Rename 'codmun' to 'cod_city', keep the output columns and save the result.
# `rename` returns a new frame, so its result has to be assigned: without the
# assignment the column keeps its old name and the selection of 'cod_city'
# below raises a KeyError.
res = res.rename(columns={'codmun': 'cod_city'})
res = res[['cod_city', 'year', 'week', 'data', 'casosAcumulado', 'casosNovos']].copy()

# Remove negative values. There seems to be a bug in the original data which needs to be fixed.
# The masked column is assigned back instead of calling `mask(..., inplace=True)`
# on `res['casosNovos']`, which modifies a temporary selection and is not
# guaranteed to update `res`.
res['casosNovos'] = res['casosNovos'].mask(res['casosNovos'] < 0, 0)

res.to_csv(path + 'covid_2020_2023.csv')