In [1]:
import json
import pandas as pd

In [2]:
# Source: Brasil.IO COVID-19 case dataset.
#   dataset home page: https://brasil.io/dataset/covid19/caso
#   csv listed at:     https://data.brasil.io/dataset/covid19/_meta/list.html
CASO_FULL_URL = "https://data.brasil.io/dataset/covid19/caso_full.csv.gz"

# Download the full case table straight from the published gzip-compressed CSV.
data = pd.read_csv(CASO_FULL_URL)

# 1. Inspect the data

In [3]:
# Overview of the raw table: column names, non-null counts and dtypes.
# A column whose non-null count is below the number of entries has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11193 entries, 0 to 11192
Data columns (total 15 columns):
city                                             11193 non-null object
city_ibge_code                                   10906 non-null float64
date                                             11193 non-null object
estimated_population_2019                        10906 non-null float64
is_repeated                                      11193 non-null bool
is_last                                          11193 non-null bool
last_available_confirmed                         11193 non-null int64
last_available_confirmed_per_100k_inhabitants    10761 non-null float64
last_available_date                              11193 non-null object
last_available_death_rate                        1702 non-null float64
last_available_deaths                            10783 non-null float64
place_type                                       11193 non-null object
state                                   

In [4]:
# Overall totals of the daily increments: (new confirmed cases, new deaths).
tuple(data[column].sum() for column in ("new_confirmed", "new_deaths"))

(19592, 1045)

In [5]:
# Number of rows for each distinct value of `place_type`.
data.groupby("place_type").size()

place_type
city    11193
dtype: int64

In [6]:
# Number of rows flagged / not flagged as `is_repeated`.
data.groupby("is_repeated").size()

is_repeated
False    9787
True     1406
dtype: int64

In [7]:
# Rows labelled 0 and 1 (a `.loc` slice includes both ends), restricted to the
# columns that relate `date`/`is_repeated` to the `last_available_*` values.
data.loc[
    0:1,
    [
        "date",
        "city",
        "place_type",
        "is_repeated",
        "last_available_date",
        "last_available_confirmed",
        "last_available_deaths",
        "new_confirmed",
        "new_deaths",
    ],
]

Unnamed: 0,date,city,place_type,is_repeated,last_available_date,last_available_confirmed,last_available_deaths,new_confirmed,new_deaths
0,2020-02-25,São Paulo,city,False,2020-02-25,1,,0,0
1,2020-02-26,São Paulo,city,True,2020-02-25,1,,0,0


In [8]:
# First two rows with every column. Left as the cell's last expression so the
# notebook renders it as a table instead of a wrapped plain-text dump.
data.head(2)

        city  city_ibge_code        date  estimated_population_2019  \
0  São Paulo       3550308.0  2020-02-25                 12252023.0   
1  São Paulo       3550308.0  2020-02-26                 12252023.0   

   is_repeated  is_last  last_available_confirmed  \
0        False    False                         1   
1         True    False                         1   

   last_available_confirmed_per_100k_inhabitants last_available_date  \
0                                        0.00816          2020-02-25   
1                                        0.00816          2020-02-25   

   last_available_death_rate  last_available_deaths place_type state  \
0                        NaN                    NaN       city    SP   
1                        NaN                    NaN       city    SP   

   new_confirmed  new_deaths  
0              0           0  
1              0           0  


In [9]:
# New confirmed cases and new deaths summed over all rows of each date.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (`["a", "b"]`) is rejected by current pandas.
data.groupby(["date"])[["new_confirmed", "new_deaths"]].sum()

Unnamed: 0_level_0,new_confirmed,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-25,0,0
2020-02-26,0,0
2020-02-27,0,0
2020-02-28,1,0
2020-02-29,0,0
2020-03-01,0,0
2020-03-02,0,0
2020-03-03,0,0
2020-03-04,1,0
2020-03-05,4,0


In [10]:
# Per-date sums for the rows of state "CE": daily increments next to the
# `last_available_*` counts, to compare the two side by side.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple is rejected by current pandas.
data[data["state"] == "CE"].groupby(["date"])[
    ["new_deaths", "last_available_deaths", "new_confirmed", "last_available_confirmed"]
].sum()

Unnamed: 0_level_0,new_deaths,last_available_deaths,new_confirmed,last_available_confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-16,0,0.0,9,9
2020-03-17,0,0.0,1,10
2020-03-18,0,0.0,9,19
2020-03-19,0,0.0,1,20
2020-03-20,0,0.0,48,68
2020-03-21,0,0.0,16,84
2020-03-22,0,0.0,41,125
2020-03-23,0,0.0,39,164
2020-03-24,0,0.0,21,185
2020-03-25,0,0.0,26,211


In [11]:
# Rows where either daily increment is negative.
data.loc[
    (data[["new_deaths", "new_confirmed"]] < 0).any(axis=1),
    [
        "date",
        "city",
        "last_available_date",
        "last_available_confirmed",
        "last_available_deaths",
        "new_confirmed",
        "new_deaths",
    ],
]

Unnamed: 0,date,city,last_available_date,last_available_confirmed,last_available_deaths,new_confirmed,new_deaths
288,2020-03-18,Jaboatão dos Guararapes,2020-03-18,1,,-1,0
329,2020-03-18,São Paulo,2020-03-18,214,,58,-1
414,2020-03-19,Santo André,2020-03-19,2,,-4,0
417,2020-03-19,São Caetano do Sul,2020-03-19,1,,-5,0
465,2020-03-20,Uberlândia,2020-03-20,1,0.0,-1,0
568,2020-03-21,Importados/Indefinidos,2020-03-21,0,0.0,-1,0
577,2020-03-21,Juiz de Fora,2020-03-21,4,0.0,-1,0
668,2020-03-21,Importados/Indefinidos,2020-03-21,0,0.0,-8,0
686,2020-03-22,Importados/Indefinidos,2020-03-22,0,,-7,0
711,2020-03-22,Aparecida de Goiânia,2020-03-22,1,0.0,-1,0


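# 2. Fix manually

TODO: no manual corrections are applied in this notebook yet, so the rows with negative `new_confirmed` / `new_deaths` found above are carried into the summary unchanged.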

# 3. Create `.json` with summarized data

In [12]:
# Use the short names "deaths" / "confirmed" for the `last_available_*` counts.
data = data.rename(
    columns={
        "last_available_deaths": "deaths",
        "last_available_confirmed": "confirmed",
    }
)

In [13]:
# One row per (state, date) holding the sum of "deaths" and "confirmed" over
# all of that state's rows for the date.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple is rejected by current pandas.
data_summary = (
    data.groupby(["state", "date"])[["deaths", "confirmed"]]
    .sum()
    .reset_index()
    # "deaths" is float in the raw table (it contains missing values); after
    # summing, both counts are cast to integers.
    .astype({"deaths": int, "confirmed": int})
)

In [14]:
# Index the summary by date so that per-state selections carry their dates.
data_summary = data_summary.set_index("date")

In [15]:
# First three rows of the per-state summary.
data_summary.iloc[:3]

Unnamed: 0_level_0,state,deaths,confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-17,AC,0,3
2020-03-18,AC,0,3
2020-03-19,AC,0,4


In [16]:
# Last ten rows of the per-state summary.
data_summary.iloc[-10:]

Unnamed: 0_level_0,state,deaths,confirmed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-02,TO,0,12
2020-04-03,TO,0,13
2020-04-04,TO,0,16
2020-04-05,TO,0,17
2020-04-06,TO,0,19
2020-04-07,TO,0,19
2020-04-08,TO,0,23
2020-04-09,TO,0,23
2020-04-10,TO,0,23
2020-04-11,TO,0,23


In [17]:
# Confirmed counts for state "CE" as a plain array, in row order.
data_summary[data_summary["state"] == "CE"]["confirmed"].values

array([   9,   10,   19,   20,   68,   84,  125,  164,  185,  211,  237,
        282,  322,  359,  382,  401,  445,  564,  658,  746,  964, 1047,
       1190, 1190, 1190, 1190, 1190])

In [18]:
# Dates (index labels) that accompany the "CE" confirmed counts.
data_summary[data_summary["state"] == "CE"]["confirmed"].index

Index(['2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
       '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25',
       '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30',
       '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04',
       '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09',
       '2020-04-10', '2020-04-11'],
      dtype='object', name='date')

In [19]:
# Dates (index labels) that accompany the "CE" death counts.
data_summary[data_summary["state"] == "CE"]["deaths"].index

Index(['2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
       '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25',
       '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30',
       '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04',
       '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09',
       '2020-04-10', '2020-04-11'],
      dtype='object', name='date')

In [20]:
# (state name, two-letter state code, region code) for each of the 27 entries.
_STATES = (
    ("Acre", "AC", "N"),
    ("Alagoas", "AL", "NE"),
    ("Amazonas", "AM", "N"),
    ("Amapá", "AP", "N"),
    ("Bahia", "BA", "NE"),
    ("Ceará", "CE", "NE"),
    ("Distrito Federal", "DF", "CO"),
    ("Espírito Santo", "ES", "SE"),
    ("Goiás", "GO", "CO"),
    ("Maranhão", "MA", "NE"),
    ("Minas Gerais", "MG", "SE"),
    ("Mato Grosso do Sul", "MS", "CO"),
    ("Mato Grosso", "MT", "CO"),
    ("Pará", "PA", "N"),
    ("Paraíba", "PB", "NE"),
    ("Pernambuco", "PE", "NE"),
    ("Piauí", "PI", "NE"),
    ("Paraná", "PR", "S"),
    ("Rio de Janeiro", "RJ", "SE"),
    ("Rio Grande do Norte", "RN", "NE"),
    ("Rondônia", "RO", "N"),
    ("Roraima", "RR", "N"),
    ("Rio Grande do Sul", "RS", "S"),
    ("Santa Catarina", "SC", "S"),
    ("Sergipe", "SE", "NE"),
    ("São Paulo", "SP", "SE"),
    ("Tocantins", "TO", "N"),
)

# Skeleton of the exported JSON: one record per state, each with its own empty
# "confirmed" and "deaths" lists.
output = [
    {
        "state_name": state_name,
        "state_code": state_code,
        "region": region,
        "confirmed": [],
        "deaths": [],
    }
    for state_name, state_code, region in _STATES
]

In [21]:
# Fill each state's "confirmed" and "deaths" lists with [date, count] pairs.
for item in output:
    code = item["state_code"]

    # Filter the summary once per state; both columns share the same date
    # index, so there is no need to rebuild the mask for each of them.
    state_rows = data_summary[data_summary["state"] == code]

    for column in ("confirmed", "deaths"):
        # int() turns the numpy integers into plain Python ints, which the
        # json module can serialize.
        item[column] = [
            [date, int(value)]
            for date, value in zip(state_rows.index, state_rows[column].values)
        ]

In [22]:
# Print the filled-in records to eyeball the [date, count] pairs.
print(output)

[{'state_name': 'Acre', 'state_code': 'AC', 'region': 'N', 'confirmed': [['2020-03-17', 3], ['2020-03-18', 3], ['2020-03-19', 4], ['2020-03-20', 7], ['2020-03-21', 11], ['2020-03-22', 11], ['2020-03-23', 17], ['2020-03-24', 21], ['2020-03-25', 23], ['2020-03-26', 23], ['2020-03-27', 25], ['2020-03-28', 25], ['2020-03-29', 34], ['2020-03-30', 41], ['2020-03-31', 42], ['2020-04-01', 43], ['2020-04-02', 45], ['2020-04-03', 46], ['2020-04-04', 46], ['2020-04-05', 48], ['2020-04-06', 50], ['2020-04-07', 50], ['2020-04-08', 58], ['2020-04-09', 62], ['2020-04-10', 70], ['2020-04-11', 70]], 'deaths': [['2020-03-17', 0], ['2020-03-18', 0], ['2020-03-19', 0], ['2020-03-20', 0], ['2020-03-21', 0], ['2020-03-22', 0], ['2020-03-23', 0], ['2020-03-24', 0], ['2020-03-25', 0], ['2020-03-26', 0], ['2020-03-27', 0], ['2020-03-28', 0], ['2020-03-29', 0], ['2020-03-30', 0], ['2020-03-31', 0], ['2020-04-01', 0], ['2020-04-02', 0], ['2020-04-03', 0], ['2020-04-04', 0], ['2020-04-05', 0], ['2020-04-06', 1], 

In [23]:
# Serialize the per-state records and write them to state_summary.json.
serialized = json.dumps(output)
with open('state_summary.json', 'w') as out_file:
    out_file.write(serialized)