In [1]:
import pandas as pd

In [2]:
# Seleccionando el dataframe de casos y muertes por millón, por país y por fecha

countries = pd.read_csv("Proyecto/countries.csv")

In [3]:
countries.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,24/02/2020,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,25/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,26/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,27/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,28/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [4]:
# Seleccionando las columnas de interés

paises = pd.DataFrame(countries.loc[:, ["continent",
                       "location",
                       "date",
                       'total_cases_per_million', 
                       "total_deaths_per_million"]])

In [5]:
paises.head()

Unnamed: 0,continent,location,date,total_cases_per_million,total_deaths_per_million
0,Asia,Afghanistan,24/02/2020,0.126,
1,Asia,Afghanistan,25/02/2020,0.126,
2,Asia,Afghanistan,26/02/2020,0.126,
3,Asia,Afghanistan,27/02/2020,0.126,
4,Asia,Afghanistan,28/02/2020,0.126,


In [6]:
total_deaths_permillion2 = paises[paises["total_deaths_per_million"].notna()]
total_deaths_permillion2

Unnamed: 0,continent,location,date,total_cases_per_million,total_deaths_per_million
28,Asia,Afghanistan,23/03/2020,1.004,0.025
29,Asia,Afghanistan,24/03/2020,1.054,0.025
30,Asia,Afghanistan,25/03/2020,1.858,0.025
31,Asia,Afghanistan,26/03/2020,2.008,0.050
32,Asia,Afghanistan,27/03/2020,2.284,0.050
...,...,...,...,...,...
138936,Africa,Zimbabwe,05/12/2021,9213.121,312.082
138937,Africa,Zimbabwe,06/12/2021,9213.121,312.082
138938,Africa,Zimbabwe,07/12/2021,9382.414,312.281
138939,Africa,Zimbabwe,08/12/2021,9980.539,312.745


In [7]:
paises.dtypes

# Aquí vemos que date es objeto

continent                    object
location                     object
date                         object
total_cases_per_million     float64
total_deaths_per_million    float64
dtype: object

In [8]:
from datetime import date, datetime, timedelta as td

In [9]:
# casting de "date" (convertir de objeto a fecha)

paises["date"]  = pd.to_datetime(paises["date"], format="%d/%m/%Y")

In [10]:
paises.dtypes

continent                           object
location                            object
date                        datetime64[ns]
total_cases_per_million            float64
total_deaths_per_million           float64
dtype: object

In [11]:
# Función para obtener sólo el año en otra columna 
get_year = lambda x: x.year

In [12]:
# Mapping para obtener sólo el año en otra columna 
paises["year"] = paises["date"].map(get_year)

In [13]:
# Obtuvimos la columna nueva year

paises.tail()

Unnamed: 0,continent,location,date,total_cases_per_million,total_deaths_per_million,year
138936,Africa,Zimbabwe,2021-12-05,9213.121,312.082,2021
138937,Africa,Zimbabwe,2021-12-06,9213.121,312.082,2021
138938,Africa,Zimbabwe,2021-12-07,9382.414,312.281,2021
138939,Africa,Zimbabwe,2021-12-08,9980.539,312.745,2021
138940,Africa,Zimbabwe,2021-12-09,10324.36,312.944,2021


In [14]:
# Vamos a obtener las medias globales de casos COVID-19 por año
media_casos = paises.groupby(["year"])["total_cases_per_million"].mean()
media_casos

year
2020     4513.792138
2021    35951.466033
Name: total_cases_per_million, dtype: float64

In [15]:
# Conteo de casos COVID-19 por año
media_c = paises.groupby(["year"])["total_cases_per_million"].count()
media_c

year
2020    60567
2021    69967
Name: total_cases_per_million, dtype: int64

In [76]:
# Vamos a obtener las medias globales de muertes por COVID-19 por año
media_muertes = paises.groupby(["year"])["total_deaths_per_million"].mean()
media_muertes

year
2020    126.804042
2021    661.859237
Name: total_deaths_per_million, dtype: float64

In [77]:
mediana_casos = paises.groupby(["year"])["total_cases_per_million"].median()
mediana_casos

year
2020      706.005
2021    17815.200
Name: total_cases_per_million, dtype: float64

In [78]:
mediana_muertes = paises.groupby(["year"])["total_deaths_per_million"].median()
mediana_muertes

year
2020     22.071
2021    294.326
Name: total_deaths_per_million, dtype: float64

In [30]:
from scipy import stats

In [31]:
# Queremos calcular las medias truncadas al 10%
# Necesitamos una función lambda porque tenemos grupos (por año)

truncar_media = lambda x: stats.trim_mean(x, 0.1)

In [32]:
truncada_casos = paises.groupby(["year"])["total_cases_per_million"].agg(truncar_media)
truncada_casos

year
2020     2408.418262
2021    39747.084074
Name: total_cases_per_million, dtype: float64

In [33]:
paises2 = paises[paises["total_deaths_per_million"].notna()]
truncada_muertes = paises2.groupby(["year"])["total_deaths_per_million"].agg(truncar_media)
truncada_muertes

year
2020     64.237573
2021    515.987343
Name: total_deaths_per_million, dtype: float64