# Análisis exploratorio de datos

Notebook basado en el original de [Madhan Chandrasekharan](https://www.kaggle.com/gcmadhan/covid-vaccine-eda-visualization), con modificaciones.

## Importación de módulos 

In [1]:
# import numpy as np # linear algebra
import pandas as pd

## Carga de datos 

In [2]:
# Option A (disabled): read the CSV from a local path.
# df = pd.read_csv('data/country_vaccinations.csv')

# Option B (active): read the CSV straight from its raw GitHub URL.
df = pd.read_csv(
    'https://raw.githubusercontent.com/matog/FlacsoBigData/main/data/country_vaccinations.csv'
)

## Exploración 

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/


In [None]:
# Column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21628 entries, 0 to 21627
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   country                              21628 non-null  object        
 1   iso_code                             21628 non-null  object        
 2   date                                 21628 non-null  datetime64[ns]
 3   total_vaccinations                   12275 non-null  float64       
 4   people_vaccinated                    11479 non-null  float64       
 5   people_fully_vaccinated              8967 non-null   float64       
 6   daily_vaccinations_raw               10187 non-null  float64       
 7   daily_vaccinations                   21412 non-null  float64       
 8   total_vaccinations_per_hundred       12275 non-null  float64       
 9   people_vaccinated_per_hundred        11479 non-null  float64       
 10  people_ful

In [None]:
# Spanish display labels for a few columns.
# The rename is deliberately NOT applied in place: later cells (filters,
# group-bys, charts) still look columns up by their original names such as
# 'vaccines', so mutating `df` here would make them fail with KeyError on a
# fresh "Restart & Run All". We only render a relabelled preview instead.
df.rename(
    columns={
        'source_name': 'fuente',
        'source_website': 'fuente - sitio web',
        'vaccines': 'Nombre de vacuna',
    }
).head()

In [None]:
# Look at the first rows of the frame again.
df.head()

In [None]:
# The 'date' field was loaded as dtype object; convert it to a datetime dtype
# so it can be handled as a real date.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Type of the first 'date' value, as a quick check of the conversion.
type(df['date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# Number of missing values in each column.
df.isnull().sum()

country                                    0
iso_code                                   0
date                                       0
total_vaccinations                      9353
people_vaccinated                      10149
people_fully_vaccinated                12661
daily_vaccinations_raw                 11441
daily_vaccinations                       216
total_vaccinations_per_hundred          9353
people_vaccinated_per_hundred          10149
people_fully_vaccinated_per_hundred    12661
daily_vaccinations_per_million           216
vaccines                                   0
source_name                                0
source_website                             0
dtype: int64

In [None]:
# "Forma" del dataframe
# (rows, columns) of the frame.
df.shape

(21628, 15)

In [None]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
# Number of records per country.
df['country'].value_counts()

Norway                             180
Scotland                           172
Canada                             169
China                              168
Russia                             168
                                  ... 
Tuvalu                               1
Bonaire Sint Eustatius and Saba      1
Cook Islands                         1
Turkmenistan                         1
Guinea-Bissau                        1
Name: country, Length: 214, dtype: int64

In [None]:
# Count missing values in 'country' (the saved output of this cell is 0).
df['country'].isna().sum()

0

In [None]:
# Number of records per vaccine combination.
df['vaccines'].value_counts()

Oxford/AstraZeneca                                                                    3774
Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                         3036
Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                                          1687
Oxford/AstraZeneca, Pfizer/BioNTech                                                   1620
Moderna, Pfizer/BioNTech                                                              1276
Pfizer/BioNTech                                                                       1240
Oxford/AstraZeneca, Sinopharm/Beijing                                                 1086
Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac                                          1054
Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik V                      927
Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing                                 416
Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik V                                       404

In [14]:
# Countries whose vaccine mix includes Sputnik V.
# The mask is computed once and reused; regex=False makes it a plain
# substring match.
sputnik_mask = df['vaccines'].str.contains("Sputnik V", regex=False)
sputnik_countries = df.loc[sputnik_mask, 'country']

# A cell only renders its LAST expression, so the list of countries has to be
# printed explicitly or it is silently discarded.
print(sputnik_countries.unique())

# Number of distinct countries that used Sputnik V (rendered as the cell output).
sputnik_countries.nunique()

43

In [8]:
# Break the 'vaccines' field into one row per vaccine.
#
# Step 1: turn the string into a Python list with str.split(',').
# Step 2: explode() the list so each element gets its own row.
# Step 3: strip the names. The field is ', '-separated (e.g.
#         "Moderna, Pfizer/BioNTech"), so after splitting on ',' every name
#         but the first keeps a leading blank; without the strip
#         "Pfizer/BioNTech" and " Pfizer/BioNTech" would be different labs.
df1 = (
    df.assign(lab=df['vaccines'].str.split(','))
      .explode('lab')
      .assign(lab=lambda exploded: exploded['lab'].str.strip())
)
df1[['country', 'lab']]

Unnamed: 0,country,lab
0,Afghanistan,Oxford/AstraZeneca
0,Afghanistan,Pfizer/BioNTech
0,Afghanistan,Sinopharm/Beijing
1,Afghanistan,Oxford/AstraZeneca
1,Afghanistan,Pfizer/BioNTech
...,...,...
21623,Zimbabwe,Sinopharm/Beijing
21624,Zimbabwe,Sinopharm/Beijing
21625,Zimbabwe,Sinopharm/Beijing
21626,Zimbabwe,Sinopharm/Beijing


In [None]:
# Records per vaccine combination, largest group first.
(
    df.groupby('vaccines')['country']
      .count()
      .sort_values(ascending=False)
)

vaccines
Oxford/AstraZeneca                                                                    3774
Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                         3036
Moderna, Oxford/AstraZeneca, Pfizer/BioNTech                                          1687
Oxford/AstraZeneca, Pfizer/BioNTech                                                   1620
Moderna, Pfizer/BioNTech                                                              1276
Pfizer/BioNTech                                                                       1240
Oxford/AstraZeneca, Sinopharm/Beijing                                                 1086
Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac                                          1054
Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik V                      927
Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing                                 416
Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik V                                 

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
count,12275.0,11479.0,8967.0,10187.0,21412.0,12275.0,11479.0,8967.0,21412.0
mean,7123137.0,4018807.0,2193614.0,170746.4,88799.45,20.361753,14.217948,7.84709,3221.901877
std,32251430.0,14919830.0,9299795.0,886019.9,562588.4,27.554987,17.473412,11.907316,8202.956508
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80293.0,66401.0,29461.0,3477.0,821.0,1.86,1.725,0.82,352.75
50%,572662.0,441120.0,235423.0,18236.0,5892.0,8.91,6.87,3.23,1615.0
75%,2643352.0,1825397.0,946562.5,72207.5,31480.0,28.025,20.57,9.6,4485.25
max,661468000.0,167734000.0,135087300.0,22296000.0,19173570.0,230.09,116.0,114.09,1000000.0


In [None]:
# Per (vaccine combination, country): number of records and the mean of
# 'daily_vaccinations'. The output labels contain spaces, so the named
# aggregations are kept in a dict and unpacked into agg().
summary_spec = {
    'Cantidad Registros': pd.NamedAgg(column='iso_code', aggfunc='count'),
    'Vacunados promedio por fecha': pd.NamedAgg(column='daily_vaccinations', aggfunc='mean'),
}
df.groupby(['vaccines', 'country']).agg(**summary_spec).reset_index()

Unnamed: 0,vaccines,country,Cantidad Registros,Vacunados promedio por fecha
0,"Abdala, Soberana02",Cuba,17,63403.31
1,"CanSino, Oxford/AstraZeneca, Pfizer/BioNTech, ...",Mexico,159,186391.42
2,"CanSino, Oxford/AstraZeneca, Sinopharm/Beijing...",Pakistan,115,48023.38
3,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",China,168,3606363.98
4,"Covaxin, Oxford/AstraZeneca",Central African Republic,20,934.00
...,...,...,...,...
209,"Sinopharm/Beijing, Sputnik V",Kyrgyzstan,52,972.98
210,"Sinopharm/Beijing, Sputnik V",Laos,76,10245.92
211,Sputnik V,Algeria,22,3289.05
212,Sputnik V,Guinea,62,4174.90


In [None]:
# Fix the number format: show floats with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

## Análisis gráfico

In [72]:
# Importamos los paquetes para poder graficas
import plotly.express as px
import plotly.graph_objects as go

In [75]:
# Column names available for the charts below.
df.columns

Index(['country', 'iso_code', 'date', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'daily_vaccinations_per_million',
       'vaccines', 'source_name', 'source_website'],
      dtype='object')

###  Gráfico I - Personas con el proceso de vacunación finalizado por país

In [76]:
# Per-country sum of every numeric column.
# numeric_only=True keeps the text columns and the datetime 'date' column out
# of the aggregation; without it recent pandas versions try to sum them too
# and fail on the datetime column.
# NOTE(review): the saved output shows per-hundred sums well above 100
# (e.g. 861.34), which suggests several columns are running totals; summing
# them over dates would overstate the real figures - confirm whether the
# latest/max value per country is what is actually wanted.
data1 = df.groupby('country').sum(numeric_only=True).reset_index()

In [77]:
# First rows of the per-country aggregate.
data1.head()

Unnamed: 0,country,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Afghanistan,3831799.0,3256958.0,574841.0,2859.0,594456.0,9.85,8.36,1.48,15285.0
1,Albania,24787760.0,6990919.0,3550624.0,552252.0,760906.0,861.34,242.91,123.37,264395.0
2,Algeria,75030.0,0.0,0.0,30.0,69070.0,0.17,0.0,0.0,1568.0
3,Andorra,249370.0,156305.0,31652.0,4802.0,31649.0,322.74,202.31,40.98,409618.0
4,Angola,5386560.0,4717488.0,669072.0,0.0,880797.0,16.39,14.35,2.04,26808.0


In [78]:
# Bar chart of people with a completed vaccination process, per country,
# tallest bar first.
# The section heading and the chart title are both about people who FINISHED
# the vaccination process, so the sorted and plotted column must be
# 'people_fully_vaccinated' (the cell used to plot 'people_vaccinated').
fig = px.bar(
    data1.sort_values(by=['people_fully_vaccinated'],
                      ascending=False).reset_index(),
    x='country',
    y='people_fully_vaccinated',
    title='Personas con el proceso de vacunación finalizado por país',
    color='country',
)
fig.update_layout(
    font_family="Roboto",
)
fig.show()

In [79]:
# Same preview of the per-country aggregate, shown again after the chart.
data1.head() 

Unnamed: 0,country,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Afghanistan,3831799.0,3256958.0,574841.0,2859.0,594456.0,9.85,8.36,1.48,15285.0
1,Albania,24787760.0,6990919.0,3550624.0,552252.0,760906.0,861.34,242.91,123.37,264395.0
2,Algeria,75030.0,0.0,0.0,30.0,69070.0,0.17,0.0,0.0,1568.0
3,Andorra,249370.0,156305.0,31652.0,4802.0,31649.0,322.74,202.31,40.98,409618.0
4,Angola,5386560.0,4717488.0,669072.0,0.0,880797.0,16.39,14.35,2.04,26808.0


### Gráfico II - Vacunas por laboratorio

In [80]:
# Sum of every numeric column per (vaccine combination, country).
# numeric_only=True keeps the remaining text columns and the datetime 'date'
# column out of the aggregation; without it recent pandas versions try to sum
# them too and fail on the datetime column.
# NOTE(review): the saved output shows per-hundred sums above 100 (e.g.
# 916.66), which suggests some columns are running totals - confirm that a sum
# over dates is really the intended statistic.
data2 = df.groupby(['vaccines', 'country']).sum(numeric_only=True).reset_index()

In [81]:
# Inspect the per (vaccine combination, country) aggregate.
data2

Unnamed: 0,vaccines,country,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,"Abdala, Soberana02",Cuba,7.972472e+06,7972472.0,0.0,196123.0,1014453.0,70.41,70.41,0.00,89565.0
1,"CanSino, Oxford/AstraZeneca, Pfizer/BioNTech, ...",Mexico,1.181931e+09,842814629.0,386252432.0,25950934.0,29449845.0,916.66,653.69,299.60,228412.0
2,"CanSino, Oxford/AstraZeneca, Sinopharm/Beijing...",Pakistan,4.492630e+07,28806863.0,7066652.0,562254.0,5474665.0,20.33,13.02,3.20,24782.0
3,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",China,2.018469e+10,0.0,0.0,581008000.0,602262785.0,1402.38,0.00,0.00,418426.0
4,"Covaxin, Oxford/AstraZeneca",Central African Republic,1.907500e+04,18358.0,717.0,0.0,17746.0,0.39,0.38,0.01,3667.0
...,...,...,...,...,...,...,...,...,...,...,...
209,"Sinopharm/Beijing, Sputnik V",Kyrgyzstan,1.286790e+05,67710.0,6868.0,205.0,49622.0,1.97,1.03,0.10,7608.0
210,"Sinopharm/Beijing, Sputnik V",Laos,9.145011e+06,7457983.0,1687028.0,135127.0,768444.0,125.69,102.49,23.19,105621.0
211,Sputnik V,Algeria,7.503000e+04,0.0,0.0,30.0,69070.0,0.17,0.00,0.00,1568.0
212,Sputnik V,Guinea,5.131054e+06,4275131.0,1505375.0,102088.0,246319.0,39.08,32.55,11.46,18756.0


In [82]:
# Bar chart of 'people_fully_vaccinated_per_hundred' per vaccine combination,
# sorted from highest to lowest.
# px.bar already returns a complete Figure, so the empty go.Figure() that was
# created first and immediately overwritten has been removed.
fig = px.bar(
    data2.sort_values(by=['people_fully_vaccinated_per_hundred'],
                      ascending=False).reset_index(),
    x='vaccines',
    y='people_fully_vaccinated_per_hundred',
    title='Vacunas por laboratorio',
    color='vaccines',
    # hover_name='country'
)
fig.update_layout(
    font_family="Roboto",
)
fig.show()

### Gráfico III - Evolución de las personas vacunadas por país 
Vamos a tomar los 5 países con más vacunas entregadas y analizar su evolución temporal.

In [109]:
# Five countries with the largest summed 'total_vaccinations'.
# NOTE(review): the saved output lists both "United Kingdom" and "England";
# presumably one contains the other - confirm whether both should be kept.
totals_by_country = df.groupby('country')['total_vaccinations'].sum()
df_aux = (
    totals_by_country
    .sort_values(ascending=False)
    .head()
    .reset_index()
)
df_aux

Unnamed: 0,country,total_vaccinations
0,China,20184690000.0
1,United States,19368350000.0
2,India,9902768000.0
3,United Kingdom,4406375000.0
4,England,3719467000.0


In [110]:
# Keep only the rows that belong to the countries selected in df_aux.
paises = df_aux['country'].unique()
is_selected_country = df['country'].isin(paises)
df_graph = df[is_selected_country]

In [111]:
# Inspect the subset restricted to the selected countries.
df_graph

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
3805,China,CHN,2020-12-15,1500000.0,,,,,0.10,,,,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",National Health Commission,http://www.nhc.gov.cn/xcs/yqjzqk/202106/6c7027...
3806,China,CHN,2020-12-16,,,,,187500.0,,,,130.0,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",National Health Commission,http://www.nhc.gov.cn/xcs/yqjzqk/202106/6c7027...
3807,China,CHN,2020-12-17,,,,,187500.0,,,,130.0,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",National Health Commission,http://www.nhc.gov.cn/xcs/yqjzqk/202106/6c7027...
3808,China,CHN,2020-12-18,,,,,187500.0,,,,130.0,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",National Health Commission,http://www.nhc.gov.cn/xcs/yqjzqk/202106/6c7027...
3809,China,CHN,2020-12-19,,,,,187500.0,,,,130.0,"CanSino, Sinopharm/Beijing, Sinopharm/Wuhan, S...",National Health Commission,http://www.nhc.gov.cn/xcs/yqjzqk/202106/6c7027...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20918,United States,USA,2021-05-26,289212304.0,165074907.0,131850089.0,1423432.0,1703162.0,86.48,49.36,39.42,5093.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
20919,United States,USA,2021-05-27,290724607.0,165718717.0,132769894.0,1512303.0,1618194.0,86.93,49.55,39.70,4838.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
20920,United States,USA,2021-05-28,292099778.0,166388129.0,133532544.0,1375171.0,1500632.0,87.34,49.75,39.93,4487.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...
20921,United States,USA,2021-05-29,293705050.0,167157043.0,134418748.0,1605272.0,1394832.0,87.82,49.98,40.19,4171.0,"Johnson&Johnson, Moderna, Pfizer/BioNTech",Centers for Disease Control and Prevention,https://covid.cdc.gov/covid-data-tracker/#vacc...


In [112]:
# Line chart of 'people_vaccinated' over time, one trace per selected country.
line_options = dict(
    x='date',
    y='people_vaccinated',
    color='country',
    title='Personas vacunadas por país',
)
fig = px.line(df_graph, **line_options)
fig.update_layout(font_family="Roboto")
fig.show()

In [113]:
# Line chart of 'people_fully_vaccinated' over time, one trace per country.
# This section analyses the five countries selected in df_aux, so the chart is
# drawn from df_graph (that subset) rather than from the full frame, which
# would draw one line for every country in the dataset. The x axis is given
# as a column name, like the other charts, instead of an external Series.
fig = px.line(
    df_graph,
    x='date',
    y='people_fully_vaccinated',
    color='country',
    title='Personas con dosis completas por país',
)
fig.update_layout(
    font_family="Roboto",
)
fig.show()

In [None]:
# Quick reminder of the columns before building the next aggregate.
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...",World Health Organization,https://covid19.who.int/


In [None]:
# Sum the three headline counters per (country, iso_code, vaccines).
# Keys come back as regular columns because of as_index=False.
counter_columns = [
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
]
data = (
    df.groupby(['country', 'iso_code', 'vaccines'], as_index=False)[counter_columns]
      .sum()
)

In [None]:
# Inspect the per (country, iso_code, vaccines) aggregate.
data

### Gráfico IV - Total de dosis y personas vacunadas, por país.

In [None]:
# Scatter plot: 'total_vaccinations' (x) against 'people_vaccinated' (y),
# coloured by country, marker size given by 'people_fully_vaccinated'.
scatter_options = {
    'x': 'total_vaccinations',
    'y': 'people_vaccinated',
    'color': 'country',
    'size': 'people_fully_vaccinated',
    'title': 'Total de dosis y personas vacunadas, por país',
}
fig = px.scatter(data, **scatter_options)
fig.update_layout(font_family="Roboto")
fig.show()

### Gráfico V - Personas vacunadas por país

In [None]:
# World map: one bubble per row of `data`, located by 'iso_code', sized by
# 'people_vaccinated' and coloured by vaccine combination.
fig = px.scatter_geo(
    data_frame=data,
    locations='iso_code',
    color='vaccines',
    size='people_vaccinated',
    title='Personas vacunadas por país',
)
fig.update_layout(font_family="Roboto")
fig.show()

### Gráfico VI - Personas con el proceso de vacunación completo por país, por vacuna

In [None]:
# World map: one bubble per row of `data`, located by 'iso_code', sized by
# 'people_fully_vaccinated' and coloured by vaccine combination.
geo_options = dict(
    locations='iso_code',
    size='people_fully_vaccinated',
    color='vaccines',
    title='Personas con el proceso de vacunación completo por país, por vacuna',
)
fig = px.scatter_geo(data, **geo_options)
fig.update_layout(font_family="Roboto")
fig.show()

NameError: name 'data' is not defined

## Módulos de EDA

###  Pandas Profiling

In [114]:
# Importamos el módulo
from pandas_profiling import ProfileReport

In [117]:
# Build the profiling report for the whole frame.
# NOTE(review): the saved output of this cell is "TypeError: ignored";
# presumably a version incompatibility of pandas_profiling - confirm and fix
# before relying on `profile` in the next cell.
profile = ProfileReport(df, title="COVID Vaccines")

TypeError: ignored

In [None]:
# Show the report inside the notebook as widgets.
profile.to_widgets()

### Sweetviz 

In [None]:
# Importamos el módulo
!pip install sweetviz
import sweetviz as sv

In [None]:
# Build the Sweetviz report for the whole frame.
my_report = sv.analyze(df)

In [None]:
# The report can be exported to HTML or previewed inside the notebook:

# my_report.show_html()                # Export to HTML
my_report.show_notebook()              # Preview inside the notebook

# Exportación 

In [None]:
%%javascript
// Asks the kernel to run `nb_name = "<notebook file name>"`, so the notebook's
// own name becomes available to later cells as the Python variable nb_name.
// NOTE(review): relies on the `IPython.notebook` JavaScript global; presumably
// only the classic Notebook front end provides it - confirm for JupyterLab/Colab.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
# Export this notebook to HTML with nbconvert; $nb_name is the Python variable
# set by the javascript cell above.
!jupyter nbconvert --TagRemovePreprocessor.enabled=True --to html $nb_name 

In [None]:
# Export to HTML again, this time passing --no-input and asking the
# TagRemovePreprocessor to drop the cells tagged "no".
# NOTE(review): presumably this writes the same output file as the previous
# export cell and therefore replaces it - confirm.
!jupyter nbconvert --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags='{"no"}' --no-input --to html $nb_name 