# 2021 COVID-19 Vaccination Progress Analysis



In [16]:
import io

import pandas as pd
import plotly.express as px
from google.colab import files

# Ask for the CSV through Colab's upload widget; only the first uploaded file is used.
uploaded = files.upload()
filename = list(uploaded)[0]

# `date` is parsed as datetime but intentionally left as an ordinary column
# (no index_col), and a string copy of it is appended for Tableau.
df = pd.read_csv(
    io.BytesIO(uploaded[filename]),
    parse_dates=['date'],
).assign(DateStr=lambda frame: frame['date'].astype(str))

print(df.head())



Saving country_vaccinations.csv to country_vaccinations (4).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

       country iso_code       date  total_vaccinations  people_vaccinated  \
0  Afghanistan      AFG 2021-02-22                 0.0                0.0   
1  Afghanistan      AFG 2021-02-23                 NaN                NaN   
2  Afghanistan      AFG 2021-02-24                 NaN                NaN   
3  Afghanistan      AFG 2021-02-25                 NaN                NaN   
4  Afghanistan      AFG 2021-02-26                 NaN                NaN   

   people_fully_vaccinated  daily_vaccinations_raw  daily_vaccinations  \
0                      NaN                     NaN                 NaN   
1                      NaN                     NaN              1367.0   
2                      NaN                     NaN              1367.0   
3                      NaN                     NaN              1367.0   
4                      NaN                     NaN              1367.0   

   total_vaccinations_per_hundred  people_vaccinated_per_hundred  \
0                       

In [18]:
# RENAME COLUMNS
# Switch the plotted/exported columns from snake_case to CamelCase labels.
# Columns not listed here (e.g. iso_code, date) keep their original names.

df.rename(
    columns=dict([
        ('country', 'Country'),
        ('total_vaccinations', 'TotalVaccinations'),
        ('people_vaccinated', 'PeopleVaccinated'),
        ('people_fully_vaccinated', 'PeopleFullyVaccinated'),
        ('daily_vaccinations', 'DailyVaccinations'),
        ('total_vaccinations_per_hundred', 'TotalPerHundred'),
        ('people_vaccinated_per_hundred', 'PeoplePerHundred'),
        ('people_fully_vaccinated_per_hundred', 'FullyPerHundred'),
        ('daily_vaccinations_per_million', 'DailyPerMillion'),
        ('vaccines', 'Vaccines'),
    ]),
    inplace=True,
)

In [19]:
# SUMMARY STATISTICS
# Print describe() for the frame: count, mean, min, quartiles, max and std
# of the numeric columns and the parsed `date` column.

print(df.describe())

                                date  TotalVaccinations  PeopleVaccinated  \
count                          25862       1.452200e+04      1.376200e+04   
mean   2021-04-09 20:59:32.438326784       9.139252e+06      4.737802e+06   
min              2020-12-02 00:00:00       0.000000e+00      0.000000e+00   
25%              2021-03-08 00:00:00       1.005620e+05      7.876700e+04   
50%              2021-04-15 00:00:00       7.007560e+05      5.149065e+05   
75%              2021-05-17 00:00:00       3.241681e+06      2.139073e+06   
max              2021-06-20 00:00:00       1.029223e+09      6.220000e+08   
std                              NaN       4.592776e+07      1.797156e+07   

       PeopleFullyVaccinated  daily_vaccinations_raw  DailyVaccinations  \
count           1.104100e+04            1.200300e+04       2.559500e+04   
mean            2.681353e+06            2.012910e+05       1.020430e+05   
min             1.000000e+00            0.000000e+00       0.000000e+00   
25%   

In [20]:
# BAR PLOT OF TOTAL VACCINATIONS PER VACCINE SCHEME
#
# TotalVaccinations is a cumulative counter reported per country, so a scheme's
# total is the sum of each country's peak value. A plain max() over the scheme
# would only show the single largest country using it, which contradicts the
# chart title and the per-country aggregation used by the treemap.
# `px` (plotly.express) comes from the notebook's import cell.

summary_columns = ['TotalVaccinations', 'TotalPerHundred', 'DailyVaccinations', 'DailyPerMillion']

# For the per-hundred / daily columns keep the highest value seen within the scheme.
vaccine_summary = df.groupby('Vaccines')[summary_columns].max()

# Peak per (scheme, country), then summed per scheme. min_count=1 keeps NaN
# (instead of 0) for a scheme whose countries never reported a total.
vaccine_summary['TotalVaccinations'] = (
    df.groupby(['Vaccines', 'Country'])['TotalVaccinations']
    .max()
    .groupby(level='Vaccines')
    .sum(min_count=1)
)
vaccine_summary = vaccine_summary.reset_index()

fig = px.bar(
    vaccine_summary,
    x='Vaccines',
    y='TotalVaccinations',
    title='Total Vaccinations per Vaccine Scheme',
    text='TotalVaccinations',
    color='Vaccines'
)

# Adjust layout to make it readable
fig.update_layout(
    height=600,                   # taller figure
    xaxis_tickangle=-45,          # rotate x-axis labels
    xaxis_tickfont=dict(size=10), # smaller font for labels
    showlegend=False              # legend would only repeat the x-axis categories
)
fig.show()


In [21]:
# TREEMAP PER COUNTRY AND VACCINE SCHEME
# One row per (vaccine scheme, country) holding the maximum of each selected
# column; the treemap nests countries inside their scheme.

country_vaccine = (
    df.groupby(['Vaccines', 'Country'])[
        ['TotalVaccinations', 'DailyVaccinations', 'PeopleVaccinated']
    ]
    .max()
    .reset_index()
)

fig = px.treemap(
    data_frame=country_vaccine,
    path=['Vaccines', 'Country'],
    values='TotalVaccinations',
    title="Total vaccinations per country grouped by vaccine scheme",
)
fig.show()



In [22]:
# MAP OF TOTAL VACCINATIONS PER COUNTRY
# Colours each country by its TotalVaccinations value from `country_vaccine`;
# countries are matched to map regions by name.

fig = px.choropleth(
    data_frame=country_vaccine,
    locations='Country',
    locationmode='country names',
    color='TotalVaccinations',
    hover_name='Country',
    color_continuous_scale='Viridis',
    title='Total Vaccinations per Country',
)
fig.show()

In [23]:
# TIME-SERIES PLOT FOR SELECTED COUNTRIES
# Draws one line chart per country of TotalVaccinations over calendar time.

countries = ['United States', 'China', 'United Kingdom', 'Germany', 'France']
for country in countries:
    df_country = df[df['Country'] == country]
    # `date` was kept as a regular column when the CSV was loaded, so the frame
    # still has its default positional index. Plotting against the index would
    # put row numbers on the x-axis; use the date column instead.
    fig = px.line(df_country, x='date', y='TotalVaccinations', title=f'Total Vaccinations in {country}')
    fig.show()


In [24]:
# Print the column index to confirm the renamed labels and the added DateStr column.
print(df.columns)


Index(['Country', 'iso_code', 'date', 'TotalVaccinations', 'PeopleVaccinated',
       'PeopleFullyVaccinated', 'daily_vaccinations_raw', 'DailyVaccinations',
       'TotalPerHundred', 'PeoplePerHundred', 'FullyPerHundred',
       'DailyPerMillion', 'Vaccines', 'source_name', 'source_website',
       'DateStr'],
      dtype='object')


In [25]:
# Save CSV for Tableau
# Write the frame without its positional index, then hand the file to
# Colab's download helper.
tableau_csv = 'country_vaccinations_for_tableau.csv'
df.to_csv(tableau_csv, index=False)
files.download(tableau_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>