In [178]:
#import libraries
import pandas as pd
import numpy as np
import geopandas
import pycountry
from geopy import Nominatim
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


### Data Loading and Preprocessing

In [179]:
# location of the raw vaccination data file
VACCINE_DATA_CSV = 'VaccineData.csv'

# read the raw vaccination records into the working dataframe
df = pd.read_csv(VACCINE_DATA_CSV)

In [180]:
# parse the 'Date' column into datetime values so the later date-based
# steps (idxmax on dates, date ranges, merges on Date) operate on real dates
df = df.assign(Date=pd.to_datetime(df['Date']))

In [181]:
#review dataframe info (column dtypes and non-null counts) as a sanity check on the load and the Date conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48150 entries, 0 to 48149
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Country                   48150 non-null  object        
 1   Date                      48150 non-null  datetime64[ns]
 2   Vaccine_Manufacturer      48150 non-null  object        
 3   Total_Vaccinations        48150 non-null  int64         
 4   Severe Disease Ancestral  48150 non-null  int64         
 5   Infection Ancestral       48150 non-null  int64         
 6   Severe Disease Alpha      48150 non-null  int64         
 7   Infection Alpha           48150 non-null  int64         
 8   Severe Disease Beta       48150 non-null  int64         
 9   Infection Beta            48150 non-null  int64         
 10  Severe Disease Gamma      48150 non-null  int64         
 11  Infection Gamma           48150 non-null  int64         
 12  Severe Disease Del

In [182]:
# 'European Union' rows are not needed for this analysis: look up their row
# labels first, then remove exactly those rows in place
eu_row_labels = df.index[df['Country'] == 'European Union']
df.drop(index=eu_row_labels, inplace=True)

In [183]:
# keep only the alpha, delta, and omicron columns by dropping the ancestral, beta, and gamma ones.
# The columns are dropped by name rather than by position: with positional indices a
# second run of this cell would silently remove whichever columns had shifted into
# slots 4-11 (e.g. the delta/omicron columns that must be kept).
unused_variant_columns = [
    'Severe Disease Ancestral', 'Infection Ancestral',
    'Severe Disease Beta', 'Infection Beta',
    'Severe Disease Gamma', 'Infection Gamma',
]
# errors='ignore' turns a repeat run into a no-op once the columns are already gone
df.drop(columns=unused_variant_columns, inplace=True, errors='ignore')

In [184]:
# count how many distinct vaccine manufacturers appear for each country
manufacturers_by_country = df.groupby('Country')['Vaccine_Manufacturer']
manufacturers_by_country.nunique()

Country
Argentina        6
Austria          6
Belgium          5
Bulgaria         4
Canada           6
Chile            5
Croatia          5
Cyprus           5
Czechia          8
Denmark          4
Ecuador          4
Estonia          5
Finland          5
France           5
Germany          6
Hong Kong        2
Hungary          6
Iceland          4
Ireland          5
Italy            5
Japan            4
Latvia           7
Liechtenstein    4
Lithuania        4
Luxembourg       5
Malta            4
Nepal            5
Netherlands      5
Norway           4
Peru             4
Poland           5
Portugal         8
Romania          4
Slovakia         6
Slovenia         5
South Africa     2
South Korea      6
Spain            4
Sweden           4
Switzerland      4
Ukraine          5
United States    3
Uruguay          3
Name: Vaccine_Manufacturer, dtype: int64

In [185]:
'''
#function for finding country codes
def countrycode(column):
    CODE = []
    for country in column:
        try:
            code=pycountry.countries.get(name=country).alpha_3
            CODE.append(code)
        except:
            CODE.append('None')
    return CODE
'''

"\n#function for finding country codes\ndef countrycode(column):\n    CODE = []\n    for country in column:\n        try:\n            code=pycountry.countries.get(name=country).alpha_3\n            CODE.append(code)\n        except:\n            CODE.append('None')\n    return CODE\n"

In [186]:
'''
#create nominatim object to obtain lat long of country
geolocator = Nominatim(user_agent = 'DSEI270_Proj1')

#function to get lat and long from country name
def latlong(column):
    loclist = []
    for country in column:
        try:
            loc = geolocator.geocode(country)
            loclist.append([country, loc.latitude, loc.longitude])
        except:
            loclist.append(['None','None','None'])
    return pd.DataFrame(loclist, columns=['code','lat','long'])
'''

"\n#create nominatim object to obtain lat long of country\ngeolocator = Nominatim(user_agent = 'DSEI270_Proj1')\n\n#function to get lat and long from country name\ndef latlong(column):\n    loclist = []\n    for country in column:\n        try:\n            loc = geolocator.geocode(country)\n            loclist.append([country, loc.latitude, loc.longitude])\n        except:\n            loclist.append(['None','None','None'])\n    return pd.DataFrame(loclist, columns=['code','lat','long'])\n"

In [187]:
'''
#create code column of 3 letter code for each country; used to merge with geopandas dataset
df['code'] = countrycode(df['Country'])
'''

"\n#create code column of 3 letter code for each country; used to merge with geopandas dataset\ndf['code'] = countrycode(df['Country'])\n"

In [188]:
'''
#import world dataset from geopandas, rename code column, and drop unneeded columns
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.columns = ['pop_est', 'continent', 'name', 'code', 'gdp_md_est', 'geometry']
world = world[['continent','code','geometry']]
'''

"\n#import world dataset from geopandas, rename code column, and drop unneeded columns\nworld = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))\nworld.columns = ['pop_est', 'continent', 'name', 'code', 'gdp_md_est', 'geometry']\nworld = world[['continent','code','geometry']]\n"

In [189]:
'''
#create dataframe of lat and long info for each unique country
latlongdf = latlong(df['code'].unique())
'''

"\n#create dataframe of lat and long info for each unique country\nlatlongdf = latlong(df['code'].unique())\n"

In [190]:
'''
#merge geometry and lat/long dataframes to df
df = pd.merge(df, world, on='code')
df = pd.merge(df, latlongdf, on='code')
'''

"\n#merge geometry and lat/long dataframes to df\ndf = pd.merge(df, world, on='code')\ndf = pd.merge(df, latlongdf, on='code')\n"

In [191]:
'''
#create geopandas dataframe
gdf = geopandas.GeoDataFrame(df, geometry=df['geometry'])
'''

"\n#create geopandas dataframe\ngdf = geopandas.GeoDataFrame(df, geometry=df['geometry'])\n"

In [192]:
# read the efficacy table, then discard the columns at positions 1, 2 and 5-8
# NOTE(review): presumably these are the ancestral/beta/gamma efficacy columns - confirm against the CSV header
df_eff = pd.read_csv('Vaccine_Efficacy.csv')
unused_eff_columns = df_eff.columns[[1, 2, 5, 6, 7, 8]]
df_eff = df_eff.drop(columns=unused_eff_columns)

In [193]:
# attach the efficacy columns to every vaccination row by manufacturer;
# a left join keeps vaccination rows even when no efficacy entry matches
df = df.merge(df_eff, how='left', on='Vaccine_Manufacturer')
df.head()

Unnamed: 0,Country,Date,Vaccine_Manufacturer,Total_Vaccinations,Severe Disease Alpha,Infection Alpha,Severe Disease Delta,Infection Delta,Severe Disease Omicron,Infection Omicron,Eff Severe Disease Alpha,Eff Infection Alpha,Eff Severe Disease delta,Eff Infection Delta,Eff Severe Disease Omicron,Eff Infection Omicron
0,Argentina,2020-12-29,Oxford/AstraZeneca,1,1,1,1,1,1,0,94,63,94,69,71,36
1,Argentina,2020-12-29,Sinopharm/Beijing,1,1,1,1,1,1,0,73,68,71,67,53,35
2,Argentina,2020-12-29,Sputnik V,20488,18849,17620,18234,17415,11268,7376,92,86,89,85,67,44
3,Argentina,2020-12-30,Sputnik V,40590,37343,34907,36125,34502,22325,14612,92,86,89,85,67,44
4,Argentina,2020-12-31,Sputnik V,43396,39924,37321,38622,36887,23868,15623,92,86,89,85,67,44


In [194]:
# for each country/manufacturer pair, find the label of the row with the
# most recent date, then pull those rows out of df
latest_row_labels = df.groupby(['Country', 'Vaccine_Manufacturer'])['Date'].idxmax()
df_latest = df.loc[latest_row_labels]

In [195]:
'''
#added column for total vaccine of all manuf for specific date and country for proportion calculations
total_vacc = df.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})
df = pd.merge(df, total_vacc, how='left', on=['Country','Date'])
'''

"\n#added column for total vaccine of all manuf for specific date and country for proportion calculations\ntotal_vacc = df.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})\ndf = pd.merge(df, total_vacc, how='left', on=['Country','Date'])\n"

In [196]:
'''
#expanded dataframe of all dates and manuf 
mylist = ['Oxford/AstraZeneca','Sinopharm/Beijing','Sputnik V','Pfizer/BioNTech','CanSino','Moderna','Johnson&Johnson','Novavax','Valneva','Medicago','Sinovac','Covaxin']
df_expanded = pd.DataFrame({'Date':pd.date_range(start='12/4/2020', end='10/18/2022')})
df_expanded['Vaccine_Manufacturer'] = [mylist] * len(df_expanded)
df_expanded = df_expanded.explode('Vaccine_Manufacturer')
df_expanded = pd.merge(df_expanded,df,how='left',on=['Date','Vaccine_Manufacturer'])
df_expanded = df_expanded.drop(columns=['geometry'])
'''

"\n#expanded dataframe of all dates and manuf \nmylist = ['Oxford/AstraZeneca','Sinopharm/Beijing','Sputnik V','Pfizer/BioNTech','CanSino','Moderna','Johnson&Johnson','Novavax','Valneva','Medicago','Sinovac','Covaxin']\ndf_expanded = pd.DataFrame({'Date':pd.date_range(start='12/4/2020', end='10/18/2022')})\ndf_expanded['Vaccine_Manufacturer'] = [mylist] * len(df_expanded)\ndf_expanded = df_expanded.explode('Vaccine_Manufacturer')\ndf_expanded = pd.merge(df_expanded,df,how='left',on=['Date','Vaccine_Manufacturer'])\ndf_expanded = df_expanded.drop(columns=['geometry'])\n"

In [197]:
# Fill in the missing days of each country/manufacturer time series
def reallybadcode(test):
    """Expand every country/manufacturer series to one row per calendar day.

    For each country, and each manufacturer present in that country, a daily
    calendar is built from the manufacturer's first date up to the latest
    date seen for the *country* (across all of its manufacturers). The
    manufacturer's rows are left-joined onto that calendar and the gaps are
    forward-filled, so days without a report carry the last reported values.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain 'Country', 'Vaccine_Manufacturer' and a datetime 'Date'
        column; every other column is carried along and forward-filled.

    Returns
    -------
    pandas.DataFrame
        All expanded series stacked in order of first appearance (country,
        then manufacturer within the country). 'Date' is the first column,
        followed by the remaining input columns. Each series keeps its own
        0-based index, so index labels repeat across series. An empty input
        yields an empty frame with the same columns.
    """
    # the left-join below puts 'Date' first and the other columns after it
    column_order = ['Date'] + [col for col in test.columns if col != 'Date']
    if test.empty:
        # pd.concat raises on an empty list, so return an empty frame instead
        return test.reindex(columns=column_order).iloc[0:0]

    filled_series = []
    # sort=False keeps groups in order of first appearance in the input
    for _, country_rows in test.groupby('Country', sort=False):
        # every manufacturer in a country is extended to the country's last date
        last_country_date = country_rows['Date'].max()
        for _, manuf_rows in country_rows.groupby('Vaccine_Manufacturer', sort=False):
            calendar = pd.DataFrame(
                {'Date': pd.date_range(start=manuf_rows['Date'].min(), end=last_country_date)}
            )
            # days with no report come back as NaN from the left join; ffill
            # carries the most recent reported row forward into those days
            filled_series.append(calendar.merge(manuf_rows, how='left', on='Date').ffill())

    if not filled_series:
        # rows existed but none had usable Country/Vaccine_Manufacturer keys
        return test.reindex(columns=column_order).iloc[0:0]
    return pd.concat(filled_series)


In [198]:
# expanded dataframe with forward filled data for all dates in timeseries
df_expanded = reallybadcode(df)

# total vaccinations across all manufacturers for each country and date; used as the
# denominator for the proportion calculations. The column is selected before summing
# so the other columns (including text ones) are not aggregated and then thrown away.
total_vacc = (
    df_expanded
    .groupby(['Country', 'Date'])[['Total_Vaccinations']]
    .sum()
    .rename(columns={'Total_Vaccinations': 'Total'})
)

# merge the per-day total onto df. Any 'Total' left over from an earlier run of this
# cell is dropped first; otherwise a re-run would produce 'Total_x'/'Total_y' columns.
df = pd.merge(
    df.drop(columns='Total', errors='ignore'),
    total_vacc,
    on=['Country', 'Date'],
    how='left',
)

In [199]:
# merge number vaccinated (at least one dose) per 100 people into the dataframe
df_proportions = pd.read_csv('share-people-vaccinated-covid.csv')
df_proportions = df_proportions.drop(columns=['Code', '145610-annotations'])
# the three remaining columns are renamed by position to match df's key names
df_proportions.columns = ['Country', 'Date', 'overall vacc perc']
# Date must be datetime on both sides for the merge keys to match
df_proportions['Date'] = pd.to_datetime(df_proportions['Date'])
# (a no-op self-assignment of 'overall vacc perc' was removed here)
# drop any 'overall vacc perc' left by an earlier run so a re-run does not
# create '_x'/'_y' suffixed duplicates
df = pd.merge(
    df.drop(columns='overall vacc perc', errors='ignore'),
    df_proportions,
    on=['Country', 'Date'],
    how='left',
)

### Percentage of Vaccinations Not Offering Protection and Breakthrough Infection Calculation

In [200]:
# share (in percent) of the country's total vaccinations on that date that came from this manufacturer
df['perc of manuf vacc'] = df['Total_Vaccinations'] / df['Total'] * 100

# variants for which infection-efficacy columns are used below
variants = ['alpha', 'delta', 'omicron']

# manufacturer share weighted by that manufacturer's efficacy against infection
for variant in variants:
    efficacy = df[f'Eff Infection {variant.capitalize()}']
    df[f'perc infection protected {variant}'] = df['perc of manuf vacc'] / 100 * efficacy

# complement of the protected percentage
for variant in variants:
    df[f'perc infection unprotected {variant}'] = 100 - df[f'perc infection protected {variant}']

# breakthrough infection = percent of the population susceptible to breakthrough infection,
# from the unprotected percentage scaled by the overall vaccinated percentage
# NOTE(review): every value here is computed per manufacturer row, not summed over a
# country's manufacturers for a date - confirm that is the intended definition
for variant in variants:
    df[f'breakthrough {variant}'] = (100 - df[f'perc infection protected {variant}']) * df['overall vacc perc'] / 100

In [201]:
# display the dataframe, now including the derived percentage and breakthrough columns
df

Unnamed: 0,Country,Date,Vaccine_Manufacturer,Total_Vaccinations,Severe Disease Alpha,Infection Alpha,Severe Disease Delta,Infection Delta,Severe Disease Omicron,Infection Omicron,...,perc of manuf vacc,perc infection protected alpha,perc infection protected delta,perc infection protected omicron,perc infection unprotected alpha,perc infection unprotected delta,perc infection unprotected omicron,breakthrough alpha,breakthrough delta,breakthrough omicron
0,Argentina,2020-12-29,Oxford/AstraZeneca,1,1,1,1,1,1,0,...,0.004880,0.003075,0.003367,0.001757,99.996925,99.996633,99.998243,0.049998,0.049998,0.049999
1,Argentina,2020-12-29,Sinopharm/Beijing,1,1,1,1,1,1,0,...,0.004880,0.003319,0.003270,0.001708,99.996681,99.996730,99.998292,0.049998,0.049998,0.049999
2,Argentina,2020-12-29,Sputnik V,20488,18849,17620,18234,17415,11268,7376,...,99.990239,85.991606,84.991703,43.995705,14.008394,15.008297,56.004295,0.007004,0.007504,0.028002
3,Argentina,2020-12-30,Sputnik V,40590,37343,34907,36125,34502,22325,14612,...,99.995073,85.995763,84.995812,43.997832,14.004237,15.004188,56.002168,0.012604,0.013504,0.050402
4,Argentina,2020-12-31,Sputnik V,43396,39924,37321,38622,36887,23868,15623,...,99.995391,85.996037,84.996083,43.997972,14.003963,15.003917,56.002028,0.014004,0.015004,0.056002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41545,Uruguay,2022-10-12,Pfizer/BioNTech,2556144,2428337,2198284,2428337,2147161,1840424,1124703,...,43.357941,37.287829,36.420670,19.077494,62.712171,63.579330,80.922506,54.998574,55.759072,70.969038
41546,Uruguay,2022-10-13,Pfizer/BioNTech,2556166,2428358,2198303,2428358,2147179,1840440,1124713,...,43.358152,37.288011,36.420848,19.077587,62.711989,63.579152,80.922413,54.998414,55.758916,70.968956
41547,Uruguay,2022-10-14,Pfizer/BioNTech,2556194,2428384,2198327,2428384,2147203,1840460,1124725,...,43.358421,37.288242,36.421074,19.077705,62.711758,63.578926,80.922295,54.998211,55.758718,70.968852
41548,Uruguay,2022-10-15,Pfizer/BioNTech,2556197,2428387,2198329,2428387,2147205,1840462,1124727,...,43.358450,37.288267,36.421098,19.077718,62.711733,63.578902,80.922282,54.998190,55.758697,70.968841


### Line/Area Graphs for Breakthrough Infection

In [146]:
# Estonia only: total vaccinations over time, one line per manufacturer.
# Rows are ordered by manufacturer and then date so each line is drawn in time order.
estonia_rows = df[df['Country'] == 'Estonia'].sort_values(['Vaccine_Manufacturer', 'Date'])
px.line(estonia_rows, x="Date", y="Total_Vaccinations", color='Vaccine_Manufacturer')

In [147]:
# 'Total' was merged on Country/Date, so it repeats on every manufacturer row of the
# same country and date. Keep one row per country/date so each point is plotted once
# instead of once per manufacturer; row order (and so line order) is left unchanged.
country_totals = df.drop_duplicates(subset=['Country', 'Date'])
px.line(country_totals, x='Date', y='Total', color='Country')