In [1]:
#import libraries
import pandas as pd
import numpy as np
import geopandas
import pycountry
from geopy import Nominatim
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


ModuleNotFoundError: No module named 'geopandas'

### Data Loading and Preprocessing

In [None]:
#read the raw vaccination records from disk into the working dataframe
df = pd.read_csv(filepath_or_buffer='VaccineData.csv')

In [None]:
#parse the Date column so it holds datetimes instead of the raw csv values
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
#print a summary of the dataframe (column data types, non-null counts, etc)
#to sanity check the load and the Date conversion done in the cells above
df.info()

In [None]:
#the 'European Union' aggregate rows are not needed for this analysis, so remove them
df = df.drop(index=df.loc[df['Country'].eq('European Union')].index)

In [None]:
#discard the columns at positions 4, 5, 8, 9, 10 and 11 of the vaccine data
#NOTE(review): the previous comment here mentioned alpha/delta/omicron, which reads like the
#efficacy file rather than this frame - confirm which columns these positions refer to
df = df.drop(columns=df.columns[[4,5,8,9,10,11]])

In [None]:
#count how many distinct vaccine manufacturers appear for each country
df['Vaccine_Manufacturer'].groupby(df['Country']).nunique()

In [None]:
#disabled cell: the helper is wrapped in a string literal, so countrycode is never defined.
#As written it would look up each name with pycountry and return the alpha_3 codes as a list,
#storing the string 'None' whenever the lookup fails for any reason (bare except).
'''
#function for finding country codes
def countrycode(column):
    CODE = []
    for country in column:
        try:
            code=pycountry.countries.get(name=country).alpha_3
            CODE.append(code)
        except:
            CODE.append('None')
    return CODE
'''

In [None]:
#disabled cell: wrapped in a string literal, so neither geolocator nor latlong is defined.
#As written latlong would geocode every value with Nominatim and return a dataframe with the
#columns code/lat/long, where 'code' holds the value that was looked up and failed lookups
#become a row of 'None' strings (bare except).
'''
#create nominatim object to obtain lat long of country
geolocator = Nominatim(user_agent = 'DSEI270_Proj1')

#function to get lat and long from country name
def latlong(column):
    loclist = []
    for country in column:
        try:
            loc = geolocator.geocode(country)
            loclist.append([country, loc.latitude, loc.longitude])
        except:
            loclist.append(['None','None','None'])
    return pd.DataFrame(loclist, columns=['code','lat','long'])
'''

In [None]:
#disabled cell: wrapped in a string literal, so df never gets a 'code' column.
#It relies on countrycode, which is itself only present as disabled code in this notebook.
'''
#create code column of 3 letter code for each country; used to merge with geopandas dataset
df['code'] = countrycode(df['Country'])
'''

In [None]:
#disabled cell: wrapped in a string literal, so the world dataframe is never created.
#The import cell of this notebook fails with "No module named 'geopandas'", so this cannot run as-is.
#NOTE(review): geopandas.datasets was removed in geopandas 1.0 - confirm the installed version
#(or load the Natural Earth file directly) before re-enabling.
'''
#import world dataset from geopandas, rename code column, and drop unneeded columns
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.columns = ['pop_est', 'continent', 'name', 'code', 'gdp_md_est', 'geometry']
world = world[['continent','code','geometry']]
'''

In [None]:
#disabled cell: wrapped in a string literal, so latlongdf is never created.
#As written it passes the unique values of df['code'] (not the country names) to latlong.
'''
#create dataframe of lat and long info for each unique country
latlongdf = latlong(df['code'].unique())
'''

In [None]:
#disabled cell: wrapped in a string literal, so df is not merged with world or latlongdf.
#Both merges are on 'code' with the default join type.
'''
#merge geometry and lat/long dataframes to df
df = pd.merge(df, world, on='code')
df = pd.merge(df, latlongdf, on='code')
'''

In [None]:
#disabled cell: wrapped in a string literal, so gdf is never created.
#It expects df to already carry the 'geometry' column added by the (also disabled) world merge.
'''
#create geopandas dataframe
gdf = geopandas.GeoDataFrame(df, geometry=df['geometry'])
'''

In [None]:
#load the efficacy table and discard its columns at positions 1, 2, 5, 6, 7 and 8
#NOTE(review): presumably this keeps Vaccine_Manufacturer plus the 'Eff Infection ...' columns
#that later cells use - confirm against the csv header
df_eff = pd.read_csv(filepath_or_buffer='Vaccine_Efficacy.csv')
df_eff = df_eff.drop(columns=df_eff.columns[[1,2,5,6,7,8]])

In [None]:
#attach the efficacy columns to every vaccine row of the matching manufacturer
#(left join: vaccine rows without an efficacy entry are kept)
df = df.merge(df_eff, how='left', on='Vaccine_Manufacturer')
df.head()

In [None]:
#keep, for every country/manufacturer pair, the row holding its most recent Date
df_latest = df.loc[
    df.groupby(by=['Country','Vaccine_Manufacturer'])['Date'].idxmax()
]

In [None]:
#disabled cell: wrapped in a string literal, so nothing here runs.
#It computed the per country/date total from the raw rows of df; an active cell further down
#computes total_vacc from the forward filled frame instead.
'''
#added column for total vaccine of all manuf for specific date and country for proportion calculations
total_vacc = df.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})
df = pd.merge(df, total_vacc, how='left', on=['Country','Date'])
'''

In [None]:
#disabled cell: wrapped in a string literal, so nothing here runs.
#It built one row per day and manufacturer from a hard-coded manufacturer list and date range,
#and dropped a 'geometry' column that only the disabled geopandas merge would have added.
'''
#expanded dataframe of all dates and manuf 
mylist = ['Oxford/AstraZeneca','Sinopharm/Beijing','Sputnik V','Pfizer/BioNTech','CanSino','Moderna','Johnson&Johnson','Novavax','Valneva','Medicago','Sinovac','Covaxin']
df_expanded = pd.DataFrame({'Date':pd.date_range(start='12/4/2020', end='10/18/2022')})
df_expanded['Vaccine_Manufacturer'] = [mylist] * len(df_expanded)
df_expanded = df_expanded.explode('Vaccine_Manufacturer')
df_expanded = pd.merge(df_expanded,df,how='left',on=['Date','Vaccine_Manufacturer'])
df_expanded = df_expanded.drop(columns=['geometry'])
'''

In [None]:
#fill in the missing days of every country/manufacturer time series by forward filling
def reallybadcode(test):
    """Expand every country/manufacturer series to one row per calendar day.

    For each country in ``test`` and each manufacturer reported for that
    country, a daily date range is built from the manufacturer's first
    reported date to the country's last reported date (taken over all of its
    manufacturers). The manufacturer's rows are left-joined onto that range
    and the gaps are forward filled, so every day carries the most recently
    reported values.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain 'Country', 'Vaccine_Manufacturer' and a datetime 'Date'
        column. All other columns are carried along and forward filled.
        Rows with a missing country or manufacturer are skipped.

    Returns
    -------
    pandas.DataFrame
        The daily frames of all series concatenated in order of first
        appearance. 'Date' is the first column and the index restarts at 0
        for every series. When there is nothing to expand, an empty frame
        with the same columns is returned.
    """
    frames = []
    for _, country_rows in test.groupby('Country', sort=False):
        #every manufacturer series is extended up to the country's last report
        last_date = country_rows['Date'].max()
        for _, manuf_rows in country_rows.groupby('Vaccine_Manufacturer', sort=False):
            #min()/max() on the column always give a scalar; looking the row up through
            #idxmin()/idxmax() yields several rows when index labels repeat
            daily = pd.DataFrame({'Date': pd.date_range(start=manuf_rows['Date'].min(), end=last_date)})
            daily = daily.merge(manuf_rows, how='left', on='Date').ffill()
            frames.append(daily)

    if not frames:
        #pd.concat raises on an empty list; mirror the normal column layout instead
        columns = ['Date'] + [col for col in test.columns if col != 'Date']
        return test.loc[:, columns].iloc[0:0]
    return pd.concat(frames)


In [None]:
#expanded dataframe with forward filled data for all dates in timeseries
df_expanded = reallybadcode(df)

#total vaccinations of all manuf for each country and date, used as the denominator of the
#proportion calculations. The column is selected before summing so that only
#Total_Vaccinations is aggregated; summing the whole frame first also tries to add up the
#text columns, which is slow and can fail
total_vacc = (
    df_expanded
    .groupby(['Country','Date'])[['Total_Vaccinations']]
    .sum()
    .rename(columns={'Total_Vaccinations':'Total'})
)

#merge total vaccination per day calculation to df
df = pd.merge(df,total_vacc,on=['Country','Date'],how='left')

#df_expanded = pd.merge(df_expanded,total_vacc,how='left',on=['Country','Date']).sort_values(by=['Date'])

In [None]:
#load the number of people vaccinated (at least one dose) per 100 people for each country and date
df_proportions = pd.read_csv('share-people-vaccinated-covid.csv')
df_proportions = df_proportions.drop(columns=['Code','145610-annotations'])

#the three remaining columns are renamed by position
df_proportions.columns = ['Country','Date','overall vacc perc']
df_proportions['Date'] = pd.to_datetime(df_proportions['Date'])

#the values are used as-is (no rescaling) and attached to the vaccine rows of the same country and date
df = pd.merge(df,df_proportions,on=['Country','Date'],how='left')

### Percentage of Vaccinations Not Offering Protection and Breakthrough Infection Calculation

In [None]:
# share (in percent) of each manufacturer's doses among all doses administered for the country and date
df['perc of manuf vacc'] = df['Total_Vaccinations'].div(df['Total']).mul(100)

# manufacturer share weighted by that vaccine's efficacy against infection;
# the efficacy columns are capitalised per variant, the derived columns use the lower-case name
for variant in ('Alpha', 'Delta', 'Omicron'):
    df[f'perc infection protected {variant.lower()}'] = df['perc of manuf vacc'] / 100 * df[f'Eff Infection {variant}']

# complement of the protected percentage for each variant
for variant in ('alpha', 'delta', 'omicron'):
    df[f'perc infection unprotected {variant}'] = 100 - df[f'perc infection protected {variant}']

# breakthrough infection = percent of the population that is susceptible to a breakthrough infection,
# i.e. the unprotected percentage scaled by the overall vaccinated percentage
for variant in ('alpha', 'delta', 'omicron'):
    df[f'breakthrough {variant}'] = df[f'perc infection unprotected {variant}'] * df['overall vacc perc'] / 100

In [None]:
#inspect the dataframe after the derived percentage columns have been added
df

### Line Area Graph for Breakthrough Infection

In [None]:
#vaccinations over time in Estonia, one line per manufacturer (rows ordered by manufacturer, then date)
px.line(
    df.loc[df['Country'].eq('Estonia')].sort_values(by=['Vaccine_Manufacturer','Date']),
    x="Date",
    y="Total_Vaccinations",
    color='Vaccine_Manufacturer',
)

In [None]:
#total vaccinations over time, one line per country.
#Total is repeated on every manufacturer row of a country/date, so keep a single row per pair,
#and order the rows by date because the line connects the points in the order they appear
px.line(
    df.drop_duplicates(subset=['Country','Date']).sort_values(['Country','Date']),
    x='Date',
    y='Total',
    color='Country',
)