## Imports

In [2]:
import requests
import pandas as pd
import numpy as np
import pyodbc 
import import_ipynb
import functions as f

importing Jupyter notebook from functions.ipynb


## Extract and Transform

In [3]:
# Download the vaccination CSV through the HXL proxy. The query string tags the
# location, iso_code, date, total_vaccinations and daily_vaccinations headers
# with HXL hashtags and points the proxy at OWID's vaccinations.csv on GitHub.
raw_csv_path = './Raw/WHO-COVID19-VACCINES.csv'
source_url = (
    'https://proxy.hxlstandard.org/data.csv'
    '?tagger-match-all=on'
    '&tagger-01-header=location&tagger-01-tag=%23country%2Bname'
    '&tagger-02-header=iso_code&tagger-02-tag=%23country%2Bcode'
    '&tagger-03-header=date&tagger-03-tag=%23date'
    '&tagger-04-header=total_vaccinations&tagger-04-tag=%23total%2Bvaccinations'
    '&tagger-08-header=daily_vaccinations&tagger-08-tag=%23total%2Bvaccinations%2Bdaily'
    '&url=https%3A%2F%2Fraw.githubusercontent.com%2Fowid%2Fcovid-19-data%2Fmaster%2Fpublic%2Fdata%2Fvaccinations%2Fvaccinations.csv'
    '&header-row=1&dest=data_view'
)
f.downloadFile(raw_csv_path, source_url)

In [4]:
# Load the downloaded CSV file into a DataFrame.
raw_csv_file = './Raw/WHO-COVID19-VACCINES.csv'
df = pd.read_csv(raw_csv_file)

In [5]:
# Replace the column labels with the cleaned names returned by the helper.
# NOTE(review): later cells read 'Location', 'IsoCode', 'Date', etc., so the
# helper presumably produces PascalCase names — confirm in functions.ipynb.
cleaned_names = f.fixColumnNames(df)
df.columns = cleaned_names

In [6]:
# Discard the row labelled 0, i.e. the first row read from the CSV.
# NOTE(review): presumably the HXL hashtag row added by the proxy — confirm.
df = df.drop(index=0)

In [7]:
# Convert each column to its target dtype and store the result back in that
# same column. Writing every result to a single 'Data' column would keep only
# the last conversion and leave a stray extra column in the output.
# NOTE(review): assumes f.changeDataType returns the converted Series — confirm.
target_dtypes = {
    'Date': 'datetime64',
    'TotalVaccinations': 'float',
    'DailyVaccinations': 'float',
}
for column, dtype in target_dtypes.items():
    df[column] = f.changeDataType(df, column, dtype)

In [8]:
# Fill missing entries with NaN. The fill value is itself NaN, so those
# entries are still treated as missing afterwards.
df = df.fillna(value=np.nan)

In [9]:
# Keep only the rows whose 'Location' is not 'World'.
# .copy() makes the result an independent DataFrame, so adding a column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['Location'] != 'World'].copy()

In [10]:
# Build the 'Key' column: the ISO code followed by the date rendered as text
# with its hyphens stripped (YYYYMMDD when the date prints as YYYY-MM-DD).
date_digits = df['Date'].astype(str).str.replace('-','')
df['Key'] = df['IsoCode'] + date_digits

In [11]:
# Check that 'Key' holds no repeated values so it can be used as an index.
key_is_unique = df['Key'].is_unique
key_is_unique

True

In [12]:
# Make 'Key' the index; verify_integrity=True has pandas check the new index
# for duplicates.
df = df.set_index('Key', verify_integrity=True)

## Load

In [20]:
# Write the processed frame to Parquet, a format that keeps the column dtypes.
parquet_path = './Processed/WHO-COVID19-VACCINES.parquet'
df.to_parquet(parquet_path)