## Filtering and Merging 2021 VAERS Datasets & Filtering Global Vaccination Dataset

In [1]:
# Import dependencies
import pandas as pd

from sqlalchemy import create_engine

### 2021 VAERS Datasets-Data Filtering and Cleaning

#### 2021VAERSVAX.csv: Filter the data by vaccination type = COVID19 and remove unneeded columns

In [2]:
# Path to the 2021 VAERS vaccine file
VAERSvax = "dataUsed/2021VAERSVAX.csv"

# Load the file; the ISO-8859-1 encoding is needed for it to read properly
VAERSvax_raw = pd.read_csv(VAERSvax, encoding="ISO-8859-1")

# Preview the first and last five rows
display(VAERSvax_raw.head(), VAERSvax_raw.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916710,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916741,COVID19,PFIZER\BIONTECH,EH9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
2,916742,COVID19,PFIZER\BIONTECH,,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916746,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
4,916772,COVID19,PFIZER\BIONTECH,EJ1685,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
3009,983720,COVID19,MODERNA,039K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3010,983721,COVID19,MODERNA,039K20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3011,983766,COVID19,MODERNA,013L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3012,983919,COVID19,MODERNA,030L20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3013,985205,COVID19,MODERNA,029L20A,1,IM,UN,COVID19 (COVID19 (MODERNA))


In [3]:
# Collect the column labels of the raw vaccine frame into a plain list
columns = VAERSvax_raw.columns.tolist()
columns

['VAERS_ID',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [4]:
# Keep only the vaccine columns that are needed downstream
wanted_columns = VAERSvax_raw.loc[
    :, ['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_DOSE_SERIES', 'VAX_SITE']
]

# Preview the first and last five rows
display(wanted_columns.head(), wanted_columns.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
3009,983720,COVID19,MODERNA,1,LA
3010,983721,COVID19,MODERNA,1,RA
3011,983766,COVID19,MODERNA,1,RA
3012,983919,COVID19,MODERNA,1,LA
3013,985205,COVID19,MODERNA,1,UN


In [5]:
# Keep only the rows whose VAX_TYPE is COVID19 and renumber them from 0.
# reset_index is chained instead of called with inplace=True, so the filtered
# slice is never mutated in place and re-running the cell always rebuilds
# filtered_data from wanted_columns.
filtered_data = (
    wanted_columns
    .loc[wanted_columns["VAX_TYPE"] == "COVID19"]
    .reset_index(drop=True)
)
display(filtered_data.head(5))
display(filtered_data.tail(5))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
2839,983720,COVID19,MODERNA,1,LA
2840,983721,COVID19,MODERNA,1,RA
2841,983766,COVID19,MODERNA,1,RA
2842,983919,COVID19,MODERNA,1,LA
2843,985205,COVID19,MODERNA,1,UN


#### 2021VAERSData.csv

In [6]:
# Path to the 2021 VAERS report data file
VAERSData = "dataUsed/2021VAERSData.csv"

# Load the file; the ISO-8859-1 encoding is needed for it to read properly
vaer_data_raw = pd.read_csv(VAERSData, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_data_raw.head(2), vaer_data_raw.tail(2))

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,916710,01/01/2021,MO,23.0,23.0,,F,,"Acute appendicitis, onset morning of 1/1/2021 ...",,...,,Hypothyroidism,,,2,01/01/2021,,,Y,NKDA
1,916741,01/01/2021,AR,68.0,68.0,,F,,"on dec 22 I felt some myalgias, chills, fatigu...",,...,had surgery R hand for advanced arthritis 11/1...,Rheumatoid arthritis - mostly affecting R wris...,,,2,01/01/2021,,,,bee stings


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
2944,983919,01/28/2021,FL,69.0,69.0,,M,,death,Y,...,,,,,2,01/28/2021,,,,
2945,985205,01/29/2021,OH,75.0,75.0,,M,,Patient was feeling dizzy and under the weathe...,Y,...,No,,,,2,01/29/2021,,,,No


In [7]:
# Collect the column labels of the raw report frame into a plain list
columns2 = vaer_data_raw.columns.tolist()
columns2

['VAERS_ID',
 'RECVDATE',
 'STATE',
 'AGE_YRS',
 'CAGE_YR',
 'CAGE_MO',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'HOSPDAYS',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'NUMDAYS',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'FORM_VERS',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES']

In [8]:
# Keep only the report columns that are needed downstream
wanted_columns2 = vaer_data_raw.loc[
    :,
    [
        'VAERS_ID', 'STATE', 'AGE_YRS', 'SEX',
        'DIED', 'DATEDIED', 'L_THREAT', 'ER_VISIT',
        'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE',
        'RECOVD', 'VAX_DATE', 'ONSET_DATE', 'NUMDAYS',
    ],
]
wanted_columns2.head()

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0
2,916742,MN,29.0,F,,,Y,,Y,4.0,,,U,12/29/2020,12/29/2020,0.0
3,916746,TX,49.0,F,,,Y,,,,,,Y,12/28/2020,12/28/2020,0.0
4,916772,GA,55.0,M,,,,,,,,,U,12/22/2020,12/26/2020,4.0


#### 2021VAERSSYMPTOMS.csv

In [9]:
# Path to the 2021 VAERS symptoms file
VAERSsymptoms = "dataUsed/2021VAERSSYMPTOMS.csv"

# Load the file; the ISO-8859-1 encoding is needed for it to read properly
vaer_symptoms_raw = pd.read_csv(VAERSsymptoms, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_symptoms_raw.head(2), vaer_symptoms_raw.tail(2))

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
0,916710,Appendicitis,23.1,Band neutrophil percentage increased,23.1,Surgery,23.1,White blood cell count increased,23.1,,
1,916741,Chills,23.1,Complex regional pain syndrome,23.1,Fatigue,23.1,Headache,23.1,Joint range of motion decreased,23.1


Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
4710,983919,Death,23.1,,,,,,,,
4711,985205,Death,23.1,Dizziness,23.1,Malaise,23.1,,,,


In [10]:
# Collect the column labels of the raw symptoms frame into a plain list
columns3 = vaer_symptoms_raw.columns.tolist()
columns3

['VAERS_ID',
 'SYMPTOM1',
 'SYMPTOMVERSION1',
 'SYMPTOM2',
 'SYMPTOMVERSION2',
 'SYMPTOM3',
 'SYMPTOMVERSION3',
 'SYMPTOM4',
 'SYMPTOMVERSION4',
 'SYMPTOM5',
 'SYMPTOMVERSION5']

In [11]:
# Keep the report id and the five symptom names; the SYMPTOMVERSION* columns are left out
wanted_columns3 = vaer_symptoms_raw.loc[
    :, ['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
]
wanted_columns3.tail()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4707,983721,Death,,,,
4708,983766,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4709,983766,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4710,983919,Death,,,,
4711,985205,Death,Dizziness,Malaise,,


### 2021 VAERS Datasets-Merging

In [12]:
# Left-join the report details onto the COVID19 vaccine rows by VAERS_ID,
# so every filtered vaccine row is kept
merge_2data = filtered_data.merge(wanted_columns2, how="left", on="VAERS_ID")

# Preview the first and last two rows
display(merge_2data.head(2), merge_2data.tail(2))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
2842,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,,,,,,,,01/21/2021,01/01/2021,
2843,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,,,,,,,N,01/25/2021,01/26/2021,1.0


In [13]:
# Left-join the symptom columns onto the vaccine + report frame by VAERS_ID.
# A VAERS_ID can occupy several rows in the symptoms table, in which case the
# matching vaccine/report row is repeated once per symptom row.
merge_all = merge_2data.merge(wanted_columns3, how="left", on="VAERS_ID")

# Preview the first and last four rows
display(merge_all.head(4), merge_all.tail(4))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,...,,U,12/29/2020,01/01/2021,3.0,Appendicitis,Band neutrophil percentage increased,Surgery,White blood cell count increased,
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Chills,Complex regional pain syndrome,Fatigue,Headache,Joint range of motion decreased
2,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Myalgia,Pain in extremity,Peripheral swelling,X-ray abnormal,
3,916742,COVID19,PFIZER\BIONTECH,1,,MN,29.0,F,,,...,,U,12/29/2020,12/29/2020,0.0,Anaphylactic reaction,Blood test,Burning sensation,Central venous catheterisation,Dysphonia


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4563,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4564,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4565,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,...,,,01/21/2021,01/01/2021,,Death,,,,
4566,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,...,,N,01/25/2021,01/26/2021,1.0,Death,Dizziness,Malaise,,


### World Wide Dataset-Data Cleaning

In [2]:
# Path to the world-wide COVID data file
worldWideData = "dataUsed/WorldWideData.csv"

# Load the file; the ISO-8859-1 encoding is needed for it to read properly
world_raw = pd.read_csv(worldWideData, encoding="ISO-8859-1")

# Preview the first and last two rows
display(world_raw.head(2), world_raw.tail(2))

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
67893,ZWE,Africa,Zimbabwe,2021-02-08,34658.0,106.0,158.571,1339.0,13.0,15.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
67894,ZWE,Africa,Zimbabwe,2021-02-09,34781.0,123.0,138.143,1353.0,14.0,14.143,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In [3]:
# Collect the column labels of the raw world-wide frame into a plain list
columns4 = world_raw.columns.tolist()
columns4

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'people_fully_vaccinate

In [4]:
# Keep only the identification, case/death/test, vaccination and population columns
wanted_columns4 = world_raw.loc[
    :,
    [
        'iso_code', 'continent', 'location', 'date',
        'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
        'total_tests', 'positive_rate',
        'total_vaccinations', 'people_vaccinated',
        'people_fully_vaccinated', 'new_vaccinations',
        'population', 'median_age', 'aged_65_older', 'aged_70_older',
    ],
]

# Preview the first and last four rows
display(wanted_columns4.head(4), wanted_columns4.tail(4))

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,,,,,38928341.0,18.6,2.581,1.337
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
67891,ZWE,Africa,Zimbabwe,2021-02-06,34487.0,156.0,1316.0,13.0,335489.0,0.064,,,,,14862927.0,19.6,2.822,1.882
67892,ZWE,Africa,Zimbabwe,2021-02-07,34552.0,65.0,1326.0,10.0,336553.0,0.064,,,,,14862927.0,19.6,2.822,1.882
67893,ZWE,Africa,Zimbabwe,2021-02-08,34658.0,106.0,1339.0,13.0,,,,,,,14862927.0,19.6,2.822,1.882
67894,ZWE,Africa,Zimbabwe,2021-02-09,34781.0,123.0,1353.0,14.0,,,,,,,14862927.0,19.6,2.822,1.882


In [5]:
# Path to the country / US-state latitude and longitude lookup table
latlng = "dataUsed/world_country_and_usa_states_latitude_and_longitude_values.csv"

# Load the lookup table with pandas' default encoding and preview the first rows
latlng_raw = pd.read_csv(latlng)
latlng_raw.head(5)

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska
1,AE,23.424076,53.847818,United Arab Emirates,AL,32.318231,-86.902298,Alabama
2,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
3,AG,17.060816,-61.796428,Antigua and Barbuda,AZ,34.048928,-111.093731,Arizona
4,AI,18.220554,-63.068615,Anguilla,CA,36.778261,-119.417932,California


In [6]:
# Keep the country-level coordinates and rename "country" to "location",
# the column name used as the join key against the world-wide data
latlng_world = latlng_raw.loc[:, ["latitude", "longitude", "country"]].rename(
    columns={"country": "location"}
)
latlng_world.head(5)

Unnamed: 0,latitude,longitude,location
0,42.546245,1.601554,Andorra
1,23.424076,53.847818,United Arab Emirates
2,33.93911,67.709953,Afghanistan
3,17.060816,-61.796428,Antigua and Barbuda
4,18.220554,-63.068615,Anguilla


In [15]:
# Attach latitude/longitude to every world-wide row by country name.
# how="left" keeps all rows of wanted_columns4; a location with no match in
# latlng_world gets NaN coordinates.
merged_db = pd.merge(wanted_columns4, latlng_world, on="location", how="left")

# Show the merged frame directly; the former .copy() duplicated the whole
# frame only to display it and then discarded the copy.
merged_db


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older,latitude,longitude
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,,,,,38928341.0,18.6,2.581,1.337,33.939110,67.709953
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337,33.939110,67.709953
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337,33.939110,67.709953
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337,33.939110,67.709953
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337,33.939110,67.709953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67890,ZWE,Africa,Zimbabwe,2021-02-05,34331.0,160.0,1303.0,15.0,333403.0,0.073,,,,,14862927.0,19.6,2.822,1.882,-19.015438,29.154857
67891,ZWE,Africa,Zimbabwe,2021-02-06,34487.0,156.0,1316.0,13.0,335489.0,0.064,,,,,14862927.0,19.6,2.822,1.882,-19.015438,29.154857
67892,ZWE,Africa,Zimbabwe,2021-02-07,34552.0,65.0,1326.0,10.0,336553.0,0.064,,,,,14862927.0,19.6,2.822,1.882,-19.015438,29.154857
67893,ZWE,Africa,Zimbabwe,2021-02-08,34658.0,106.0,1339.0,13.0,,,,,,,14862927.0,19.6,2.822,1.882,-19.015438,29.154857


In [20]:
# Drop every row that has a missing value in ANY column; this removes rows
# lacking any case/test/vaccination figure, not only rows without coordinates
cleaned_df = merged_db.dropna(axis=0, how="any")

In [21]:
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """Build a GeoJSON-style FeatureCollection dict with one Point feature per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Must contain the ``lat`` and ``lon`` columns and every
        column named in ``properties``.
    properties : iterable of str
        Column names copied into each feature's ``properties`` mapping.
    lat, lon : str
        Names of the latitude and longitude columns.

    Returns
    -------
    dict
        ``{'type': 'FeatureCollection', 'features': [...]}``. Each feature has
        the keys ``type``, ``properties`` and ``geometry``; the Point
        coordinates are ordered ``[lon, lat]``.

    Raises
    ------
    KeyError
        If a requested column is missing and ``df`` has at least one row.

    Notes
    -----
    Values are converted to builtin Python scalars so the result can be passed
    to ``json.dump`` even when the frame holds only NumPy integers. Missing
    values (NaN) are passed through unchanged.
    """
    # Materialise once so a one-shot iterable is not exhausted after the first row.
    wanted = list(properties)
    features = []
    for _, row in df.iterrows():
        # Series.tolist() boxes NumPy scalars (e.g. numpy.int64) into builtin
        # Python values, which the json module can serialise.
        record = dict(zip(row.index, row.tolist()))
        features.append({
            'type': 'Feature',
            'properties': {prop: record[prop] for prop in wanted},
            'geometry': {'type': 'Point',
                         'coordinates': [record[lon], record[lat]]},
        })
    return {'type': 'FeatureCollection', 'features': features}

In [22]:
# Columns copied into each feature's "properties" mapping
cols = [
    'iso_code', 'continent', 'location', 'date',
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_tests', 'positive_rate',
    'total_vaccinations', 'people_vaccinated',
    'people_fully_vaccinated', 'new_vaccinations',
    'population', 'median_age', 'aged_65_older', 'aged_70_older',
]

# Convert the cleaned rows into a FeatureCollection dict and display it
geojson = df_to_geojson(df=cleaned_df, properties=cols)
geojson

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {'iso_code': 'ARG',
    'continent': 'South America',
    'location': 'Argentina',
    'date': '2021-01-21',
    'total_cases': 1843077.0,
    'new_cases': 11396.0,
    'total_deaths': 46355.0,
    'new_deaths': 139.0,
    'total_tests': 5017855.0,
    'positive_rate': 0.257,
    'total_vaccinations': 265724.0,
    'people_vaccinated': 249372.0,
    'people_fully_vaccinated': 16352.0,
    'new_vaccinations': 17791.0,
    'population': 45195777.0,
    'median_age': 31.9,
    'aged_65_older': 11.198,
    'aged_70_older': 7.441},
   'geometry': {'type': 'Point',
    'coordinates': [-63.616671999999994, -38.416097]}},
  {'type': 'Feature',
   'properties': {'iso_code': 'ARG',
    'continent': 'South America',
    'location': 'Argentina',
    'date': '2021-01-22',
    'total_cases': 1853830.0,
    'new_cases': 10753.0,
    'total_deaths': 46575.0,
    'new_deaths': 220.0,
    'total_tests': 5057091.0,
    'posi

In [23]:
import json

# The output is a JavaScript assignment: the literal prefix "var dataset = "
# followed by the FeatureCollection pretty-printed with a 2-space indent
output_filename = 'dataset.geojson'
with open(output_filename, mode='w') as output_file:
    output_file.write('var dataset = ')
    output_file.write(json.dumps(geojson, indent=2))

### Export Cleaned Datasets to SQLite

In [17]:
## Build an SQLAlchemy engine for the VAERS SQLite file (statement echoing off),
## then open a connection from it
vaers_db_url = 'sqlite:///VAERS.db'
engine = create_engine(vaers_db_url, echo=False)
sqlite_connection = engine.connect()

In [18]:
## Name of the table that receives the merged VAERS data
sqlite_vaers = "2021VAERS"
## if_exists='replace' rebuilds the table on every run; with 'fail' any re-run of
## the notebook raised ValueError because the table already existed in the database
merge_all.to_sql(sqlite_vaers, sqlite_connection, if_exists='replace')

In [19]:
## Close the connection opened from the current engine now that the table has been written
sqlite_connection.close()

In [20]:
## Build an SQLAlchemy engine for the world-wide SQLite file (statement echoing off),
## then open a connection from it
world_db_url = 'sqlite:///worldWideData.db'
engine = create_engine(world_db_url, echo=False)
sqlite_connection = engine.connect()

In [21]:
## Name of the table that receives the filtered world-wide data
sqlite_worldWideData = "worldWideData"
## if_exists='replace' rebuilds the table on every run; with 'fail' any re-run of
## the notebook raised ValueError because the table already existed in the database
wanted_columns4.to_sql(sqlite_worldWideData, sqlite_connection, if_exists='replace')

In [22]:
## Close the connection opened from the current engine now that the table has been written
sqlite_connection.close()