## Filtering and Merging the 2021 VAERS Datasets & Filtering the Global Vaccination Dataset

In [1]:
# Import dependencies
import pandas as pd

from sqlalchemy import create_engine

### 2021 VAERS Datasets - Data Filtering and Cleaning

#### 2021VAERSVAX.csv: Filter the data to vaccination type = COVID19 and remove unneeded columns

In [2]:
# Location of the 2021 VAERS vaccine file
VAERSvax = "dataUsed/2021VAERSVAX.csv"

# Load the CSV; the encoding is given explicitly because it is needed
# for the file to read properly
VAERSvax_raw = pd.read_csv(filepath_or_buffer=VAERSvax, encoding="ISO-8859-1")

# Preview the first and last five rows
display(VAERSvax_raw.head())
display(VAERSvax_raw.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916710,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916741,COVID19,PFIZER\BIONTECH,EH9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
2,916742,COVID19,PFIZER\BIONTECH,,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916746,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
4,916772,COVID19,PFIZER\BIONTECH,EJ1685,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
3009,983720,COVID19,MODERNA,039K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3010,983721,COVID19,MODERNA,039K20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3011,983766,COVID19,MODERNA,013L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3012,983919,COVID19,MODERNA,030L20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3013,985205,COVID19,MODERNA,029L20A,1,IM,UN,COVID19 (COVID19 (MODERNA))


In [3]:
# Collect the column labels of the raw vaccine table into a plain list
columns = VAERSvax_raw.columns.tolist()
columns

['VAERS_ID',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [4]:
# Keep only the selected vaccine fields (VAX_LOT, VAX_ROUTE and VAX_NAME are left out)
wanted_columns = VAERSvax_raw.loc[
    :, ["VAERS_ID", "VAX_TYPE", "VAX_MANU", "VAX_DOSE_SERIES", "VAX_SITE"]
]

# Preview both ends of the trimmed table
display(wanted_columns.head())
display(wanted_columns.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
3009,983720,COVID19,MODERNA,1,LA
3010,983721,COVID19,MODERNA,1,RA
3011,983766,COVID19,MODERNA,1,RA
3012,983919,COVID19,MODERNA,1,LA
3013,985205,COVID19,MODERNA,1,UN


In [5]:
# Keep the rows whose VAX_TYPE is exactly "COVID19" and renumber them from 0
filtered_data = (
    wanted_columns
    .loc[wanted_columns["VAX_TYPE"].eq("COVID19")]
    .reset_index(drop=True)
)

# Preview both ends of the filtered table
display(filtered_data.head())
display(filtered_data.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
2839,983720,COVID19,MODERNA,1,LA
2840,983721,COVID19,MODERNA,1,RA
2841,983766,COVID19,MODERNA,1,RA
2842,983919,COVID19,MODERNA,1,LA
2843,985205,COVID19,MODERNA,1,UN


#### 2021VAERSData.csv

In [6]:
# Location of the 2021 VAERS report-level data file
VAERSData = "dataUsed/2021VAERSData.csv"

# Load the CSV; the encoding is given explicitly because it is needed
# for the file to read properly
vaer_data_raw = pd.read_csv(filepath_or_buffer=VAERSData, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_data_raw.head(n=2))
display(vaer_data_raw.tail(n=2))

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,916710,01/01/2021,MO,23.0,23.0,,F,,"Acute appendicitis, onset morning of 1/1/2021 ...",,...,,Hypothyroidism,,,2,01/01/2021,,,Y,NKDA
1,916741,01/01/2021,AR,68.0,68.0,,F,,"on dec 22 I felt some myalgias, chills, fatigu...",,...,had surgery R hand for advanced arthritis 11/1...,Rheumatoid arthritis - mostly affecting R wris...,,,2,01/01/2021,,,,bee stings


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
2944,983919,01/28/2021,FL,69.0,69.0,,M,,death,Y,...,,,,,2,01/28/2021,,,,
2945,985205,01/29/2021,OH,75.0,75.0,,M,,Patient was feeling dizzy and under the weathe...,Y,...,No,,,,2,01/29/2021,,,,No


In [7]:
# Collect the column labels of the raw report table into a plain list
columns2 = vaer_data_raw.columns.tolist()
columns2

['VAERS_ID',
 'RECVDATE',
 'STATE',
 'AGE_YRS',
 'CAGE_YR',
 'CAGE_MO',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'HOSPDAYS',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'NUMDAYS',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'FORM_VERS',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES']

In [8]:
# Keep the report id plus the selected demographic, outcome and date fields
wanted_columns2 = vaer_data_raw.loc[
    :,
    [
        "VAERS_ID", "STATE", "AGE_YRS", "SEX",
        "DIED", "DATEDIED", "L_THREAT", "ER_VISIT",
        "HOSPITAL", "HOSPDAYS", "X_STAY", "DISABLE",
        "RECOVD", "VAX_DATE", "ONSET_DATE", "NUMDAYS",
    ],
]
wanted_columns2.head()

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0
2,916742,MN,29.0,F,,,Y,,Y,4.0,,,U,12/29/2020,12/29/2020,0.0
3,916746,TX,49.0,F,,,Y,,,,,,Y,12/28/2020,12/28/2020,0.0
4,916772,GA,55.0,M,,,,,,,,,U,12/22/2020,12/26/2020,4.0


#### 2021VAERSSYMPTOMS.csv

In [9]:
# Location of the 2021 VAERS symptoms file
VAERSsymptoms = "dataUsed/2021VAERSSYMPTOMS.csv"

# Load the CSV; the encoding is given explicitly because it is needed
# for the file to read properly
vaer_symptoms_raw = pd.read_csv(filepath_or_buffer=VAERSsymptoms, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_symptoms_raw.head(n=2))
display(vaer_symptoms_raw.tail(n=2))

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
0,916710,Appendicitis,23.1,Band neutrophil percentage increased,23.1,Surgery,23.1,White blood cell count increased,23.1,,
1,916741,Chills,23.1,Complex regional pain syndrome,23.1,Fatigue,23.1,Headache,23.1,Joint range of motion decreased,23.1


Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
4710,983919,Death,23.1,,,,,,,,
4711,985205,Death,23.1,Dizziness,23.1,Malaise,23.1,,,,


In [10]:
# Collect the column labels of the raw symptoms table into a plain list
columns3 = vaer_symptoms_raw.columns.tolist()
columns3

['VAERS_ID',
 'SYMPTOM1',
 'SYMPTOMVERSION1',
 'SYMPTOM2',
 'SYMPTOMVERSION2',
 'SYMPTOM3',
 'SYMPTOMVERSION3',
 'SYMPTOM4',
 'SYMPTOMVERSION4',
 'SYMPTOM5',
 'SYMPTOMVERSION5']

In [11]:
# Keep the report id and the five symptom names; the SYMPTOMVERSION* columns are left out
wanted_columns3 = vaer_symptoms_raw.loc[
    :, ["VAERS_ID", "SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5"]
]
wanted_columns3.tail()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4707,983721,Death,,,,
4708,983766,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4709,983766,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4710,983919,Death,,,,
4711,985205,Death,Dizziness,Malaise,,


### 2021 VAERS Datasets - Merging

In [12]:
# Left-join the selected report fields onto the COVID19 vaccine rows, keyed on VAERS_ID
merge_2data = filtered_data.merge(wanted_columns2, how="left", on="VAERS_ID")

# Preview the first and last two merged rows
display(merge_2data.head(n=2))
display(merge_2data.tail(n=2))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
2842,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,,,,,,,,01/21/2021,01/01/2021,
2843,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,,,,,,,N,01/25/2021,01/26/2021,1.0


In [13]:
# Left-join the symptom columns onto the vaccine+report table, keyed on VAERS_ID.
# A VAERS_ID that has several symptom rows yields one merged row per symptom row.
merge_all = merge_2data.merge(wanted_columns3, how="left", on="VAERS_ID")

# Preview the first and last four merged rows
display(merge_all.head(n=4))
display(merge_all.tail(n=4))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,...,,U,12/29/2020,01/01/2021,3.0,Appendicitis,Band neutrophil percentage increased,Surgery,White blood cell count increased,
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Chills,Complex regional pain syndrome,Fatigue,Headache,Joint range of motion decreased
2,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Myalgia,Pain in extremity,Peripheral swelling,X-ray abnormal,
3,916742,COVID19,PFIZER\BIONTECH,1,,MN,29.0,F,,,...,,U,12/29/2020,12/29/2020,0.0,Anaphylactic reaction,Blood test,Burning sensation,Central venous catheterisation,Dysphonia


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4563,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4564,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4565,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,...,,,01/21/2021,01/01/2021,,Death,,,,
4566,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,...,,N,01/25/2021,01/26/2021,1.0,Death,Dizziness,Malaise,,


### World Wide Dataset - Data Cleaning

In [14]:
# Location of the world-wide COVID data file
worldWideData = "dataUsed/WorldWideData.csv"

# Load the CSV; the encoding is given explicitly because it is needed
# for the file to read properly
world_raw = pd.read_csv(filepath_or_buffer=worldWideData, encoding="ISO-8859-1")

# Preview the first and last two rows
display(world_raw.head(n=2))
display(world_raw.tail(n=2))

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
67893,ZWE,Africa,Zimbabwe,2021-02-08,34658.0,106.0,158.571,1339.0,13.0,15.0,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571
67894,ZWE,Africa,Zimbabwe,2021-02-09,34781.0,123.0,138.143,1353.0,14.0,14.143,...,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In [16]:
# Collect the column labels of the raw world-wide table into a plain list
columns4 = world_raw.columns.tolist()
columns4

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'people_fully_vaccinate

In [17]:
# Keep the identifying fields plus the selected case, death, testing,
# vaccination and population/age columns
wanted_columns4 = world_raw.loc[
    :,
    [
        "iso_code", "continent", "location", "date",
        "total_cases", "new_cases",
        "total_deaths", "new_deaths",
        "total_tests", "positive_rate",
        "total_vaccinations", "people_vaccinated",
        "people_fully_vaccinated", "new_vaccinations",
        "population", "median_age", "aged_65_older", "aged_70_older",
    ],
]

# Preview the first and last four rows
display(wanted_columns4.head(n=4))
display(wanted_columns4.tail(n=4))

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,,,,,38928341.0,18.6,2.581,1.337
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,,,,,38928341.0,18.6,2.581,1.337


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
67891,ZWE,Africa,Zimbabwe,2021-02-06,34487.0,156.0,1316.0,13.0,335489.0,0.064,,,,,14862927.0,19.6,2.822,1.882
67892,ZWE,Africa,Zimbabwe,2021-02-07,34552.0,65.0,1326.0,10.0,336553.0,0.064,,,,,14862927.0,19.6,2.822,1.882
67893,ZWE,Africa,Zimbabwe,2021-02-08,34658.0,106.0,1339.0,13.0,,,,,,,14862927.0,19.6,2.822,1.882
67894,ZWE,Africa,Zimbabwe,2021-02-09,34781.0,123.0,1353.0,14.0,,,,,,,14862927.0,19.6,2.822,1.882


In [18]:
# Distinct date strings in the world-wide table, in order of first appearance
worldwidedate = wanted_columns4.loc[:, "date"]
worldwidedatelist = worldwidedate.unique().tolist()
worldwidedatelist

['2020-02-24',
 '2020-02-25',
 '2020-02-26',
 '2020-02-27',
 '2020-02-28',
 '2020-02-29',
 '2020-03-01',
 '2020-03-02',
 '2020-03-03',
 '2020-03-04',
 '2020-03-05',
 '2020-03-06',
 '2020-03-07',
 '2020-03-08',
 '2020-03-09',
 '2020-03-10',
 '2020-03-11',
 '2020-03-12',
 '2020-03-13',
 '2020-03-14',
 '2020-03-15',
 '2020-03-16',
 '2020-03-17',
 '2020-03-18',
 '2020-03-19',
 '2020-03-20',
 '2020-03-21',
 '2020-03-22',
 '2020-03-23',
 '2020-03-24',
 '2020-03-25',
 '2020-03-26',
 '2020-03-27',
 '2020-03-28',
 '2020-03-29',
 '2020-03-30',
 '2020-03-31',
 '2020-04-01',
 '2020-04-02',
 '2020-04-03',
 '2020-04-04',
 '2020-04-05',
 '2020-04-06',
 '2020-04-07',
 '2020-04-08',
 '2020-04-09',
 '2020-04-10',
 '2020-04-11',
 '2020-04-12',
 '2020-04-13',
 '2020-04-14',
 '2020-04-15',
 '2020-04-16',
 '2020-04-17',
 '2020-04-18',
 '2020-04-19',
 '2020-04-20',
 '2020-04-21',
 '2020-04-22',
 '2020-04-23',
 '2020-04-24',
 '2020-04-25',
 '2020-04-26',
 '2020-04-27',
 '2020-04-28',
 '2020-04-29',
 '2020-04-

In [21]:
# Keep only fully populated rows: a row is discarded as soon as any of the
# selected columns is NaN, so the result is limited to rows where the
# vaccination fields (and every other selected field) are all present
vaccination_info = wanted_columns4.dropna(axis="index", how="any")
vaccination_info

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
2809,ARG,South America,Argentina,2021-01-21,1843077.0,11396.0,46355.0,139.0,5017855.0,0.257,265724.0,249372.0,16352.0,17791.0,45195777.0,31.9,11.198,7.441
2810,ARG,South America,Argentina,2021-01-22,1853830.0,10753.0,46575.0,220.0,5057091.0,0.258,279602.0,254456.0,25146.0,13878.0,45195777.0,31.9,11.198,7.441
2811,ARG,South America,Argentina,2021-01-23,1862192.0,8362.0,46737.0,162.0,5085646.0,0.264,288064.0,258876.0,29188.0,8462.0,45195777.0,31.9,11.198,7.441
2812,ARG,South America,Argentina,2021-01-24,1867223.0,5031.0,46827.0,90.0,5105622.0,0.260,292023.0,260036.0,31987.0,3959.0,45195777.0,31.9,11.198,7.441
2813,ARG,South America,Argentina,2021-01-25,1874801.0,7578.0,47034.0,207.0,5144717.0,0.262,292386.0,260122.0,32264.0,363.0,45195777.0,31.9,11.198,7.441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64338,GBR,Europe,United Kingdom,2021-02-01,3846851.0,18668.0,106774.0,407.0,71011933.0,0.036,10143511.0,9646715.0,496796.0,352935.0,67886004.0,40.8,18.517,12.527
64339,GBR,Europe,United Kingdom,2021-02-02,3863757.0,16906.0,108225.0,1451.0,71642534.0,0.036,10520433.0,10021471.0,498962.0,376922.0,67886004.0,40.8,18.517,12.527
64340,GBR,Europe,United Kingdom,2021-02-03,3882972.0,19215.0,109547.0,1322.0,72464146.0,0.034,10992444.0,10490487.0,501957.0,472011.0,67886004.0,40.8,18.517,12.527
64341,GBR,Europe,United Kingdom,2021-02-04,3903706.0,20734.0,110462.0,915.0,73277874.0,0.032,11477040.0,10971047.0,505993.0,484596.0,67886004.0,40.8,18.517,12.527


In [22]:
# Distinct date strings that survive the dropna step, in order of first appearance
dorpnadate = vaccination_info.loc[:, "date"]
dorpnadatelist = dorpnadate.unique().tolist()
dorpnadatelist

['2021-01-21',
 '2021-01-22',
 '2021-01-23',
 '2021-01-24',
 '2021-01-25',
 '2021-01-26',
 '2021-01-27',
 '2021-01-28',
 '2021-01-29',
 '2021-01-30',
 '2021-01-18',
 '2021-01-19',
 '2021-01-20',
 '2021-01-31',
 '2021-02-01',
 '2021-02-02',
 '2021-02-03',
 '2021-02-04',
 '2021-02-05',
 '2021-02-06',
 '2021-01-14',
 '2021-01-15',
 '2021-01-16',
 '2021-01-17',
 '2021-02-07',
 '2021-01-04',
 '2021-01-05',
 '2021-01-06',
 '2021-01-07',
 '2021-01-08',
 '2021-01-09',
 '2021-01-10',
 '2021-01-11',
 '2021-01-12',
 '2021-01-13']

In [23]:
# Wrap the date strings in a one-column frame (the column gets the default label 0)
dorpnadateDf = pd.DataFrame(data=dorpnadatelist)
dorpnadateDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       35 non-null     object
dtypes: object(1)
memory usage: 408.0+ bytes


In [24]:
# Cast the single date column (label 0) from strings to datetime64[ns]
dorpnadateDf = dorpnadateDf.astype({0: "datetime64[ns]"})
dorpnadateDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   0       35 non-null     datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 408.0 bytes


In [25]:
# Sort the dates in ascending order and renumber the rows from 0
dorpnadateorganized = dorpnadateDf.sort_values(by=0).reset_index(drop=True)
dorpnadateorganized

Unnamed: 0,0
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08
5,2021-01-09
6,2021-01-10
7,2021-01-11
8,2021-01-12
9,2021-01-13


In [28]:
# Drop the rows whose labels sit at positions 24 through 34 of the sorted frame,
# then renumber what is left.
# NOTE(review): the 24:35 bounds are hard-coded against the current row count —
# presumably a date cutoff is intended; confirm and consider filtering by date.
neededDates = (
    dorpnadateorganized
    .drop(index=dorpnadateorganized.index[slice(24, 35)])
    .reset_index(drop=True)
)
neededDates

Unnamed: 0,0
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08
5,2021-01-09
6,2021-01-10
7,2021-01-11
8,2021-01-12
9,2021-01-13


#### Filter By US Data

In [33]:
# Select the rows whose location is "United States" and renumber them from 0
us_info = (
    wanted_columns4
    .loc[wanted_columns4["location"].eq("United States")]
    .reset_index(drop=True)
)
us_info

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older
0,USA,North America,United States,2020-01-22,1.0,,,,,,,,,,331002647.0,38.3,15.413,9.732
1,USA,North America,United States,2020-01-23,1.0,0.0,,,,,,,,,331002647.0,38.3,15.413,9.732
2,USA,North America,United States,2020-01-24,2.0,1.0,,,,,,,,,331002647.0,38.3,15.413,9.732
3,USA,North America,United States,2020-01-25,2.0,0.0,,,,,,,,,331002647.0,38.3,15.413,9.732
4,USA,North America,United States,2020-01-26,5.0,3.0,,,,,,,,,331002647.0,38.3,15.413,9.732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,USA,North America,United States,2021-02-05,26813772.0,133558.0,459555.0,3674.0,,,36819212.0,28909497.0,7503864.0,1615502.0,331002647.0,38.3,15.413,9.732
381,USA,North America,United States,2021-02-06,26917787.0,104015.0,462169.0,2614.0,,,39037964.0,30250964.0,8317180.0,2218752.0,331002647.0,38.3,15.413,9.732
382,USA,North America,United States,2021-02-07,27007368.0,89581.0,463476.0,1307.0,,,41210937.0,31579100.0,9147185.0,2172973.0,331002647.0,38.3,15.413,9.732
383,USA,North America,United States,2021-02-08,27097095.0,89727.0,465072.0,1596.0,,,42417617.0,32340146.0,9518015.0,1206680.0,331002647.0,38.3,15.413,9.732


In [34]:
# Summarize the US subset: row count, per-column non-null counts and dtypes
us_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   iso_code                 385 non-null    object 
 1   continent                385 non-null    object 
 2   location                 385 non-null    object 
 3   date                     385 non-null    object 
 4   total_cases              385 non-null    float64
 5   new_cases                384 non-null    float64
 6   total_deaths             347 non-null    float64
 7   new_deaths               347 non-null    float64
 8   total_tests              339 non-null    float64
 9   positive_rate            48 non-null     float64
 10  total_vaccinations       39 non-null     float64
 11  people_vaccinated        38 non-null     float64
 12  people_fully_vaccinated  24 non-null     float64
 13  new_vaccinations         30 non-null     float64
 14  population               3

In [35]:
# Parse the string "date" column into datetime64[ns] values
us_info_covert = us_info["date"].astype("datetime64[ns]")

# Build a copy of the US table carrying the parsed dates as CONVERTED_DATE;
# us_info itself is left untouched
us_info_copy = us_info.assign(CONVERTED_DATE=us_info_covert)
us_info_copy

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older,CONVERTED_DATE
0,USA,North America,United States,2020-01-22,1.0,,,,,,,,,,331002647.0,38.3,15.413,9.732,2020-01-22
1,USA,North America,United States,2020-01-23,1.0,0.0,,,,,,,,,331002647.0,38.3,15.413,9.732,2020-01-23
2,USA,North America,United States,2020-01-24,2.0,1.0,,,,,,,,,331002647.0,38.3,15.413,9.732,2020-01-24
3,USA,North America,United States,2020-01-25,2.0,0.0,,,,,,,,,331002647.0,38.3,15.413,9.732,2020-01-25
4,USA,North America,United States,2020-01-26,5.0,3.0,,,,,,,,,331002647.0,38.3,15.413,9.732,2020-01-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,USA,North America,United States,2021-02-05,26813772.0,133558.0,459555.0,3674.0,,,36819212.0,28909497.0,7503864.0,1615502.0,331002647.0,38.3,15.413,9.732,2021-02-05
381,USA,North America,United States,2021-02-06,26917787.0,104015.0,462169.0,2614.0,,,39037964.0,30250964.0,8317180.0,2218752.0,331002647.0,38.3,15.413,9.732,2021-02-06
382,USA,North America,United States,2021-02-07,27007368.0,89581.0,463476.0,1307.0,,,41210937.0,31579100.0,9147185.0,2172973.0,331002647.0,38.3,15.413,9.732,2021-02-07
383,USA,North America,United States,2021-02-08,27097095.0,89727.0,465072.0,1596.0,,,42417617.0,32340146.0,9518015.0,1206680.0,331002647.0,38.3,15.413,9.732,2021-02-08


In [36]:
# Turn the retained dates (column 0 of neededDates) into a plain list of Timestamps
date_list = neededDates[0].tolist()
date_list

[Timestamp('2021-01-04 00:00:00'),
 Timestamp('2021-01-05 00:00:00'),
 Timestamp('2021-01-06 00:00:00'),
 Timestamp('2021-01-07 00:00:00'),
 Timestamp('2021-01-08 00:00:00'),
 Timestamp('2021-01-09 00:00:00'),
 Timestamp('2021-01-10 00:00:00'),
 Timestamp('2021-01-11 00:00:00'),
 Timestamp('2021-01-12 00:00:00'),
 Timestamp('2021-01-13 00:00:00'),
 Timestamp('2021-01-14 00:00:00'),
 Timestamp('2021-01-15 00:00:00'),
 Timestamp('2021-01-16 00:00:00'),
 Timestamp('2021-01-17 00:00:00'),
 Timestamp('2021-01-18 00:00:00'),
 Timestamp('2021-01-19 00:00:00'),
 Timestamp('2021-01-20 00:00:00'),
 Timestamp('2021-01-21 00:00:00'),
 Timestamp('2021-01-22 00:00:00'),
 Timestamp('2021-01-23 00:00:00'),
 Timestamp('2021-01-24 00:00:00'),
 Timestamp('2021-01-25 00:00:00'),
 Timestamp('2021-01-26 00:00:00'),
 Timestamp('2021-01-27 00:00:00')]

In [37]:
# Keep the US rows whose CONVERTED_DATE appears in date_list and renumber them from 0
us_info_filter = (
    us_info_copy
    .loc[us_info_copy["CONVERTED_DATE"].isin(date_list)]
    .reset_index(drop=True)
)
us_info_filter

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population,median_age,aged_65_older,aged_70_older,CONVERTED_DATE
0,USA,North America,United States,2021-01-04,20906021.0,183790.0,354873.0,2066.0,256512858.0,0.146,4563260.0,4563260.0,,,331002647.0,38.3,15.413,9.732,2021-01-04
1,USA,North America,United States,2021-01-05,21139547.0,233526.0,358539.0,3666.0,258344484.0,,4836469.0,4836469.0,,273209.0,331002647.0,38.3,15.413,9.732,2021-01-05
2,USA,North America,United States,2021-01-06,21393464.0,253917.0,362432.0,3893.0,260522788.0,,5306797.0,5306797.0,,470328.0,331002647.0,38.3,15.413,9.732,2021-01-06
3,USA,North America,United States,2021-01-07,21670202.0,276738.0,366377.0,3945.0,262643869.0,,5919418.0,5919418.0,,612621.0,331002647.0,38.3,15.413,9.732,2021-01-07
4,USA,North America,United States,2021-01-08,21962246.0,292044.0,370408.0,4031.0,264672008.0,,6688231.0,6688231.0,,768813.0,331002647.0,38.3,15.413,9.732,2021-01-08
5,USA,North America,United States,2021-01-09,22224220.0,261974.0,373653.0,3245.0,266312503.0,,,,,,331002647.0,38.3,15.413,9.732,2021-01-09
6,USA,North America,United States,2021-01-10,22437501.0,213281.0,375516.0,1863.0,267422755.0,,,,,,331002647.0,38.3,15.413,9.732,2021-01-10
7,USA,North America,United States,2021-01-11,22651464.0,213963.0,377561.0,2045.0,268850460.0,0.12,8987322.0,8987322.0,,,331002647.0,38.3,15.413,9.732,2021-01-11
8,USA,North America,United States,2021-01-12,22877702.0,226238.0,381993.0,4432.0,270739758.0,,9327138.0,9327138.0,,339816.0,331002647.0,38.3,15.413,9.732,2021-01-12
9,USA,North America,United States,2021-01-13,23107573.0,229871.0,385961.0,3968.0,272697680.0,,10278462.0,,,951324.0,331002647.0,38.3,15.413,9.732,2021-01-13


### Export Cleaned Datasets to SQLite

In [17]:
## Build a SQLAlchemy engine for the SQLite file VAERS.db (SQL echo off),
## then open a connection on it for the export below
engine = create_engine("sqlite:///VAERS.db", echo=False)
sqlite_connection = engine.connect()

In [18]:
## Name of the destination table for the merged VAERS data
sqlite_vaers = "2021VAERS"

## Write the merged VAERS frame to SQLite through the open connection.
## if_exists='replace' keeps this cell re-runnable: with 'fail', any second run
## against an existing database raises ValueError because the table is already
## there. The table is dropped and rewritten from merge_all each time.
merge_all.to_sql(sqlite_vaers, sqlite_connection, if_exists='replace')

In [19]:
## Close the database connection used for the VAERS export
sqlite_connection.close()

In [20]:
## Build a SQLAlchemy engine for the SQLite file worldWideData.db (SQL echo off),
## then open a connection on it for the export below
engine = create_engine("sqlite:///worldWideData.db", echo=False)
sqlite_connection = engine.connect()

In [21]:
## Name of the destination table for the trimmed world-wide data
sqlite_worldWideData = "worldWideData"

## Write the selected world-wide columns to SQLite through the open connection.
## if_exists='replace' keeps this cell re-runnable: with 'fail', any second run
## against an existing database raises ValueError because the table is already
## there. The table is dropped and rewritten from wanted_columns4 each time.
wanted_columns4.to_sql(sqlite_worldWideData, sqlite_connection, if_exists='replace')

In [22]:
## Close the database connection used for the worldWideData export
sqlite_connection.close()