## Filtering and Merging 2021VAERS Datasets & Filtering Global Vaccination Dataset

In [1]:
# Import dependencies
import pandas as pd

from sqlalchemy import create_engine

### 2021 VAERS Datasets-Data Filtering and Cleaning

#### 2021VAERSVAX.csv: Filter the data based on vaccination type = COVID19,  and remove unneeded columns

In [2]:
# Path to the 2021 VAERS vaccine CSV
VAERSvax = "dataUsed/2021VAERSVAX.csv"

# Load the file; ISO-8859-1 is passed explicitly because the file
# does not read properly without it
VAERSvax_raw = pd.read_csv(filepath_or_buffer=VAERSvax, encoding="ISO-8859-1")

# Preview the first and last five rows
display(VAERSvax_raw.head(), VAERSvax_raw.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916710,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916741,COVID19,PFIZER\BIONTECH,EH9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
2,916742,COVID19,PFIZER\BIONTECH,,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916746,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
4,916772,COVID19,PFIZER\BIONTECH,EJ1685,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
3009,983720,COVID19,MODERNA,039K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3010,983721,COVID19,MODERNA,039K20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3011,983766,COVID19,MODERNA,013L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
3012,983919,COVID19,MODERNA,030L20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
3013,985205,COVID19,MODERNA,029L20A,1,IM,UN,COVID19 (COVID19 (MODERNA))


In [3]:
# Column labels of the raw vaccine table, as a plain Python list
columns = VAERSvax_raw.columns.tolist()
columns

['VAERS_ID',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [4]:
# Keep only the vaccine columns used later in the notebook
# (VAX_LOT, VAX_ROUTE and VAX_NAME are left out)
wanted_columns = VAERSvax_raw.loc[
    :, ["VAERS_ID", "VAX_TYPE", "VAX_MANU", "VAX_DOSE_SERIES", "VAX_SITE"]
]

# Preview the first and last five rows
display(wanted_columns.head(), wanted_columns.tail())

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
3009,983720,COVID19,MODERNA,1,LA
3010,983721,COVID19,MODERNA,1,RA
3011,983766,COVID19,MODERNA,1,RA
3012,983919,COVID19,MODERNA,1,LA
3013,985205,COVID19,MODERNA,1,UN


In [5]:
# Keep only the rows whose VAX_TYPE is COVID19 and renumber them 0..n-1.
# The filter and the index reset are chained so that `filtered_data` is a
# freshly built frame; nothing is mutated in place on the filtered slice,
# and re-running the cell always rebuilds the same result from
# `wanted_columns`.
is_covid = wanted_columns["VAX_TYPE"] == "COVID19"
filtered_data = wanted_columns.loc[is_covid].reset_index(drop=True)

# Preview the first and last five rows
display(filtered_data.head(5))
display(filtered_data.tail(5))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
0,916710,COVID19,MODERNA,1,LA
1,916741,COVID19,PFIZER\BIONTECH,1,LA
2,916742,COVID19,PFIZER\BIONTECH,1,
3,916746,COVID19,MODERNA,1,LA
4,916772,COVID19,PFIZER\BIONTECH,UNK,LA


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE
2839,983720,COVID19,MODERNA,1,LA
2840,983721,COVID19,MODERNA,1,RA
2841,983766,COVID19,MODERNA,1,RA
2842,983919,COVID19,MODERNA,1,LA
2843,985205,COVID19,MODERNA,1,UN


#### 2021VAERSData.csv

In [6]:
# Path to the 2021 VAERS report-level CSV
VAERSData = "dataUsed/2021VAERSData.csv"

# Load the file; ISO-8859-1 is passed explicitly because the file
# does not read properly without it
vaer_data_raw = pd.read_csv(filepath_or_buffer=VAERSData, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_data_raw.head(n=2), vaer_data_raw.tail(n=2))

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,916710,01/01/2021,MO,23.0,23.0,,F,,"Acute appendicitis, onset morning of 1/1/2021 ...",,...,,Hypothyroidism,,,2,01/01/2021,,,Y,NKDA
1,916741,01/01/2021,AR,68.0,68.0,,F,,"on dec 22 I felt some myalgias, chills, fatigu...",,...,had surgery R hand for advanced arthritis 11/1...,Rheumatoid arthritis - mostly affecting R wris...,,,2,01/01/2021,,,,bee stings


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
2944,983919,01/28/2021,FL,69.0,69.0,,M,,death,Y,...,,,,,2,01/28/2021,,,,
2945,985205,01/29/2021,OH,75.0,75.0,,M,,Patient was feeling dizzy and under the weathe...,Y,...,No,,,,2,01/29/2021,,,,No


In [7]:
# Column labels of the raw report table, as a plain Python list
columns2 = vaer_data_raw.columns.tolist()
columns2

['VAERS_ID',
 'RECVDATE',
 'STATE',
 'AGE_YRS',
 'CAGE_YR',
 'CAGE_MO',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'HOSPDAYS',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'NUMDAYS',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'FORM_VERS',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES']

In [8]:
# Keep the report ID, demographics, outcome flags and date columns;
# free-text and administrative columns are left out
wanted_columns2 = vaer_data_raw.loc[
    :,
    [
        "VAERS_ID",
        "STATE",
        "AGE_YRS",
        "SEX",
        "DIED",
        "DATEDIED",
        "L_THREAT",
        "ER_VISIT",
        "HOSPITAL",
        "HOSPDAYS",
        "X_STAY",
        "DISABLE",
        "RECOVD",
        "VAX_DATE",
        "ONSET_DATE",
        "NUMDAYS",
    ],
]
wanted_columns2.head()

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0
2,916742,MN,29.0,F,,,Y,,Y,4.0,,,U,12/29/2020,12/29/2020,0.0
3,916746,TX,49.0,F,,,Y,,,,,,Y,12/28/2020,12/28/2020,0.0
4,916772,GA,55.0,M,,,,,,,,,U,12/22/2020,12/26/2020,4.0


#### 2021VAERSSymptoms.csv

In [9]:
# Path to the 2021 VAERS symptoms CSV
VAERSsymptoms = "dataUsed/2021VAERSSYMPTOMS.csv"

# Load the file; ISO-8859-1 is passed explicitly because the file
# does not read properly without it
vaer_symptoms_raw = pd.read_csv(filepath_or_buffer=VAERSsymptoms, encoding="ISO-8859-1")

# Preview the first and last two rows
display(vaer_symptoms_raw.head(n=2), vaer_symptoms_raw.tail(n=2))

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
0,916710,Appendicitis,23.1,Band neutrophil percentage increased,23.1,Surgery,23.1,White blood cell count increased,23.1,,
1,916741,Chills,23.1,Complex regional pain syndrome,23.1,Fatigue,23.1,Headache,23.1,Joint range of motion decreased,23.1


Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
4710,983919,Death,23.1,,,,,,,,
4711,985205,Death,23.1,Dizziness,23.1,Malaise,23.1,,,,


In [10]:
# Column labels of the raw symptoms table, as a plain Python list
columns3 = vaer_symptoms_raw.columns.tolist()
columns3

['VAERS_ID',
 'SYMPTOM1',
 'SYMPTOMVERSION1',
 'SYMPTOM2',
 'SYMPTOMVERSION2',
 'SYMPTOM3',
 'SYMPTOMVERSION3',
 'SYMPTOM4',
 'SYMPTOMVERSION4',
 'SYMPTOM5',
 'SYMPTOMVERSION5']

In [11]:
# Keep the report ID and the five symptom slots; the SYMPTOMVERSION*
# columns are left out
wanted_columns3 = vaer_symptoms_raw.loc[
    :, ["VAERS_ID", "SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5"]
]
wanted_columns3.tail()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4707,983721,Death,,,,
4708,983766,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4709,983766,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4710,983919,Death,,,,
4711,985205,Death,Dizziness,Malaise,,


### 2021 VAERS Datasets-Merging

In [12]:
# Attach the report-level columns to every COVID19 vaccine row.
# how="left" keeps each row of `filtered_data` even when no report matches.
# validate="many_to_one" makes pandas raise a MergeError if a VAERS_ID
# appears more than once in `wanted_columns2`; without that guard a
# duplicated report would silently multiply vaccine rows and inflate every
# count computed from the merged frame.
merge_2data = pd.merge(
    filtered_data,
    wanted_columns2,
    on="VAERS_ID",
    how="left",
    validate="many_to_one",
)

# Preview the first and last two rows
display(merge_2data.head(2))
display(merge_2data.tail(2))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,Y,,Y,,,,U,12/29/2020,01/01/2021,3.0
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,,,,,,Y,N,12/21/2020,12/22/2020,1.0


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,L_THREAT,ER_VISIT,HOSPITAL,HOSPDAYS,X_STAY,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS
2842,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,,,,,,,,01/21/2021,01/01/2021,
2843,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,,,,,,,N,01/25/2021,01/26/2021,1.0


In [13]:
# Left-join the symptom slots onto the vaccine + report frame.
# A VAERS_ID can appear on several symptom rows, so the same report can
# occupy more than one row of `merge_all`.
merge_all = merge_2data.merge(wanted_columns3, how="left", on="VAERS_ID")

# Preview the first and last four rows
display(merge_all.head(4), merge_all.tail(4))

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,...,,U,12/29/2020,01/01/2021,3.0,Appendicitis,Band neutrophil percentage increased,Surgery,White blood cell count increased,
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Chills,Complex regional pain syndrome,Fatigue,Headache,Joint range of motion decreased
2,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Myalgia,Pain in extremity,Peripheral swelling,X-ray abnormal,
3,916742,COVID19,PFIZER\BIONTECH,1,,MN,29.0,F,,,...,,U,12/29/2020,12/29/2020,0.0,Anaphylactic reaction,Blood test,Burning sensation,Central venous catheterisation,Dysphonia


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
4563,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4564,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4565,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,...,,,01/21/2021,01/01/2021,,Death,,,,
4566,985205,COVID19,MODERNA,1,UN,OH,75.0,M,Y,01/26/2021,...,,N,01/25/2021,01/26/2021,1.0,Death,Dizziness,Malaise,,


# Data Checking

### All Symptoms

In [42]:
# Single-column frame holding the first symptom slot
sym_one = merge_all.loc[:, ["SYMPTOM1"]]
sym_one

Unnamed: 0,SYMPTOM1
0,Appendicitis
1,Chills
2,Myalgia
3,Anaphylactic reaction
4,Intensive care
...,...
4562,Death
4563,Blood pH decreased
4564,International normalised ratio increased
4565,Death


In [43]:
# Single-column frame holding the second symptom slot
sym_two = merge_all.loc[:, ["SYMPTOM2"]]
sym_two

Unnamed: 0,SYMPTOM2
0,Band neutrophil percentage increased
1,Complex regional pain syndrome
2,Pain in extremity
3,Blood test
4,Pruritus
...,...
4562,
4563,Cardiac failure acute
4564,N-terminal prohormone brain natriuretic peptid...
4565,


In [44]:
# Single-column frame holding the third symptom slot
sym_three = merge_all.loc[:, ["SYMPTOM3"]]
sym_three

Unnamed: 0,SYMPTOM3
0,Surgery
1,Fatigue
2,Peripheral swelling
3,Burning sensation
4,Rash
...,...
4562,
4563,Chest discomfort
4564,SARS-CoV-2 test negative
4565,


In [45]:
# Single-column frame holding the fourth symptom slot
sym_four = merge_all.loc[:, ["SYMPTOM4"]]
sym_four

Unnamed: 0,SYMPTOM4
0,White blood cell count increased
1,Headache
2,X-ray abnormal
3,Central venous catheterisation
4,Rash macular
...,...
4562,
4563,Death
4564,
4565,


In [46]:
# Single-column frame holding the fifth symptom slot
sym_five = merge_all.loc[:, ["SYMPTOM5"]]
sym_five

Unnamed: 0,SYMPTOM5
0,
1,Joint range of motion decreased
2,
3,Dysphonia
4,Throat tightness
...,...
4562,
4563,Dyspnoea
4564,
4565,


In [60]:
# Stack the five symptom slots end to end (all SYMPTOM1 values, then all
# SYMPTOM2 values, ...) into one Series with a fresh 0..n-1 index
symptom_slots = ["SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5"]
sym_combine = pd.concat(
    [merge_all.get(slot) for slot in symptom_slots],
    ignore_index=True,
)
sym_combine

0                 Appendicitis
1                       Chills
2                      Myalgia
3        Anaphylactic reaction
4               Intensive care
                 ...          
22830                      NaN
22831                 Dyspnoea
22832                      NaN
22833                      NaN
22834                      NaN
Length: 22835, dtype: object

In [63]:
# Wrap the stacked symptom Series in a one-column DataFrame
sym_combine_df = sym_combine.to_frame()
sym_combine_df

Unnamed: 0,0
0,Appendicitis
1,Chills
2,Myalgia
3,Anaphylactic reaction
4,Intensive care
...,...
22830,
22831,Dyspnoea
22832,
22833,


In [68]:
# Tally how often each symptom occurs across all five slots
testing = sym_combine_df.value_counts().to_frame()

# Label the tally column "count"; the mapping only has an effect when that
# column is currently labelled 0
testing_count = testing.rename({0: "count"}, axis="columns")
testing_count.head(20)

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
SARS-CoV-2 test positive,575
Headache,362
Death,359
Pyrexia,358
Dyspnoea,310
Fatigue,269
Dizziness,257
Nausea,246
Chills,246
Pain,246


### Dose 1 All Symptoms

In [71]:
# Group the merged rows by dose series and show up to the last 10 rows of
# each group
all_dose = merge_all.groupby(by="VAX_DOSE_SERIES")
all_dose.tail(n=10)

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
273,918552,COVID19,MODERNA,5,LA,IL,39.0,F,,,...,,Y,12/30/2020,01/01/2021,2.0,Chills,Diarrhoea,Influenza virus test negative,Nausea,Pyrexia
274,918552,COVID19,MODERNA,5,LA,IL,39.0,F,,,...,,Y,12/30/2020,01/01/2021,2.0,SARS-CoV-2 test negative,,,,
1947,940602,COVID19,MODERNA,5,LA,TX,83.0,M,Y,,...,,,01/08/2021,01/10/2021,2.0,Death,Injection site pain,Myocardial infarction,Pain,Syncope
2348,946780,COVID19,MODERNA,,,FL,33.0,F,,,...,,N,12/04/2020,01/12/2021,39.0,Angiogram cerebral abnormal,Cerebral haemorrhage,Cerebral venous sinus thrombosis,Computerised tomogram head abnormal,Decompressive craniectomy
2349,946780,COVID19,MODERNA,,,FL,33.0,F,,,...,,N,12/04/2020,01/12/2021,39.0,Hemiparesis,Subarachnoid haemorrhage,,,
3495,963904,COVID19,PFIZER\BIONTECH,4,LA,,27.0,F,,,...,,N,01/11/2021,01/11/2021,0.0,Asthenia,Capillary nail refill test,Grip strength decreased,Hypoaesthesia,Impaired work ability
3496,963904,COVID19,PFIZER\BIONTECH,4,LA,,27.0,F,,,...,,N,01/11/2021,01/11/2021,0.0,Injected limb mobility decreased,Nerve injury,Paraesthesia,,
3797,969219,COVID19,MODERNA,7+,LA,SC,61.0,M,Y,01/24/2021,...,,N,01/23/2021,01/24/2021,1.0,Death,Syncope,,,
3910,971271,COVID19,PFIZER\BIONTECH,3,,NY,74.0,F,,,...,,U,01/12/2021,01/12/2021,0.0,Extra dose administered,Immediate post-injection reaction,SARS-CoV-2 test,Somnolence,
4445,976939,COVID19,PFIZER\BIONTECH,2,LA,NJ,50.0,F,,,...,Y,N,01/12/2021,01/12/2021,0.0,Muscle contractions involuntary,Muscle spasms,Myalgia,Nausea,Pain


In [74]:
# Rows recorded as dose 1. VAX_DOSE_SERIES is compared against the string
# "1" because the column also carries non-numeric labels such as "UNK"
# and "7+".
first_dose_mask = merge_all["VAX_DOSE_SERIES"].eq("1")
dose_one = merge_all.loc[first_dose_mask]
dose_one

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,916710,COVID19,MODERNA,1,LA,MO,23.0,F,,,...,,U,12/29/2020,01/01/2021,3.0,Appendicitis,Band neutrophil percentage increased,Surgery,White blood cell count increased,
1,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Chills,Complex regional pain syndrome,Fatigue,Headache,Joint range of motion decreased
2,916741,COVID19,PFIZER\BIONTECH,1,LA,AR,68.0,F,,,...,Y,N,12/21/2020,12/22/2020,1.0,Myalgia,Pain in extremity,Peripheral swelling,X-ray abnormal,
3,916742,COVID19,PFIZER\BIONTECH,1,,MN,29.0,F,,,...,,U,12/29/2020,12/29/2020,0.0,Anaphylactic reaction,Blood test,Burning sensation,Central venous catheterisation,Dysphonia
4,916742,COVID19,PFIZER\BIONTECH,1,,MN,29.0,F,,,...,,U,12/29/2020,12/29/2020,0.0,Intensive care,Pruritus,Rash,Rash macular,Throat tightness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4562,983721,COVID19,MODERNA,1,RA,,72.0,M,Y,01/23/2021,...,,U,01/13/2021,01/23/2021,10.0,Death,,,,
4563,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,Blood pH decreased,Cardiac failure acute,Chest discomfort,Death,Dyspnoea
4564,983766,COVID19,MODERNA,1,RA,NC,90.0,F,Y,01/24/2021,...,,N,01/16/2021,01/21/2021,5.0,International normalised ratio increased,N-terminal prohormone brain natriuretic peptid...,SARS-CoV-2 test negative,,
4565,983919,COVID19,MODERNA,1,LA,FL,69.0,M,Y,,...,,,01/21/2021,01/01/2021,,Death,,,,


In [75]:
# Stack the five symptom slots of the dose-1 rows end to end into one
# Series with a fresh 0..n-1 index
symptom_slots = ["SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5"]
dose_combine = pd.concat(
    [dose_one.get(slot) for slot in symptom_slots],
    ignore_index=True,
)
dose_combine

0                 Appendicitis
1                       Chills
2                      Myalgia
3        Anaphylactic reaction
4               Intensive care
                 ...          
16200                      NaN
16201                 Dyspnoea
16202                      NaN
16203                      NaN
16204                      NaN
Length: 16205, dtype: object

In [77]:
# Wrap the stacked dose-1 symptom Series in a one-column DataFrame
dose_combine_df = dose_combine.to_frame()
dose_combine_df

Unnamed: 0,0
0,Appendicitis
1,Chills
2,Myalgia
3,Anaphylactic reaction
4,Intensive care
...,...
16200,
16201,Dyspnoea
16202,
16203,


#### Dose 1

In [78]:
# Tally how often each symptom occurs among the dose-1 rows
dose_one_sym = dose_combine_df.value_counts().to_frame()

# Label the tally column "count"; the mapping only has an effect when that
# column is currently labelled 0
dose_one_sym_count = dose_one_sym.rename({0: "count"}, axis="columns")
dose_one_sym_count.head(20)

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
SARS-CoV-2 test positive,435
Death,280
Headache,246
Pyrexia,232
Dyspnoea,222
Dizziness,187
Fatigue,186
Nausea,163
Chills,160
Pain,152


In [79]:
# Rows recorded as dose 2 (the dose series is compared as the string "2")
second_dose_mask = merge_all["VAX_DOSE_SERIES"].eq("2")
dose_two = merge_all.loc[second_dose_mask]
dose_two

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_SITE,STATE,AGE_YRS,SEX,DIED,DATEDIED,...,DISABLE,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
73,918051,COVID19,MODERNA,2,RA,NJ,33.0,F,,,...,,Y,12/30/2020,01/01/2021,2.0,Abdominal pain,Appendicectomy,Appendicitis,Computerised tomogram abdomen abnormal,Laboratory test
74,918051,COVID19,MODERNA,2,RA,NJ,33.0,F,,,...,,Y,12/30/2020,01/01/2021,2.0,Liver function test,Ultrasound abdomen,Ultrasound pelvis,,
225,918277,COVID19,PFIZER\BIONTECH,2,LA,FL,73.0,M,,,...,,N,01/03/2021,01/04/2021,1.0,Muscular weakness,,,,
370,919022,COVID19,MODERNA,2,LA,MD,28.0,M,,,...,Y,N,09/02/2020,09/02/2020,0.0,Autoantibody positive,Blindness unilateral,Computerised tomogram coronary artery,Echocardiogram,Electrocardiogram
371,919022,COVID19,MODERNA,2,LA,MD,28.0,M,,,...,Y,N,09/02/2020,09/02/2020,0.0,Electrocardiogram ambulatory,Magnetic resonance imaging brain,Ophthalmological examination,Postural orthostatic tachycardia syndrome,Pyrexia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4451,976965,COVID19,PFIZER\BIONTECH,2,LA,AZ,45.0,M,,,...,,N,01/13/2021,01/14/2021,1.0,Myalgia,Pyrexia,SARS-CoV-2 test negative,Tachycardia,Troponin increased
4513,981141,COVID19,PFIZER\BIONTECH,2,LA,GA,34.0,F,,,...,,U,01/22/2021,01/23/2021,1.0,Chills,Dyspnoea,Pain,Pyrexia,
4514,981225,COVID19,PFIZER\BIONTECH,2,UN,MD,59.0,F,Y,01/27/2021,...,,N,01/27/2021,01/27/2021,0.0,Abdominal pain,Death,Pulse absent,Unresponsive to stimuli,Vomiting
4519,981912,COVID19,MODERNA,2,UN,CA,67.0,F,Y,01/27/2021,...,,N,01/23/2021,01/23/2021,0.0,Angiogram pulmonary abnormal,Cardiac arrest,Chest pain,Cold sweat,Death


In [80]:
# Stack the five symptom slots of the dose-2 rows end to end into one
# Series with a fresh 0..n-1 index
symptom_slots = ["SYMPTOM1", "SYMPTOM2", "SYMPTOM3", "SYMPTOM4", "SYMPTOM5"]
dose2_combine = pd.concat(
    [dose_two.get(slot) for slot in symptom_slots],
    ignore_index=True,
)
dose2_combine

0                     Abdominal pain
1                Liver function test
2                  Muscular weakness
3              Autoantibody positive
4       Electrocardiogram ambulatory
                    ...             
2590              Troponin increased
2591                             NaN
2592                        Vomiting
2593                           Death
2594              Pulmonary embolism
Length: 2595, dtype: object

In [81]:
# Wrap the stacked dose-2 symptom Series in a one-column DataFrame
dose2_combine_df = dose2_combine.to_frame()
dose2_combine_df

Unnamed: 0,0
0,Abdominal pain
1,Liver function test
2,Muscular weakness
3,Autoantibody positive
4,Electrocardiogram ambulatory
...,...
2590,Troponin increased
2591,
2592,Vomiting
2593,Death


#### Dose 2

In [82]:
# Tally how often each symptom occurs among the dose-2 rows
dose_two_sym = dose2_combine_df.value_counts().to_frame()

# Label the tally column "count"; the mapping only has an effect when that
# column is currently labelled 0
dose_two_sym_count = dose_two_sym.rename({0: "count"}, axis="columns")
dose_two_sym_count.head(20)

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
Pyrexia,61
Headache,51
Chills,48
Pain,46
Dyspnoea,44
Fatigue,39
Nausea,39
Dizziness,30
Chest pain,27
Vomiting,27


### Unique id/patient count for dose 1

In [84]:
# Number of dose-1 rows per VAERS_ID, most frequent first.
# The length of this Series is the number of distinct VAERS_IDs for dose 1.
dose_one_patient = dose_one.loc[:, "VAERS_ID"].value_counts()
dose_one_patient

959224    13
950441    12
919604    11
919633    11
937579     9
          ..
939270     1
936346     1
981406     1
956836     1
921547     1
Name: VAERS_ID, Length: 1996, dtype: int64

In [86]:
# Wrap the per-ID row counts for dose 1 in a one-column DataFrame
dose_one_patient_df = dose_one_patient.to_frame()
dose_one_patient_df

Unnamed: 0,VAERS_ID
959224,13
950441,12
919604,11
919633,11
937579,9
...,...
939270,1
936346,1
981406,1
956836,1


### Unique id/patient count for dose 2

In [88]:
# Number of dose-2 rows per VAERS_ID, most frequent first.
# The length of this Series is the number of distinct VAERS_IDs for dose 2.
dose_two_patient = dose_two.loc[:, "VAERS_ID"].value_counts()
dose_two_patient

945504    7
961846    7
967681    6
975997    5
943614    5
         ..
962184    1
962182    1
964525    1
970103    1
974546    1
Name: VAERS_ID, Length: 272, dtype: int64

In [89]:
# Wrap the per-ID row counts for dose 2 in a one-column DataFrame
dose_two_patient_df = dose_two_patient.to_frame()
dose_two_patient_df

Unnamed: 0,VAERS_ID
945504,7
961846,7
967681,6
975997,5
943614,5
...,...
962184,1
962182,1
964525,1
970103,1
