In [1]:
import os, sys, glob
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

In [3]:
def prepend_to_env_path(directory, var='PATH'):
    """Put `directory` at the front of the `var` search path, keeping existing entries.

    Nothing is changed when `directory` is already listed, so the cell can be
    re-run without growing the variable.
    """
    entries = [p for p in os.environ.get(var, '').split(os.pathsep) if p]
    if directory not in entries:
        os.environ[var] = os.pathsep.join([directory, *entries])


# extend PATH instead of overwriting it, so system executables stay reachable
prepend_to_env_path('../Code')

# make the project code importable; skip the append if it is already registered
PROJECT_DIR = '../Code/NTU-FN6903-Project'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [4]:
# show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Merge Datasets into One

In [5]:
# directory that holds the csv data files
DATA_DIR = '../Data'

# file families in DATA_DIR; every file name of a family ends in '<family>.csv'
filetype_lst = ['DATA', 'VAX', 'SYMPTOMS']


def _read_file_family(data_dir, filetype):
    """Read every '*<filetype>.csv' file in `data_dir` and stack them into one DataFrame.

    Raises FileNotFoundError when no file matches, instead of the opaque
    "No objects to concatenate" error that pd.concat gives for an empty list.
    """
    # glob returns paths in arbitrary order; sort so the row order is reproducible
    paths = sorted(glob.glob(os.path.join(data_dir, f'*{filetype}.csv')))
    if not paths:
        raise FileNotFoundError(
            f"no '*{filetype}.csv' files found in {data_dir!r}"
        )
    frames = [
        pd.read_csv(path, encoding="ISO-8859-1", low_memory=False)
        for path in paths
    ]
    return pd.concat(frames, ignore_index=True)


# dictionary of 3 dataframes, one per file family
# (each combines the yearly files of that family; 2020-2022 per the original note)
df_dict = {ft: _read_file_family(DATA_DIR, ft) for ft in filetype_lst}

In [6]:
# share of each vaccination type in the VAX dataframe (top 10); the recorded result is kept below
# (df_dict['VAX'].VAX_TYPE.value_counts()/df_dict['VAX'].shape[0]).head(10)

#COVID19      0.848469
#VARZOS       0.032324
#FLU4         0.016010
#UNK          0.015465
#COVID19-2    0.013953
#PPV          0.004970
#FLUX         0.004820
#VARCEL       0.004585
#HPV9         0.004405
#TDAP         0.004321

In [7]:
# build one '|'-separated string per row from the columns whose name contains
# 'SYMPTOM', after removing the columns whose name contains 'VERSION';
# missing entries are left out of the string
symptoms_raw = df_dict['SYMPTOMS']
version_cols = symptoms_raw.filter(regex='VERSION').columns
symptom_names = symptoms_raw.drop(columns=version_cols).filter(regex='SYMPTOM')
df_dict['SYMPTOMS']['SYMP_STR'] = symptom_names.apply(
    lambda row: '|'.join(row.dropna()),
    axis=1,
)

def _unique_symptoms(joined):
    """Return the distinct, non-empty '|'-separated entries of `joined` as a ', '-joined string."""
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # orders strings differently from one interpreter run to the next, which
    # made SYMP_STR non-reproducible. Empty tokens (rows without any symptom)
    # are dropped so they do not show up as stray ', ' separators.
    return ', '.join(dict.fromkeys(tok for tok in joined.split('|') if tok))


# combine symptoms for each unique VAERS ID:
# concatenate the per-row strings of an ID, then de-duplicate the symptoms
df_dict['SYMPTOMS'] = (
    df_dict['SYMPTOMS']
    .groupby('VAERS_ID')['SYMP_STR']
    .agg('|'.join)
    .map(_unique_symptoms)
    .reset_index()
)

In [8]:
# restrict the VAX records to rows whose VAX_TYPE is exactly 'COVID19'
is_covid = df_dict['VAX']['VAX_TYPE'].eq('COVID19')
covid_vax = df_dict['VAX'][is_covid]

# left-join the DATA and SYMPTOMS frames onto those records via the shared
# VAERS_ID key, then order the result by VAERS_ID
df_merged = covid_vax.merge(df_dict['DATA'], on='VAERS_ID', how='left')
df_merged = df_merged.merge(df_dict['SYMPTOMS'], on='VAERS_ID', how='left')
df_merged = df_merged.sort_values(by='VAERS_ID')

# Preprocess Numeric and Categorical Variables

In [9]:
#COLS_DROP = """VAX_TYPE, VAX_LOT, VAX_DOSE_SERIES, VAX_ROUTE, VAX_SITE, VAX_NAME, STATE, CAGE_YR, CAGE_MO, CUR_ILL, LAB_DATA, V_ADMINBY, V_FUNDBY, SPLTTYPE, FORM_VERS, OFC_VISIT, ER_ED_VISIT""".split(', ')
#COLS_BINARY = """DIED, L_THREAT, ER_VISIT, HOSPITAL, X_STAY, DISABLE, BIRTH_DEFECT""".split(', ')
# names of the date columns
COLS_DATE = [
    'RECVDATE',
    'RPT_DATE',
    'DATEDIED',
    'VAX_DATE',
    'ONSET_DATE',
    'TODAYS_DATE',
]
#COLS_TEXT = """HISTORY, ALLERGIES""".split(', ')

In [10]:
# parse every date column from its 'mm/dd/yyyy' text form;
# values that do not match the format become NaT (errors='coerce')
for date_col in COLS_DATE:
    df_merged[date_col] = pd.to_datetime(
        df_merged[date_col], format='%m/%d/%Y', errors='coerce'
    )

# remove fully duplicated rows and renumber the index from 0
df_merged = df_merged.drop_duplicates()
df_merged = df_merged.reset_index(drop=True)


# preprocess numeric variables

# whole days between vaccination and onset, recomputed from the two dates.
# `.dt.days` is used because casting to 'timedelta64[D]' raises on pandas >= 2.0.
# Rows with a missing date give NaN here, which compares unequal to NUMDAYS
# below and is therefore flagged as well.
days_to_onset = (df_merged['ONSET_DATE'] - df_merged['VAX_DATE']).dt.days

# flag errors in vax_date and onset_date: dates before 2020-01-01, or a
# date difference that disagrees with the reported NUMDAYS
EARLIEST_VALID_DATE = pd.Timestamp('2020-01-01')
bad_dates = (
    (df_merged['ONSET_DATE'] < EARLIEST_VALID_DATE) |
    (df_merged['VAX_DATE'] < EARLIEST_VALID_DATE) |
    (days_to_onset != df_merged['NUMDAYS'])
)
df_merged.loc[bad_dates, ['VAX_DATE', 'ONSET_DATE']] = pd.NaT

# flag errors in numdays: it is unusable when either date is missing
missing_date = df_merged['ONSET_DATE'].isna() | df_merged['VAX_DATE'].isna()
df_merged.loc[missing_date, 'NUMDAYS'] = np.nan

# drop ALL rows with na values in numdays, age or the symptom string
df_merged = df_merged.dropna(subset=['NUMDAYS', 'AGE_YRS', 'SYMP_STR'])


# preprocess categorical variables

# VAX_MANU: one 0/1 indicator column per manufacturer.
# The Pfizer label contains a literal backslash; a raw string is used because
# '\B' in a normal string literal is an invalid escape sequence.
MANU_FLAGS = {
    'MANU_PFIZER': r'PFIZER\BIONTECH',
    'MANU_MODERNA': 'MODERNA',
    'MANU_JANSSEN': 'JANSSEN',
    'MANU_NOVAVAX': 'NOVAVAX',
}
for flag_col, manu_label in MANU_FLAGS.items():
    df_merged[flag_col] = np.where(df_merged['VAX_MANU'] == manu_label, 1, 0)

# GENDER: 0/1 indicators for SEX == 'F' and SEX == 'M' (any other value gives 0 in both)
df_merged['GENDER_F'] = np.where(df_merged['SEX'] == 'F', 1, 0)
df_merged['GENDER_M'] = np.where(df_merged['SEX'] == 'M', 1, 0)

# BIRTH_DEFECT: 1 when flagged 'Y', else 0 (missing values count as 0)
df_merged['DEFECT'] = np.where(df_merged['BIRTH_DEFECT'] == 'Y', 1, 0)

# HOSPITAL (response variable): 1 when flagged 'Y', else 0
df_merged['HOSP'] = np.where(df_merged['HOSPITAL'] == 'Y', 1, 0)

# Preprocess 'HISTORY' strings

In [11]:
# normalise the medical history text (lower case, surrounding whitespace removed),
# then give entries with no text the placeholder 'no known medical history'
history_clean = df_merged['HISTORY'].str.lower().str.strip()
df_merged['HISTORY'] = history_clean.fillna('no known medical history')

In [12]:
## sBERT model
# model = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')

In [13]:
## identify phrases that denote no known medical history (nkmh)
#nkmh_phrases_count = df_merged.loc[df_merged['HISTORY'].astype(str).apply(len) <= 1000, 'HISTORY'].value_counts()
#nkmh_phrases = list(nkmh_phrases_count[nkmh_phrases_count>=5].index)

#nkmh_phrases_embeddings = model.encode(nkmh_phrases, show_progress_bar=True)

In [14]:
## obtain embeddings for reference phrases
#nkmh_references_embeddings = model.encode([
#    'no known medical history',
#    'none reported',
#    'unknown',
#    'no adverse',
#    'unaware',
#    'none'
#], show_progress_bar=True)

## normalise embeddings of reference phrases
#nkmh_references_embeddings = nkmh_references_embeddings / np.linalg.norm(nkmh_references_embeddings, axis=1)[:, None]

## cosine similarities of all embeddings to reference phrases
#nkmh_cos_sim = np.matmul(nkmh_references_embeddings, (nkmh_phrases_embeddings / np.linalg.norm(nkmh_phrases_embeddings, axis=1)[:, None]).T)

## rank phrases by cosine similarity to the reference phrases to identify all synonymous phrases
#(
#    pd.DataFrame({
#        'Phrase': nkmh_phrases,
#        'Cosine Similarity with NKMH': np.max(nkmh_cos_sim, axis=0)
#    })
#    .sort_values('Cosine Similarity with NKMH', ascending=False)
#    .to_excel(os.path.join(DATA_DIR, 'nkmh_phrases.xlsx'))
#)

In [15]:
# load the manually annotated phrase list
# (the file is generated by the code above and then annotated by hand)
# keep_default_na=False: cell text is read as-is rather than converted to NaN
nkmh_path = os.path.join(DATA_DIR, 'nkmh_phrases_annotated.xlsx')
nkmh_df = pd.read_excel(nkmh_path, keep_default_na=False)

# distinct phrases marked 'Y' in the NKMH column, i.e. synonyms of nkmh
is_nkmh = nkmh_df['NKMH'].eq('Y')
HISTORY_NKMH_PHRASES = nkmh_df.loc[is_nkmh, 'Phrase'].unique()

In [16]:
# collapse every history entry that matches an annotated nkmh phrase
# into the single label 'no known medical history'
nkmh_rows = df_merged['HISTORY'].isin(HISTORY_NKMH_PHRASES)
df_merged['HISTORY'] = df_merged['HISTORY'].mask(nkmh_rows, 'no known medical history')

# Train-Test Split, then Save Datasets

In [17]:
# order rows by onset date, with VAERS_ID as the tie-breaker
df_merged = df_merged.sort_values(
    by=['ONSET_DATE', 'VAERS_ID'],
)

In [18]:
# (rows, columns) of the preprocessed frame
df_merged.shape

(802876, 51)

In [19]:
# feature columns and the response column
COLS_X = [
    'ONSET_DATE', 'NUMDAYS', 'AGE_YRS',
    'GENDER_F', 'GENDER_M',
    'MANU_PFIZER', 'MANU_MODERNA', 'MANU_JANSSEN', 'MANU_NOVAVAX',
    'DEFECT', 'HISTORY', 'SYMP_STR',
]
COLS_Y = ['HOSP']

X = df_merged[COLS_X]
y = df_merged[COLS_Y]

# shuffle=False keeps the current row order: the first 85% of rows become the
# training set and the remainder the test set
# NOTE(review): random_state presumably has no effect without shuffling - confirm
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    random_state=55,
    shuffle=False,
)

In [20]:
# save files
#X_train.to_csv(os.path.join(DATA_DIR, 'X_train.csv'))
#y_train.to_csv(os.path.join(DATA_DIR, 'y_train.csv'))

In [21]:
#X_test.to_csv(os.path.join(DATA_DIR, 'X_test.csv'))
#y_test.to_csv(os.path.join(DATA_DIR, 'y_test.csv'))