In [70]:
import pandas as pd
import numpy as np
#from scipy.linalg import svd
from sklearn.decomposition import randomized_svd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import gensim.corpora as corpora


Initializing our dataframe and adding the serious column


In [71]:
df = pd.read_csv('2022VAERSData.csv', encoding='cp1252', low_memory=False)
df.dropna(subset=["SYMPTOM_TEXT"], inplace=True, axis=0)
df = df.reset_index()


def is_serious(row):
    columns = ["DIED", "ER_VISIT", "HOSPITAL", "DISABLE"]
    for val in row[columns]:
        if isinstance(val, str) and val.upper() == "Y":
            return True
    return False


df["SERIOUS"] = df.apply(is_serious, axis=1)
documents = df["SYMPTOM_TEXT"]

In [72]:
stop_words = set(
    stopwords.words('english') + [
        ".", ":", ";", "(", ")", ",", "#", "'", "\"", '!', '?', '$', '%', '&',
        "''", "``"
    ])

porter = PorterStemmer()


def stopword_remover(lst):
    return [word for word in lst if word not in stop_words]


def stemmer(lst):
    return [porter.stem(word) for word in lst if word]


def text_preprocess(d):
    tokens = np.array([word_tokenize(i) for i in d], dtype=object)
    tok_fil = [stopword_remover(doc) for doc in tokens]
    tok_stem = [stemmer(doc) for doc in tok_fil]
    return tok_stem


documents = text_preprocess(
    documents)  # remove stopwords and punctuation, convert words to stems


In [73]:
vectorizer = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False)
doc_term_matrix = vectorizer.fit_transform(documents).todense()
vocab = np.array(vectorizer.get_feature_names_out())
vocab[20000:20010]

array(['hug', 'huge', 'hugh', 'hum', 'humalog', 'human', 'humera',
       'humeru', 'humidifi', 'humili'], dtype=object)

In [74]:
u, s, v = randomized_svd(doc_term_matrix, 10, random_state=None)

In [75]:
def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-15:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [76]:
show_topics(v)  # showing top 15 words from all(10) topics


['report patient vaccin unknown the covid-19 dose receiv medic i inform no number pain',
 'vendor mobil might dure record excurs review intern store temperatur found depart possibl health',
 'i day pain bodi arm start fever headach chest sever hour ach left symptom',
 'covid breakthrough posit case hospit test vaccin + admit pt fulli cough infect contact',
 'expir vaccin moderna administ given use mrna-1273 i dose on pfizer beyond vial product',
 'pain fever arm site headach ach chill inject bodi fatigu sore chest swell left',
 'pfizer fj6369 given system staff ok instruct use temp excurs lot compani hospit pt',
 'breakthrough case report arm pain fj6369 pfizer site inject system ok staff instruct temp',
 'hospit patient breakthrough 12/22/21 beyond the date vaccin admit receiv bnt162b2 left number she',
 'expir date beyond administ use 12/22/21 posit limit test vaccin covid bnt162b2 vial onset']

s is the matrix topics x topics, size 10 since we have 10 topics and full_matrics=False, so 1x10 matrix instead of 10x10


In [77]:
s

array([36.35022468, 28.42038069, 22.66211708, 19.03169146, 17.31889477,
       15.66225608, 15.57815332, 15.13162715, 14.01870079, 13.55867966])

u is documents(rows) x topics(columns)


In [78]:
u

array([[ 8.96332425e-03,  1.11123435e-03, -5.50609241e-03, ...,
         8.52228002e-04,  2.23880998e-03,  9.42025166e-03],
       [ 1.25807046e-02, -1.61937544e-03, -6.33537213e-03, ...,
        -2.59343262e-04,  1.81026288e-03, -4.11562952e-03],
       [ 1.29308344e-02, -1.55531577e-03, -5.93372301e-03, ...,
        -2.06384490e-03, -3.49223944e-03, -6.55984712e-03],
       ...,
       [ 2.34207995e-03,  6.66207623e-05,  5.53260293e-04, ...,
        -5.38335780e-04,  1.29276349e-03,  9.50055073e-04],
       [ 2.31824661e-03,  6.93984443e-05,  5.75257316e-04, ...,
        -6.75411287e-04,  1.38467392e-03,  9.39632150e-04],
       [ 2.13217731e-03,  6.06381065e-05,  5.04426211e-04, ...,
        -4.91462805e-04,  1.18183697e-03,  8.67842849e-04]])

v is topic(rows) x terms(columns) matrix


In [79]:
v

array([[ 2.05755127e-05,  6.02442537e-05,  4.96746505e-05, ...,
         4.44020726e-04,  1.04282457e-05,  2.42914353e-04],
       [-3.94943944e-06,  5.76660773e-06, -8.31556247e-06, ...,
        -7.13623335e-05, -1.26397055e-06, -4.57448940e-05],
       [ 2.03062653e-05,  1.20711808e-04, -1.98981073e-05, ...,
        -2.63774827e-04, -2.03452006e-06,  1.45435045e-04],
       ...,
       [ 2.12192457e-05,  3.53102979e-05,  1.47082223e-05, ...,
        -5.70239603e-05, -1.53268218e-06,  1.26146932e-04],
       [ 1.79358521e-05,  7.13811846e-05,  2.35971672e-08, ...,
        -4.22075276e-04, -5.31185356e-06,  1.76514767e-04],
       [ 1.08961129e-05,  8.45424325e-05,  4.02935146e-05, ...,
        -7.44758470e-04, -8.88324612e-06,  6.98229343e-05]])

## Latent Dirichlet Allocation


In [80]:
from gensim.utils import simple_preprocess


def sent_to_words(sentences):
    for sentence in sentences:
        yield (simple_preprocess(str(sentence), deacc=True))


def remove_stopwords(texts):
    return [[
        word for word in simple_preprocess(str(doc)) if word not in stop_words
    ] for doc in texts]


data = documents
data_words = list(sent_to_words(data))
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['thi', 'spontan', 'report', 'receiv', 'pharmacist', 'refer', 'patient', 'unknown', 'age', 'gender', 'inform', 'regard', 'patient', 'medic', 'histori', 'concurr', 'condit', 'concomit', 'medic', 'provid', 'dec', 'patient', 'vaccin', 'expir', 'hpv', 'rl', 'vlp', 'vaccin', 'yeast', 'gardasil']


In [90]:
id2word = corpora.Dictionary(data_words)
corpus = [id2word.doc2bow(text) for text in data_words]

In [82]:
from gensim.models import LdaMulticore

lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)

doc_lda = lda_model[corpus]
lda_model.print_topics()

[(0,
  '0.019*"vaccin" + 0.014*"covid" + 0.013*"day" + 0.009*"test" + 0.009*"pain" + 0.008*"patient" + 0.008*"arm" + 0.007*"posit" + 0.007*"possibl" + 0.007*"report"'),
 (1,
  '0.026*"vaccin" + 0.020*"patient" + 0.017*"report" + 0.016*"day" + 0.015*"arm" + 0.012*"pain" + 0.011*"dose" + 0.011*"receiv" + 0.010*"unknown" + 0.009*"covid"'),
 (2,
  '0.032*"vaccin" + 0.030*"report" + 0.029*"patient" + 0.027*"covid" + 0.020*"unknown" + 0.014*"receiv" + 0.013*"dose" + 0.011*"medic" + 0.010*"moderna" + 0.010*"mrna"'),
 (3,
  '0.029*"patient" + 0.019*"vaccin" + 0.018*"pain" + 0.017*"report" + 0.015*"covid" + 0.010*"non" + 0.009*"outcom" + 0.009*"receiv" + 0.008*"medic" + 0.008*"unknown"'),
 (4,
  '0.015*"covid" + 0.015*"headach" + 0.012*"pain" + 0.011*"ach" + 0.011*"vaccin" + 0.009*"day" + 0.009*"bodi" + 0.009*"report" + 0.008*"fever" + 0.007*"start"'),
 (5,
  '0.041*"patient" + 0.034*"vaccin" + 0.031*"report" + 0.020*"covid" + 0.019*"unknown" + 0.018*"dose" + 0.015*"receiv" + 0.012*"medic" + 0.

In [83]:
for doc in range(5):
    print(f"Document {doc}: ", end="")
    for topic, prob in doc_lda[doc]:
        print(f"Topic: {topic} Probability: {prob*100:.2f}% ", end="")
    print()


Document 0: Topic: 5 Probability: 98.55% 
Document 1: Topic: 2 Probability: 38.56% Topic: 5 Probability: 61.02% 
Document 2: Topic: 2 Probability: 38.78% Topic: 5 Probability: 60.68% 
Document 3: Topic: 0 Probability: 9.58% Topic: 8 Probability: 43.64% Topic: 9 Probability: 46.38% 
Document 4: Topic: 9 Probability: 99.27% 


In [84]:
import pyLDAvis.gensim_models
import pyLDAvis
import warnings

warnings.filterwarnings('ignore')

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [85]:
def get_topic(row):
    topics_for_row = doc_lda[row.name]
    topics_for_row.sort(key=lambda a: -a[1])
    topic, highest_prob = topics_for_row[0]
    return topic

df["TOPIC"] = df.apply(get_topic, axis=1)

In [86]:
df[["SYMPTOM_TEXT", "TOPIC"]][0:10]

Unnamed: 0,SYMPTOM_TEXT,TOPIC
0,This spontaneous report was received from a ph...,5
1,SUSPECTED CLINICAL VACCINATION FAILURE; SUSPEC...,5
2,SUSPECTED CLINICAL VACCINATION FAILURE; SUSPEC...,5
3,Irregular menstrual cycle.; period extremely h...,9
4,Breakthrough and heavy periods; longer period ...,9
5,tired; Patient reports feeling achy; chills; t...,9
6,diarrheal discharge of both goop and intestina...,7
7,Now Lymph node swelling in armpit closest to j...,9
8,my torso was covered with a rash/rash spreadof...,9
9,pyrexia; This is a spontaneous report received...,9


In [87]:
from collections import Counter, defaultdict
topics = Counter(df["TOPIC"])
topics_serious = defaultdict(int)

for index, row in df.iterrows():
  if row["SERIOUS"]:
    topics_serious[row["TOPIC"]] += 1

topics = sorted(([(topic, count) for topic, count in topics.items()]))
topics_serious = sorted(([(topic, count) for topic, count in topics_serious.items()]))

print(topics)
print(topics_serious)


[(0, 2310), (1, 2348), (2, 1526), (3, 1998), (4, 3161), (5, 2751), (6, 4070), (7, 581), (8, 2194), (9, 3772)]
[(0, 350), (1, 136), (2, 394), (3, 793), (4, 981), (5, 203), (6, 445), (7, 197), (8, 629), (9, 204)]


In [88]:
topic_serious_pct = {}
for (topic, total_count), (_, serious_count) in zip(topics, topics_serious):
  topic_serious_pct[topic] = (serious_count/total_count) * 100

for topic, pct in topic_serious_pct.items():
  print(f"Topic {topic}: {pct:.2f}% serious")

Topic 0: 15.15% serious
Topic 1: 5.79% serious
Topic 2: 25.82% serious
Topic 3: 39.69% serious
Topic 4: 31.03% serious
Topic 5: 7.38% serious
Topic 6: 10.93% serious
Topic 7: 33.91% serious
Topic 8: 28.67% serious
Topic 9: 5.41% serious
