In [261]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter

In [262]:
df = pd.read_csv('2022VAERSData.csv', encoding='cp1252', low_memory=False)
df.dropna(subset=["SYMPTOM_TEXT"], inplace=True, axis=0)
df = df.reset_index()


def is_serious(row):
    columns = ["DIED", "ER_VISIT", "HOSPITAL", "DISABLE"]
    for val in row[columns]:
        if isinstance(val, str) and val.upper() == "Y":
            return True
    return False


df["SERIOUS"] = df.apply(is_serious, axis=1)
print(f"Starting number of documents: {len(df)}")

Starting number of documents: 24711


# Deliverable 1:

## Clustering the documents

Overall goal:  
Without using the target variable, Serious, DISABLED, DIED, ER_VISIT, and HOSPITAL, and Score in data, cluster the rows of the data set into both a given number of clusters.


For this we will use only the symptom text since all other variables are discrete and easy to cluster by.


In [263]:
data = df['SYMPTOM_TEXT']
data = np.array(data)

In [264]:
stop_words = set(
    stopwords.words('english') +
    [".", ":", ";", "(", ")", ",", "#", "'", "\"", '!', '$', '%', '&', "''"])
    
porter = PorterStemmer()


def stopword_remover(lst):
    return [word for word in lst if word not in stop_words]


def stemmer(lst):
    return [porter.stem(word) for word in lst if word]


def text_preprocess(d):
    tokens = np.array([word_tokenize(i) for i in d], dtype=object)
    tok_fil = [stopword_remover(doc) for doc in tokens]
    tok_stem = [stemmer(doc) for doc in tok_fil]
    return tok_stem

In [265]:
data_processed = text_preprocess(data)

In [266]:
data[1]  # original document (SYMPTOM_TEXT column)

'SUSPECTED CLINICAL VACCINATION FAILURE; SUSPECTED COVID-19 INFECTION; This spontaneous potential legal report received from a patient concerned a patient of unspecified age and sex. Initial information was processed along with the additional information received from a regulatory authority on 30-DEC-2021. The patient\'s height, and weight were not reported. No past medical history or concurrent conditions were reported. The patient received covid-19 vaccine ad26.cov2.s (dose number in series: 1) (suspension for injection, route of admin not reported, batch number: Unknown and expiry: UNKNOWN) dose, start therapy date were not reported, administered 1 in total for prophylactic vaccination. The batch number was not reported. The Company is unable to perform follow-up to request batch/lot numbers. No concomitant medications were reported.  It was reported that on an unspecified dates, the patient caught covid twice(coded as suspected covid-19 infection and suspected clinical vaccination 

In [267]:
data_processed[
    1][:
       10]  # stems of terms from the document with punctuation and stopwords excluded


['suspect',
 'clinic',
 'vaccin',
 'failur',
 'suspect',
 'covid-19',
 'infect',
 'thi',
 'spontan',
 'potenti']

Below we create a TF-IDF matrix and use it with sklearn's implementation of KMeans Clustering to cluster our documents


In [268]:
NUM_CLUSTERS = 10

vectorizer = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False)
doc_term_matrix = vectorizer.fit_transform(data_processed)
km = KMeans(n_clusters=NUM_CLUSTERS)
clusters = km.fit_transform(doc_term_matrix)

In [269]:
# we are returned a numpy sparse matrix, converting to a traditional array for simpler accessing and usage later
doc_term_matrix = doc_term_matrix.toarray()
terms = vectorizer.get_feature_names_out()
print(f"Matrix dimensions: {len(doc_term_matrix)} x {len(doc_term_matrix[0])}")
print(f"Num documents: {len(data_processed)}, Num terms: {len(terms)}")

Matrix dimensions: 24711 x 33840
Num documents: 24711, Num terms: 33840


In [270]:
print(terms[10:20])

["'bursiti" "'caller" "'chronic" "'clock" "'clonidin" "'close" "'coconut"
 "'congenit" "'contain" "'coupl"]


Example words with tf-idf values for document 0


In [271]:
words_found = 0  # limit to 10
for i in range(15000, len(doc_term_matrix[0])):
    if words_found == 10:
        break
    if doc_term_matrix[0][i] != 0:
        words_found += 1
        print(terms[i], doc_term_matrix[0][i])


date 0.05809711063342075
event 0.06976337740806901
excurs 0.09043640930109825
experi 0.11587188651559706
expir 0.2685139372913047
frequenc 0.11559960285534095
gardasil 0.18088298958406868
gender 0.10535311717275656
histori 0.055034417858322096
hpv 0.1765701887734298


doc_term_matrix is our TF-IDF matrix which is indexed by the document id (location in our original data/list of documents)\
The y axis indicies are the indices of the term in our "terms" array\
A zero value indiciates the term is not present in the document, a non zero value is the calculated tf-idf value


In [272]:
clusters

array([[1.06443945, 0.98884325, 1.15263573, ..., 1.01871408, 0.94568563,
        1.28700389],
       [1.04504496, 0.97976196, 1.12499139, ..., 0.94298708, 0.97544805,
        1.28350271],
       [1.04974378, 0.97752136, 1.1295142 , ..., 0.95830917, 0.9763275 ,
        1.30025566],
       ...,
       [1.06446225, 0.99513667, 1.16119323, ..., 1.10732054, 1.0094543 ,
        1.29931102],
       [1.06465314, 0.99481724, 1.16135119, ..., 1.10732442, 1.0099334 ,
        1.30052226],
       [1.06580423, 0.99644185, 1.16168644, ..., 1.10920895, 1.01215587,
        1.30308977]])

In [273]:
clusters[0]  # distances from document 0 to each of the 10 cluster centers

array([1.06443945, 0.98884325, 1.15263573, 0.94705469, 1.0438153 ,
       1.34657479, 1.0166167 , 1.01871408, 0.94568563, 1.28700389])

"clusters" is our transformed matrix, there are rows for each document as before, but there are now 10 (chosen number of clusters) integer values which are the L2 norm distance (Euclidian distance) to the center of each of the (0,1,...9) clusters.


In [274]:
predictions = km.predict(doc_term_matrix)

The predicitions variable is an array of our documents with the cluster it is in. Below is an example of the output.


In [275]:
for doc in range(0, 5):
    print(f"Document {doc} belongs to cluster: {predictions[doc]}")
    print(f"Document text: {data_processed[doc][:10]}")

Document 0 belongs to cluster: 8
Document text: ['thi', 'spontan', 'report', 'receiv', 'pharmacist', 'refer', 'patient', 'unknown', 'age', 'gender']
Document 1 belongs to cluster: 3
Document text: ['suspect', 'clinic', 'vaccin', 'failur', 'suspect', 'covid-19', 'infect', 'thi', 'spontan', 'potenti']
Document 2 belongs to cluster: 3
Document text: ['suspect', 'clinic', 'vaccin', 'failur', 'suspect', 'covid-19', 'infect', 'thi', 'spontan', 'report']
Document 3 belongs to cluster: 7
Document text: ['irregular', 'menstrual', 'cycl', 'period', 'extrem', 'heavi', 'bad', 'cramp', 'thi', 'spontan']
Document 4 belongs to cluster: 7
Document text: ['breakthrough', 'heavi', 'period', 'longer', 'period', 'length', 'thi', 'spontan', 'report', 'receiv']


# Deliverable 3:

## Classifying the generated clusters


In [276]:
# row.name gives the index in the dataframe, predictions[document_id] gives us the cluster
# adding a cluster column to our dataframe
df["CLUSTER"] = df.apply(lambda row: predictions[row.name], axis=1)

In [277]:
df[["SYMPTOM_TEXT", "SERIOUS", "CLUSTER"]][:1000:100]

Unnamed: 0,SYMPTOM_TEXT,SERIOUS,CLUSTER
0,This spontaneous report was received from a ph...,False,8
100,got an ear infection on both ears with perfora...,False,7
200,This case involves a 27 years old male patient...,False,3
300,"Chills, fatigue, night sweats, fever, congesti...",False,4
400,"pain, followed by swelling,loss of arm movemen...",False,6
500,"Fever of 100.6F, headache, body ache and chills",False,4
600,I am a 63-year-old male. I received the Modern...,False,0
700,The patient received his first dose of the Jan...,False,1
800,"hives, treated with benadryl",False,1
900,Stabbing Chest pains from a few hours after sh...,False,6


In [278]:
serious_docs = df.loc[df["SERIOUS"] == True]
serious_docs[["SYMPTOM_TEXT", "SERIOUS", "CLUSTER"]][:10]

Unnamed: 0,SYMPTOM_TEXT,SERIOUS,CLUSTER
235,After vaccine I am experiencing numbness in my...,True,0
236,Bilateral large pulmonary emboli with right ve...,True,1
258,Multiple episodes of syncope resulting in faci...,True,1
289,I experienced a pounding heartbeat and chest t...,True,0
308,12/27 - chills and body aches. Light pain in h...,True,4
321,2/04/2021 Experienced stabbing R peri-orbital ...,True,1
324,"December 28, 2021 - Moderna 1 January 30, 2021...",True,0
325,"On April 12 2021, I suddenly heard a loud nois...",True,1
327,"Fatigue, nausea, vomiting, vertigo, brain fog",True,4
334,"chest pain on 12/20/21 intermittently, then co...",True,1


In [279]:
print(f"Number of serious cases: {len(serious_docs)}")

Number of serious cases: 4332


### Analyzing the clusters by count of serious cases


In [326]:
init_counts = dict()
for cluster_num in range(NUM_CLUSTERS):
    init_counts[cluster_num] = 0
cluster_serious_counts = Counter(init_counts)
# initializing counter in case we have a cluster that does not appear, we would want a count of 0
# and not have the cluster be undefined in the counter

cluster_serious_counts.update(
    serious_docs["CLUSTER"])  # count how many times each cluster appears

cluster_non_serious_counts = Counter(
    df["CLUSTER"]) - cluster_serious_counts  # total count - serious

cluster_percent_serious = dict()

for cluster, serious_count in cluster_serious_counts.items():
    non_serious_count = cluster_non_serious_counts[cluster]
    percent_serious = (serious_count /
                       (serious_count + non_serious_count)) * 100
    cluster_percent_serious[cluster] = percent_serious
    print(
        f"Cluster {cluster}: {serious_count} serious cases {non_serious_count} non serious cases ({percent_serious:.2f}% serious)"
    )

Cluster 0: 264 serious cases 2094 non serious cases (11.20% serious)
Cluster 1: 3073 serious cases 5489 non serious cases (35.89% serious)
Cluster 2: 350 serious cases 385 non serious cases (47.62% serious)
Cluster 3: 181 serious cases 2650 non serious cases (6.39% serious)
Cluster 4: 155 serious cases 1211 non serious cases (11.35% serious)
Cluster 5: 0 serious cases 841 non serious cases (0.00% serious)
Cluster 6: 164 serious cases 2306 non serious cases (6.64% serious)
Cluster 7: 124 serious cases 2509 non serious cases (4.71% serious)
Cluster 8: 21 serious cases 2639 non serious cases (0.79% serious)
Cluster 9: 0 serious cases 255 non serious cases (0.00% serious)


From the cluster results, we have clusters 5 and 9 with no cases, and cluster 1 with most by a large margin\
Cluster 1 however, was our largest cluster as well


### Finding average age for each cluster, analyzing sex frequency


In [346]:
cluster_age_sum = dict()
cluster_ages_found = dict()
cluster_sex_count = dict()  # ignoring rows with value U or N/A
cluster_avg_age = dict()

for cluster in range(NUM_CLUSTERS):
    cluster_age_sum[cluster] = 0
    cluster_ages_found[cluster] = 0
    cluster_avg_age[cluster] = None
    cluster_sex_count[cluster] = {"M": 0, "F": 0}

for sex, cluster in zip(df["SEX"], df["CLUSTER"]):
    if isinstance(sex, str):
        cluster_ages_found[cluster] += 1
        if sex == "M":
            cluster_sex_count[cluster]["M"] += 1
        elif sex == "F":
            cluster_sex_count[cluster]["F"] += 1

for age, cluster in zip(df["AGE_YRS"], df["CLUSTER"]):
    if isinstance(age, (int, float)) and not np.isnan(age):
        cluster_ages_found[cluster] += 1
        cluster_age_sum[cluster] += age

for cluster, age_sum in cluster_age_sum.items():
    cluster_avg_age[cluster] = age_sum / cluster_ages_found[cluster]

cluster_sex_pct = dict()

for cluster, sex_counts in cluster_sex_count.items():
    m_count, f_count = sex_counts.values()
    total = m_count+f_count
    cluster_sex_pct[cluster] = {"M": (m_count/total) * 100, "F": (f_count/total)* 100}

for cluster in range(NUM_CLUSTERS):
    pct_male = cluster_sex_pct[cluster]["M"] 
    pct_female = cluster_sex_pct[cluster]["F"]
    print(f"Cluster {cluster}: Average age {cluster_avg_age[cluster]:.2f}, {pct_male:.2f}% M, {pct_female:.2f}% F")

Cluster 0: Average age 22.81, 23.66% M, 76.34% F
Cluster 1: Average age 23.93, 41.55% M, 58.45% F
Cluster 2: Average age 27.66, 48.82% M, 51.18% F
Cluster 3: Average age 16.78, 40.71% M, 59.29% F
Cluster 4: Average age 23.11, 33.41% M, 66.59% F
Cluster 5: Average age 18.81, 63.86% M, 36.14% F
Cluster 6: Average age 22.25, 28.37% M, 71.63% F
Cluster 7: Average age 18.44, 34.11% M, 65.89% F
Cluster 8: Average age 16.37, 46.44% M, 53.56% F
Cluster 9: Average age 14.89, 46.85% M, 53.15% F


From this analysis, we can see that cluster 9 has the lowest average age, and cluster 2 has the highest average age.\
As a proportion of total cases for that cluster, cluster 9 has the lowest(0%) and cluster 2 has the highest(47.62%) of cases that were deemed serious.\
Clusters 0, 4, 6, 7 heavily leaned female, cluster 5 leaned male, and clusters 1,2,3,8,9 were a close split
Based on these findings below is a generated description for each cluster considering these factors

In [350]:
for cluster in range(NUM_CLUSTERS):
  description = ""

  avg_age = cluster_avg_age[cluster]
  if avg_age < 15: 
    description += "Young population, "
  elif avg_age > 25:
    description += "Older population, "
    
  pct_male, pct_female = cluster_sex_pct[cluster].values()
  
  if abs(pct_male - pct_female) > 10:
    if pct_male > pct_female:
      description += "Mostly Males, "
    else:
      description += "Mostly Females, "
  else:
    description += "Males and Females, "
    
  pct_serious = cluster_percent_serious[cluster]

  if pct_serious == 0:
    description += "No serious cases"
  elif pct_serious < 10:
    description += "Few serious cases"
  elif pct_serious > 30:
    description += "Many serious cases"
  else:
    description += "Some serious cases"
  print(f"Cluster {cluster}: {description}")

Cluster 0: Mostly Females, Some serious cases
Cluster 1: Mostly Females, Many serious cases
Cluster 2: Older population, Males and Females, Many serious cases
Cluster 3: Mostly Females, Few serious cases
Cluster 4: Mostly Females, Some serious cases
Cluster 5: Mostly Males, No serious cases
Cluster 6: Mostly Females, Few serious cases
Cluster 7: Mostly Females, Few serious cases
Cluster 8: Males and Females, Few serious cases
Cluster 9: Mostly children, Males and Females, No serious cases
