In [177]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter
from scipy.stats import chisquare

In [178]:
df = pd.read_csv('2022VAERSData.csv', encoding='cp1252', low_memory=False)
df.dropna(subset=["SYMPTOM_TEXT"], inplace=True, axis=0)
df = df.reset_index()


def is_serious(row):
    columns = ["DIED", "ER_VISIT", "HOSPITAL", "DISABLE"]
    for val in row[columns]:
        if isinstance(val, str) and val.upper() == "Y":
            return True
    return False


df["SERIOUS"] = df.apply(is_serious, axis=1)
print(f"Starting number of documents: {len(df)}")

Starting number of documents: 24711


# Deliverable 1:

## Clustering the documents

Overall goal:  
Without using the target variable, Serious, DISABLED, DIED, ER_VISIT, and HOSPITAL, and Score in data, cluster the rows of the data set into both a given number of clusters.


For this we will use only the symptom text since all other variables are discrete and easy to cluster by.


In [179]:
data = df['SYMPTOM_TEXT']
data = np.array(data)

In [180]:
stop_words = set(
    stopwords.words('english') +
    [".", ":", ";", "(", ")", ",", "#", "'", "\"", '!', '$', '%', '&', "''"])

porter = PorterStemmer()


def stopword_remover(lst):
    return [word for word in lst if word not in stop_words]


def stemmer(lst):
    return [porter.stem(word) for word in lst if word]


def text_preprocess(d):
    tokens = np.array([word_tokenize(i) for i in d], dtype=object)
    tok_fil = [stopword_remover(doc) for doc in tokens]
    tok_stem = [stemmer(doc) for doc in tok_fil]
    return tok_stem

In [181]:
data_processed = text_preprocess(data)

In [182]:
data[1]  # original document (SYMPTOM_TEXT column)

'SUSPECTED CLINICAL VACCINATION FAILURE; SUSPECTED COVID-19 INFECTION; This spontaneous potential legal report received from a patient concerned a patient of unspecified age and sex. Initial information was processed along with the additional information received from a regulatory authority on 30-DEC-2021. The patient\'s height, and weight were not reported. No past medical history or concurrent conditions were reported. The patient received covid-19 vaccine ad26.cov2.s (dose number in series: 1) (suspension for injection, route of admin not reported, batch number: Unknown and expiry: UNKNOWN) dose, start therapy date were not reported, administered 1 in total for prophylactic vaccination. The batch number was not reported. The Company is unable to perform follow-up to request batch/lot numbers. No concomitant medications were reported.  It was reported that on an unspecified dates, the patient caught covid twice(coded as suspected covid-19 infection and suspected clinical vaccination 

In [183]:
data_processed[1][:10]
# stems of terms from the document with punctuation and stopwords excluded

['suspect',
 'clinic',
 'vaccin',
 'failur',
 'suspect',
 'covid-19',
 'infect',
 'thi',
 'spontan',
 'potenti']

Below we create a TF-IDF matrix and use it with sklearn's implementation of KMeans Clustering to cluster our documents


In [184]:
NUM_CLUSTERS = 10

vectorizer = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False)
doc_term_matrix = vectorizer.fit_transform(data_processed)
km = KMeans(n_clusters=NUM_CLUSTERS)
clusters = km.fit_transform(doc_term_matrix)

In [185]:
# we are returned a numpy sparse matrix, converting to a traditional array for simpler accessing and usage later
doc_term_matrix = doc_term_matrix.toarray()
terms = vectorizer.get_feature_names_out()
print(f"Matrix dimensions: {len(doc_term_matrix)} x {len(doc_term_matrix[0])}")
print(f"Num documents: {len(data_processed)}, Num terms: {len(terms)}")

Matrix dimensions: 24711 x 33840
Num documents: 24711, Num terms: 33840


In [186]:
print(terms[10:20])

["'bursiti" "'caller" "'chronic" "'clock" "'clonidin" "'close" "'coconut"
 "'congenit" "'contain" "'coupl"]


Example words with tf-idf values for document 0


In [187]:
words_found = 0  # limit to 10
for i in range(15000, len(doc_term_matrix[0])):
    if words_found == 10:
        break
    if doc_term_matrix[0][i] != 0:
        words_found += 1
        print(terms[i], doc_term_matrix[0][i])


date 0.05809711063342075
event 0.06976337740806901
excurs 0.09043640930109825
experi 0.11587188651559706
expir 0.2685139372913047
frequenc 0.11559960285534095
gardasil 0.18088298958406868
gender 0.10535311717275656
histori 0.055034417858322096
hpv 0.1765701887734298


doc_term_matrix is our TF-IDF matrix which is indexed by the document id (location in our original data/list of documents)\
The y axis indicies are the indices of the term in our "terms" array\
A zero value indiciates the term is not present in the document, a non zero value is the calculated tf-idf value


In [188]:
clusters

array([[1.01283789, 0.93808449, 1.34471716, ..., 0.99573712, 1.06374519,
        1.17247838],
       [0.99068622, 0.96628939, 1.32328094, ..., 0.99112723, 1.04435309,
        1.14216012],
       [0.98787647, 0.96700522, 1.32333229, ..., 0.98329058, 1.04851699,
        1.13003186],
       ...,
       [1.02188448, 1.0046347 , 1.35000376, ..., 0.9999387 , 1.06384463,
        1.18307735],
       [1.02059458, 1.00512282, 1.35011876, ..., 1.00002143, 1.0639855 ,
        1.18331655],
       [1.0233707 , 1.00743541, 1.35036285, ..., 1.00093077, 1.06520125,
        1.18382416]])

In [189]:
clusters[0]  # distances from document 0 to each of the 10 cluster centers

array([1.01283789, 0.93808449, 1.34471716, 1.34657479, 1.00192982,
       1.28700389, 0.96232353, 0.99573712, 1.06374519, 1.17247838])

"clusters" is our transformed matrix, there are rows for each document as before, but there are now 10 (chosen number of clusters) integer values which are the L2 norm distance (Euclidian distance) to the center of each of the (0,1,...9) clusters.


In [190]:
predictions = km.predict(doc_term_matrix)

The predicitions variable is an array of our documents with the cluster it is in. Below is an example of the output.


In [191]:
for doc in range(0, 5):
    print(f"Document {doc} belongs to cluster: {predictions[doc]}")
    print(f"Document text: {data_processed[doc][:10]}")

Document 0 belongs to cluster: 1
Document text: ['thi', 'spontan', 'report', 'receiv', 'pharmacist', 'refer', 'patient', 'unknown', 'age', 'gender']
Document 1 belongs to cluster: 4
Document text: ['suspect', 'clinic', 'vaccin', 'failur', 'suspect', 'covid-19', 'infect', 'thi', 'spontan', 'potenti']
Document 2 belongs to cluster: 6
Document text: ['suspect', 'clinic', 'vaccin', 'failur', 'suspect', 'covid-19', 'infect', 'thi', 'spontan', 'report']
Document 3 belongs to cluster: 4
Document text: ['irregular', 'menstrual', 'cycl', 'period', 'extrem', 'heavi', 'bad', 'cramp', 'thi', 'spontan']
Document 4 belongs to cluster: 4
Document text: ['breakthrough', 'heavi', 'period', 'longer', 'period', 'length', 'thi', 'spontan', 'report', 'receiv']


# Deliverable 2:
Using the clusters and the frequency of serious cases within each cluster, we will test whether being in a cluster has a statistical significance of the document being serious. The null hypothesis is that there is no relationship. 

In [192]:
# row.name gives the index in the dataframe, predictions[document_id] gives us the cluster
# adding a cluster column to our dataframe
df["CLUSTER"] = df.apply(lambda row: predictions[row.name], axis=1)
df[["SYMPTOM_TEXT", "SERIOUS", "CLUSTER"]][:1000:100]

Unnamed: 0,SYMPTOM_TEXT,SERIOUS,CLUSTER
0,This spontaneous report was received from a ph...,False,1
100,got an ear infection on both ears with perfora...,False,4
200,This case involves a 27 years old male patient...,False,6
300,"Chills, fatigue, night sweats, fever, congesti...",False,7
400,"pain, followed by swelling,loss of arm movemen...",False,7
500,"Fever of 100.6F, headache, body ache and chills",False,7
600,I am a 63-year-old male. I received the Modern...,False,8
700,The patient received his first dose of the Jan...,False,1
800,"hives, treated with benadryl",False,7
900,Stabbing Chest pains from a few hours after sh...,False,7


In [193]:
serious_docs = df.loc[df["SERIOUS"] == True]
serious_docs[["SYMPTOM_TEXT", "SERIOUS", "CLUSTER"]][:10]

Unnamed: 0,SYMPTOM_TEXT,SERIOUS,CLUSTER
235,After vaccine I am experiencing numbness in my...,True,8
236,Bilateral large pulmonary emboli with right ve...,True,0
258,Multiple episodes of syncope resulting in faci...,True,7
289,I experienced a pounding heartbeat and chest t...,True,8
308,12/27 - chills and body aches. Light pain in h...,True,7
321,2/04/2021 Experienced stabbing R peri-orbital ...,True,7
324,"December 28, 2021 - Moderna 1 January 30, 2021...",True,8
325,"On April 12 2021, I suddenly heard a loud nois...",True,7
327,"Fatigue, nausea, vomiting, vertigo, brain fog",True,7
334,"chest pain on 12/20/21 intermittently, then co...",True,7


In [194]:
print(f"Number of serious cases: {len(serious_docs)}")

Number of serious cases: 4332


In [195]:
freq_of_clusters = list(Counter(df["CLUSTER"]).items())
freq_of_clusters.sort(key=lambda clus_count: clus_count[0])

init_counts = dict()
for cluster_num in range(NUM_CLUSTERS):
    init_counts[cluster_num] = 0
cluster_serious_counts = Counter(init_counts)

cluster_serious_counts.update(serious_docs["CLUSTER"])
print(cluster_serious_counts)

Counter({0: 1886, 7: 1561, 8: 269, 2: 229, 4: 201, 6: 99, 1: 47, 9: 40, 3: 0, 5: 0})


In [196]:
print(list(cluster_serious_counts.values()))
chi_sq, p = chisquare(list(cluster_serious_counts.values()))
print(chi_sq,p)

[1886, 47, 229, 0, 201, 0, 99, 1561, 269, 40]
9916.684210526317 0.0


# Deliverable 3:

## Classifying the generated clusters


### Analyzing the clusters by count of serious cases


In [197]:
init_counts = dict()
for cluster_num in range(NUM_CLUSTERS):
    init_counts[cluster_num] = 0

# initializing counter in case we have a cluster that does not appear, we would want a count of 0
# and not have the cluster be undefined in the counter

cluster_non_serious_counts = Counter(
    df["CLUSTER"]) - cluster_serious_counts  # total count - serious

cluster_percent_serious = dict()

for cluster, serious_count in cluster_serious_counts.items():
    non_serious_count = cluster_non_serious_counts[cluster]
    percent_serious = (serious_count /
                       (serious_count + non_serious_count)) * 100
    cluster_percent_serious[cluster] = percent_serious
    print(
        f"Cluster {cluster}: {serious_count} serious cases {non_serious_count} non serious cases ({percent_serious:.2f}% serious)"
    )

Cluster 0: 1886 serious cases 364 non serious cases (83.82% serious)
Cluster 1: 47 serious cases 3094 non serious cases (1.50% serious)
Cluster 2: 229 serious cases 74 non serious cases (75.58% serious)
Cluster 3: 0 serious cases 841 non serious cases (0.00% serious)
Cluster 4: 201 serious cases 2917 non serious cases (6.45% serious)
Cluster 5: 0 serious cases 255 non serious cases (0.00% serious)
Cluster 6: 99 serious cases 2116 non serious cases (4.47% serious)
Cluster 7: 1561 serious cases 8200 non serious cases (15.99% serious)
Cluster 8: 269 serious cases 2146 non serious cases (11.14% serious)
Cluster 9: 40 serious cases 372 non serious cases (9.71% serious)


### Finding average age for each cluster, analyzing sex frequency


In [198]:
cluster_age_sum = dict()
cluster_ages_found = dict()
cluster_sex_count = dict()  # ignoring rows with value U or N/A
cluster_avg_age = dict()

for cluster in range(NUM_CLUSTERS):
    cluster_age_sum[cluster] = 0
    cluster_ages_found[cluster] = 0
    cluster_avg_age[cluster] = None
    cluster_sex_count[cluster] = {"M": 0, "F": 0}

for sex, cluster in zip(df["SEX"], df["CLUSTER"]):
    if isinstance(sex, str):
        cluster_ages_found[cluster] += 1
        if sex == "M":
            cluster_sex_count[cluster]["M"] += 1
        elif sex == "F":
            cluster_sex_count[cluster]["F"] += 1

for age, cluster in zip(df["AGE_YRS"], df["CLUSTER"]):
    if isinstance(age, (int, float)) and not np.isnan(age):
        cluster_ages_found[cluster] += 1
        cluster_age_sum[cluster] += age

for cluster, age_sum in cluster_age_sum.items():
    cluster_avg_age[cluster] = age_sum / cluster_ages_found[cluster]

cluster_sex_pct = dict()

for cluster, sex_counts in cluster_sex_count.items():
    m_count, f_count = sex_counts.values()
    total = m_count + f_count
    cluster_sex_pct[cluster] = {
        "M": (m_count / total) * 100,
        "F": (f_count / total) * 100
    }

for cluster in range(NUM_CLUSTERS):
    pct_male = cluster_sex_pct[cluster]["M"]
    pct_female = cluster_sex_pct[cluster]["F"]
    print(
        f"Cluster {cluster}: Average age {cluster_avg_age[cluster]:.2f}, {pct_male:.2f}% M, {pct_female:.2f}% F"
    )


Cluster 0: Average age 32.24, 50.69% M, 49.31% F
Cluster 1: Average age 16.09, 45.71% M, 54.29% F
Cluster 2: Average age 30.29, 51.32% M, 48.68% F
Cluster 3: Average age 18.81, 63.86% M, 36.14% F
Cluster 4: Average age 17.45, 35.66% M, 64.34% F
Cluster 5: Average age 14.89, 46.85% M, 53.15% F
Cluster 6: Average age 18.32, 38.24% M, 61.76% F
Cluster 7: Average age 21.82, 35.54% M, 64.46% F
Cluster 8: Average age 22.82, 23.23% M, 76.77% F
Cluster 9: Average age 24.54, 44.55% M, 55.45% F


In [199]:
for cluster in range(NUM_CLUSTERS):
    description = ""

    avg_age = cluster_avg_age[cluster]
    if avg_age < 18:
        description += "Young population, "
    elif avg_age >= 25:
        description += "Older population, "

    pct_male, pct_female = cluster_sex_pct[cluster].values()

    if abs(pct_male - pct_female) > 10:
        if pct_male > pct_female:
            description += "Mostly Males, "
        else:
            description += "Mostly Females, "
    else:
        description += "Males and Females, "

    pct_serious = cluster_percent_serious[cluster]

    if pct_serious == 0:
        description += "No serious cases"
    elif pct_serious < 10:
        description += "Few serious cases"
    elif pct_serious > 30:
        description += "Many serious cases"
    else:
        description += "Some serious cases"
    print(f"Cluster {cluster}: {description}")


Cluster 0: Older population, Males and Females, Many serious cases
Cluster 1: Young population, Males and Females, Few serious cases
Cluster 2: Older population, Males and Females, Many serious cases
Cluster 3: Mostly Males, No serious cases
Cluster 4: Young population, Mostly Females, Few serious cases
Cluster 5: Young population, Males and Females, No serious cases
Cluster 6: Mostly Females, Few serious cases
Cluster 7: Mostly Females, Some serious cases
Cluster 8: Mostly Females, Some serious cases
Cluster 9: Mostly Females, Few serious cases
