In [75]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/michael/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
df = pd.read_csv('2014VAERSDATA_cleaned.csv')
phrases = ['un', 'no', 'n/', 'nk']
for i, row in df.iterrows():
    if str(df.loc[i, 'CUR_ILL']).lower()[0:2] in phrases:
        df.loc[i, 'CUR_ILL'] = np.nan

    if str(df.loc[i, 'OTHER_MEDS']).lower()[0:2] in phrases:
        df.loc[i, 'OTHER_MEDS'] = np.nan

    if str(df.loc[i, 'HISTORY']).lower()[0:2] in phrases:
        df.loc[i, 'HISTORY'] = np.nan

In [77]:
df = df[["SYMPTOM_TEXT", "serious"]]
df.head()

Unnamed: 0,SYMPTOM_TEXT,serious
0,Hot pain at injection site; fever; fatigue; he...,No
1,"Red, hard bump at sight of injection.",No
2,"Fast heart rate, head ache, weakness, fever.",No
3,Baby had a hard time swallowing food and bottl...,No
4,"Severe ongoing headaches, severe vomiting afte...",No


In [78]:
print(df[["SYMPTOM_TEXT", "serious"]])

                                           SYMPTOM_TEXT serious
0     Hot pain at injection site; fever; fatigue; he...      No
1                 Red, hard bump at sight of injection.      No
2          Fast heart rate, head ache, weakness, fever.      No
3     Baby had a hard time swallowing food and bottl...      No
4     Severe ongoing headaches, severe vomiting afte...      No
...                                                 ...     ...
6828  Patient received PROQUAD 1-31-14 and mom calle...      No
6829  Patient stated on 3-21-14 arm was still sore a...     Yes
6830  Day following vaccines became faint, weak, tre...     Yes
6831  Started with increased temp after shot (lasted...      No
6832  Gave pt flu shot, walked out of immunizing roo...      No

[6833 rows x 2 columns]


In [79]:
# identify tokens
stopwords = set(nltk.corpus.stopwords.words("english"))

def get_word_tokens(row):
  if isinstance(row, str):
    text_without_stopwords = []
    for word in row.split():
      if word not in stopwords:
        text_without_stopwords.append(word)
    return word_tokenize(" ".join(text_without_stopwords))
  else:
    return "N\A"

df["tokens"] = df.apply(lambda row: get_word_tokens(row["SYMPTOM_TEXT"]), axis=1)
df.head()

Unnamed: 0,SYMPTOM_TEXT,serious,tokens
0,Hot pain at injection site; fever; fatigue; he...,No,"[Hot, pain, injection, site, ;, fever, ;, fati..."
1,"Red, hard bump at sight of injection.",No,"[Red, ,, hard, bump, sight, injection, .]"
2,"Fast heart rate, head ache, weakness, fever.",No,"[Fast, heart, rate, ,, head, ache, ,, weakness..."
3,Baby had a hard time swallowing food and bottl...,No,"[Baby, hard, time, swallowing, food, bottle, ...."
4,"Severe ongoing headaches, severe vomiting afte...",No,"[Severe, ongoing, headaches, ,, severe, vomiti..."


In [80]:
# named entities, parts of speech
def get_pos_tagged(row):
  row = row["tokens"]
  if row != "N\A":
    return nltk.pos_tag(row)
  else:
    return "N\A"

def get_named_entities(row):
  pos_tagged = row["parts_of_speech"]
  if pos_tagged != "N\A":
    return nltk.ne_chunk(pos_tagged)
  else:
    return "N\A"

df["parts_of_speech"] = df.apply(get_pos_tagged, axis=1)
df["named_entites"] = df.apply(get_named_entities, axis=1)
df.head()

Unnamed: 0,SYMPTOM_TEXT,serious,tokens,parts_of_speech,named_entites
0,Hot pain at injection site; fever; fatigue; he...,No,"[Hot, pain, injection, site, ;, fever, ;, fati...","[(Hot, NNP), (pain, NN), (injection, NN), (sit...","[[(Hot, NNP)], (pain, NN), (injection, NN), (s..."
1,"Red, hard bump at sight of injection.",No,"[Red, ,, hard, bump, sight, injection, .]","[(Red, NNP), (,, ,), (hard, JJ), (bump, NN), (...","[[(Red, NNP)], (,, ,), (hard, JJ), (bump, NN),..."
2,"Fast heart rate, head ache, weakness, fever.",No,"[Fast, heart, rate, ,, head, ache, ,, weakness...","[(Fast, NNP), (heart, NN), (rate, NN), (,, ,),...","[[(Fast, NNP)], (heart, NN), (rate, NN), (,, ,..."
3,Baby had a hard time swallowing food and bottl...,No,"[Baby, hard, time, swallowing, food, bottle, ....","[(Baby, NNP), (hard, JJ), (time, NN), (swallow...","[[(Baby, NNP)], (hard, JJ), (time, NN), (swall..."
4,"Severe ongoing headaches, severe vomiting afte...",No,"[Severe, ongoing, headaches, ,, severe, vomiti...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS..."


In [81]:
# stems, lematta
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

df["stems"] = df.apply(lambda row: stemmer.stem(" ".join(row["tokens"])), axis=1)
df.head()

Unnamed: 0,SYMPTOM_TEXT,serious,tokens,parts_of_speech,named_entites,stems
0,Hot pain at injection site; fever; fatigue; he...,No,"[Hot, pain, injection, site, ;, fever, ;, fati...","[(Hot, NNP), (pain, NN), (injection, NN), (sit...","[[(Hot, NNP)], (pain, NN), (injection, NN), (s...",hot pain injection site ; fever ; fatigue ; he...
1,"Red, hard bump at sight of injection.",No,"[Red, ,, hard, bump, sight, injection, .]","[(Red, NNP), (,, ,), (hard, JJ), (bump, NN), (...","[[(Red, NNP)], (,, ,), (hard, JJ), (bump, NN),...","red , hard bump sight injection ."
2,"Fast heart rate, head ache, weakness, fever.",No,"[Fast, heart, rate, ,, head, ache, ,, weakness...","[(Fast, NNP), (heart, NN), (rate, NN), (,, ,),...","[[(Fast, NNP)], (heart, NN), (rate, NN), (,, ,...","fast heart rate , head ache , weakness , fever ."
3,Baby had a hard time swallowing food and bottl...,No,"[Baby, hard, time, swallowing, food, bottle, ....","[(Baby, NNP), (hard, JJ), (time, NN), (swallow...","[[(Baby, NNP)], (hard, JJ), (time, NN), (swall...",baby hard time swallowing food bottle . develo...
4,"Severe ongoing headaches, severe vomiting afte...",No,"[Severe, ongoing, headaches, ,, severe, vomiti...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS...","severe ongoing headaches , severe vomiting two..."


For non-SAS users, the software may not have procedures to produce a diagram similar to the one from SAS, but it should provide a way to show the most frequent terms and how strongly they are associated with each other. A typical tf-idf table is shown in the csv results file in this Module’s folder. Show a similar tf-idf table with columns for POS, name entities, importance measure(s), and some visualization or table of word associations.

In [82]:
from collections import defaultdict

def get_counts(row):
  counts = defaultdict(int)
  row = row["stems"]
  if not isinstance(row, list):
    for word in row.split(" "):
      counts[word] += 1
  return counts

df["word_counts"] = df.apply(get_counts, axis=1)
df.head()

Unnamed: 0,SYMPTOM_TEXT,serious,tokens,parts_of_speech,named_entites,stems,word_counts
0,Hot pain at injection site; fever; fatigue; he...,No,"[Hot, pain, injection, site, ;, fever, ;, fati...","[(Hot, NNP), (pain, NN), (injection, NN), (sit...","[[(Hot, NNP)], (pain, NN), (injection, NN), (s...",hot pain injection site ; fever ; fatigue ; he...,"{'hot': 1, 'pain': 3, 'injection': 2, 'site': ..."
1,"Red, hard bump at sight of injection.",No,"[Red, ,, hard, bump, sight, injection, .]","[(Red, NNP), (,, ,), (hard, JJ), (bump, NN), (...","[[(Red, NNP)], (,, ,), (hard, JJ), (bump, NN),...","red , hard bump sight injection .","{'red': 1, ',': 1, 'hard': 1, 'bump': 1, 'sigh..."
2,"Fast heart rate, head ache, weakness, fever.",No,"[Fast, heart, rate, ,, head, ache, ,, weakness...","[(Fast, NNP), (heart, NN), (rate, NN), (,, ,),...","[[(Fast, NNP)], (heart, NN), (rate, NN), (,, ,...","fast heart rate , head ache , weakness , fever .","{'fast': 1, 'heart': 1, 'rate': 1, ',': 3, 'he..."
3,Baby had a hard time swallowing food and bottl...,No,"[Baby, hard, time, swallowing, food, bottle, ....","[(Baby, NNP), (hard, JJ), (time, NN), (swallow...","[[(Baby, NNP)], (hard, JJ), (time, NN), (swall...",baby hard time swallowing food bottle . develo...,"{'baby': 1, 'hard': 1, 'time': 1, 'swallowing'..."
4,"Severe ongoing headaches, severe vomiting afte...",No,"[Severe, ongoing, headaches, ,, severe, vomiti...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS...","[(Severe, JJ), (ongoing, VBG), (headaches, NNS...","severe ongoing headaches , severe vomiting two...","{'severe': 3, 'ongoing': 2, 'headaches': 1, ',..."


In [83]:
# get overall term frequencies
freqs = defaultdict(int)
num_docs = defaultdict(int)

for i, row in df.iterrows():
  counts = row["word_counts"]
  row_words = set(counts.keys())
  for word in row_words:
    num_docs[word] += 1
  for word, freq in counts.items():
    freqs[word] += freq

print(list(freqs.items())[:10])

[('hot', 206), ('pain', 1861), ('injection', 1884), ('site', 1734), (';', 808), ('fever', 972), ('fatigue', 159), ('headache', 382), ('muscle', 208), ('arm', 1808)]


In [84]:
tf_table = pd.DataFrame()

tf_table["term"] = [key for key in freqs.keys() if key not in ",.;()"] # get all terms, exclude punctuation
tf_table["freq"] = [freqs[term] for term in tf_table["term"]]
tf_table["numdocs"] = [num_docs[term] for term in tf_table["term"]]
tf_table["part_of_speech"] = [nltk.pos_tag(word_tokenize(term)) for term in tf_table["term"]]
tf_table.head(10)

Unnamed: 0,term,freq,numdocs,part_of_speech
0,hot,206,192,"[(hot, JJ)]"
1,pain,1861,1050,"[(pain, NN)]"
2,injection,1884,1248,"[(injection, NN)]"
3,site,1734,1169,"[(site, NN)]"
4,fever,972,749,"[(fever, NN)]"
5,fatigue,159,132,"[(fatigue, NN)]"
6,headache,382,294,"[(headache, NN)]"
7,muscle,208,174,"[(muscle, NN)]"
8,arm,1808,1130,"[(arm, NN)]"
9,shoulder,356,272,"[(shoulder, NN)]"


In [85]:
tf_table = tf_table.sort_values("numdocs", ascending=False) # sort with terms in most documents first
tf_table["rank"] = range(1, len(tf_table) + 1) # rank of terms by number of documents they occur in
tf_table.head(15).style.set_caption("Term Frequencies")

Unnamed: 0,term,freq,numdocs,part_of_speech,rank
139,patient,9223,3122,"[('patient', 'NN')]",1
151,received,4500,2985,"[('received', 'VBN')]",2
194,reported,5779,2520,"[('reported', 'VBD')]",3
148,this,3023,2342,"[('this', 'DT')]",4
150,report,2902,2245,"[('report', 'NN')]",5
169,information,3251,2196,"[('information', 'NN')]",6
179,unknown,4483,2126,"[('unknown', 'JJ')]",7
162,dose,3902,2120,"[('dose', 'NN')]",8
191,additional,2184,2120,"[('additional', 'JJ')]",9
182,the,5582,2089,"[('the', 'DT')]",10


In [86]:
# tf-idf

"""
 t = term
 d = document (row)
 N = num of documents (rows in dataframe)

 for each row in the original dataframe (each document):
   tf(t, d) = (count of t in d/ number of words in d)
   df(t) = "numdocs" of t
   idf(t) = N/df

  tf-idf(t, d) = tf(t, d) * idf(t)
"""

from math import log
N = len(df)

def calc_tf_idf(row):
  row_tf_idf_vals: list[tuple[str, int]] = [] # list of (word, tf_idf)
  row_word_counts = row["word_counts"]
  total_row_words = sum(row_word_counts.values()) # number of words in this document
  for word, count in row_word_counts.items():
    if word in ",.;()+-_": continue # ignoring punctuation
    df = num_docs[word] # num of occurances in documents of this word
    term_freq = count / total_row_words # freq of this term divided by total num words in this doc
    idf = log(N / df)
    tf_idf = term_freq * idf
    row_tf_idf_vals.append((word, tf_idf))
  row_tf_idf_vals.sort(key=lambda word_idf_tuple: -word_idf_tuple[1]) # sort by tf_idf value
  return row_tf_idf_vals

df["tf_idf"] = df.apply(calc_tf_idf, axis=1)
df[["SYMPTOM_TEXT", "tf_idf"]].head(20)

Unnamed: 0,SYMPTOM_TEXT,tf_idf
0,Hot pain at injection site; fever; fatigue; he...,"[(fatigue, 0.26311447814629385), (shoulder, 0...."
1,"Red, hard bump at sight of injection.","[(sight, 0.8407257308020483), (bump, 0.7494285..."
2,"Fast heart rate, head ache, weakness, fever.","[(fast, 0.6029358652222326), (rate, 0.45075618..."
3,Baby had a hard time swallowing food and bottl...,"[(ignored, 0.19372314081478173), (bottle, 0.16..."
4,"Severe ongoing headaches, severe vomiting afte...","[(ongoing, 0.2458748044676316), (severe, 0.227..."
5,Severe headache with ongoing headaches since i...,"[(ongoing, 0.6782753226693286), (since, 0.3342..."
6,Vaccinated 12/17 and got bumps 12/26. Thought ...,"[(got, 0.46389385255004756), (1/1, 0.383892134..."
7,Symptoms included nonstop screaming and franti...,"[(frantic, 0.5193834761635752), (jerky, 0.4786..."
8,"Severe joint and muscle pain, chronic fatigue,...","[(muscle, 0.3552061737644757), (chronic, 0.344..."
9,Extreme swelling on vaccinated leg. Red and h...,"[(irritability, 0.3139702924684907), (walking,..."


# Deliverables 3, 4:
> Respond to Exercise 4. at the end of Chapter 2 of the text.  Extend the consideration of just stems to lemmata and synonyms.  How does the use of a single token, like the stem, lemmata, and one of the terms in each SynSet (a set of all the different words meaning  the same thing in a context), enhance the reliability of identifying the important concepts in a document/corpus?

The use of tokens, synonyms, stems. etc. improves the "readabillity" of the text via standardizing the variables that are taken in by the machine learning model. This process makes it possible for the model to compare the interactions of these terms throughout the text by distilling the core meaning behind the word without including gramatical excess. This standardization allows the essential logical functions of asserting equality and interaction of terms across text(s) possible. Without it the algorithm would get caught up on meaningless patterns which exist only to aid human comprehension of text.


> Briefly describe what each of the outputs show and how their content relates to the material in Chapter 2 of Weiss et al. You can include images of the SAS panels or other software graphical output in your report but do not show the entire tf-idf table, just show the column headings and a few of the rows. If later rows have interesting entries try to just copy those or identify them in other ways and mention why you think they are relevant and/or interesting.

The second to last table shown shows terms sorted by the ones that appear in the most number of documents(rows) from the data set. It provides and presents the terms, frequencies, and number of documents the terms were present in, which is needed for the tf-idf scores.
The last table shows the tf-idf values for each row of text, and shows the values given to each word which gives us information about the relative rarity of the word, or the importance of that word that may differentiate it against other documents.