In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score
from sklearn import tree

In [6]:
df = pd.read_csv('2022VAERSData.csv', encoding='cp1252', low_memory=False)
df.dropna(subset=["SYMPTOM_TEXT"], inplace=True, axis=0)
df = df.reset_index()


def is_serious(row):
    columns = ["DIED", "ER_VISIT", "HOSPITAL", "DISABLE"]
    for val in row[columns]:
        if isinstance(val, str) and val.upper() == "Y":
            return True
    return False


df["SERIOUS"] = df.apply(is_serious, axis=1)
print(f"Starting number of documents: {len(df)}")

Starting number of documents: 24711


In [7]:
data = df['SYMPTOM_TEXT']
data = np.array(data)
target = df["SERIOUS"]
target = np.array(target)

In [8]:
data_train, data_test, y_train, y_test = train_test_split(data, target, 
                                                          test_size=0.33, random_state=42)

In [9]:
stop_words = set(
    stopwords.words('english') +
    [".", ":", ";", "(", ")", ",", "#", "'", "\"", '!', '$', '%', '&', "''"])

porter = PorterStemmer()


def stopword_remover(lst):
    return [word for word in lst if word not in stop_words]


def stemmer(lst):
    return [porter.stem(word) for word in lst if word]


def text_preprocess(d):
    tokens = np.array([word_tokenize(i) for i in d], dtype=object)
    tok_fil = [stopword_remover(doc) for doc in tokens]
    tok_stem = [stemmer(doc) for doc in tok_fil]
    return tok_stem

In [10]:
data_train_processed = text_preprocess(data_train)

In [11]:
data_test_processed = text_preprocess(data_test)

In [12]:
data_processed = text_preprocess(data)

In [13]:
vectorizer = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False)
doc_term_matrix = vectorizer.fit_transform(data_processed)

In [14]:
data_train, data_test, y_train, y_test = train_test_split(doc_term_matrix, target, 
                                                          test_size=0.33, random_state=42)

In [15]:
doc_term_matrix.shape

(24711, 33840)

In [16]:
dtc = tree.DecisionTreeClassifier()
dtc.fit(data_train, y_train)

DecisionTreeClassifier()

In [17]:
pred = dtc.predict(data_test)

In [18]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
confusion_matrix(y_test, pred)

array([[6434,  310],
       [ 395, 1016]])

In [19]:
f"True positive {tp} True negative {tn} False positive {fp} False negative {fn}"

'True positive 1016 True negative 6434 False positive 310 False negative 395'

In [20]:
f"Accuracy: {accuracy_score(y_test, pred)}"

'Accuracy: 0.9135499693439607'

In [21]:
f"Precision: {precision_score(y_test, pred)}"

'Precision: 0.7662141779788839'

In [22]:
f"F1 score: {f1_score(y_test, pred)}"

'F1 score: 0.7424187066130801'

In [23]:
f"Recall: {recall_score(y_test, pred)}"

'Recall: 0.7200566973777462'

In [24]:
f"Specificity: {tn / (tn+fp)}"

'Specificity: 0.9540332147093713'

In [26]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import display,SVG
import pydotplus

dot_data = StringIO()
export_graphviz(dtc, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,class_names=['Serious','Not Serious'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  

In [27]:
graph.write_svg("tree.svg")
display(SVG(graph.create_svg()))

InvocationException: GraphViz's executables not found