In [60]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string

In [61]:
re2019 = pd.read_csv('GOUT-CC-2019-CORPUS-REDACTED.csv')
re2019.head()

Unnamed: 0,Chief Complaint,Predict,Consensus
0,"""been feeling bad"" last 2 weeks & switched BP ...",N,-
1,"""can't walk"", reports onset at <<TIME>>. orien...",Y,N
2,"""dehydration"" Chest hurts, hips hurt, cramps P...",Y,Y
3,"""gout flare up"" L arm swelling x 1 week. denie...",Y,Y
4,"""heart racing,""dyspnea, and orthopnea that has...",N,-


In [62]:
syn2019 = pd.read_csv('GOUT-CC-2019-CORPUS-SYNTHETIC.csv')
syn2019.head()

Unnamed: 0,Chief Complaint,Predict,Consensus
0,"""been feeling bad"" last 2 weeks & switched BP ...",N,-
1,"""can't walk"", reports onset at 0830 am. orient...",Y,N
2,"""dehydration"" Chest hurts, hips hurt, cramps P...",Y,Y
3,"""gout flare up"" L arm swelling x 1 week. denie...",Y,Y
4,"""heart racing,""dyspnea, and orthopnea that has...",N,-


In [63]:
re2020 = pd.read_csv('GOUT-CC-2020-CORPUS-REDACTED.csv')
re2020.head()

Unnamed: 0,Chief Complaint,Predict,Consensus
0,"""I dont know whats going on with my head, its ...",N,-
1,"""i've been depressed for a few weeks now, i'm ...",N,-
2,"Altercation while making arrest, c/o R hand pa...",N,N
3,Cut on L upper thigh wtih saw. Bleeding contro...,N,N
4,"Dysuria x1 week. hx: hysterectomy, gerd, bipolar",N,-


In [64]:
syn2020 = pd.read_csv('GOUT-CC-2020-CORPUS-SYNTHETIC.csv')
syn2020.head()

Unnamed: 0,Chief Complaint,Predict,Consensus
0,"""I dont know whats going on with my head, its ...",N,-
1,"""i've been depressed for a few weeks now, i'm ...",N,-
2,"Altercation while making arrest, c/o R hand pa...",N,N
3,Cut on L upper thigh wtih saw. Bleeding contro...,N,N
4,"Dysuria x1 week. hx: hysterectomy, gerd, bipolar",N,-


In [65]:
print(re2019.columns, syn2019.columns, re2020.columns, syn2020.columns, sep="\n")

Index(['Chief Complaint', 'Predict', 'Consensus'], dtype='object')
Index(['Chief Complaint', 'Predict', 'Consensus'], dtype='object')
Index(['Chief Complaint', 'Predict', 'Consensus'], dtype='object')
Index(['Chief Complaint', 'Predict', 'Consensus'], dtype='object')


In [66]:
syn2019.describe

<bound method NDFrame.describe of                                        Chief Complaint Predict Consensus
0    "been feeling bad" last 2 weeks & switched BP ...       N         -
1    "can't walk", reports onset at 0830 am. orient...       Y         N
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y
4    "heart racing,"dyspnea, and orthopnea that has...       N         -
..                                                 ...     ...       ...
295  upper abd/R side chest pain x1 month, new onse...       N         N
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N
297  walked outside of a gas station and began bein...       N         -
298  was getting prepped for colonoscopy and was se...       N         N
299  Was seen at CGH after an MVC. Pt states they r...       N         N

[300 rows x 3 columns]>

In [67]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct
re2019["Chief Complaint"] = re2019["Chief Complaint"].apply(lambda x: remove_punctuation(x))
re2019.head()

Unnamed: 0,Chief Complaint,Predict,Consensus
0,been feeling bad last 2 weeks switched BP med...,N,-
1,cant walk reports onset at TIME oriented x2 ao...,Y,N
2,dehydration Chest hurts hips hurt cramps PMH H...,Y,Y
3,gout flare up L arm swelling x 1 week denies a...,Y,Y
4,heart racingdyspnea and orthopnea that has bee...,N,-


In [68]:
print(re2019.shape, syn2019.shape, re2020.shape, syn2020.shape, sep="\n")

(300, 3)
(300, 3)
(8137, 3)
(8137, 3)


In [69]:
tokenizer = RegexpTokenizer(r'\w+')

In [70]:
re2019['Chief Complaint'] = re2019['Chief Complaint'].apply(lambda x: tokenizer.tokenize(x.lower()))
re2019['Chief Complaint'].head(20)

0     [been, feeling, bad, last, 2, weeks, switched,...
1     [cant, walk, reports, onset, at, time, oriente...
2     [dehydration, chest, hurts, hips, hurt, cramps...
3     [gout, flare, up, l, arm, swelling, x, 1, week...
4     [heart, racingdyspnea, and, orthopnea, that, h...
5     [i, started, breathing, hard, hx, htn, gout, a...
6     [i, think, i, have, a, gout, flare, up, l, wri...
7     [i, want, to, see, if, i, have, an, infection,...
8     [my, gout, done, flared, up, on, me, co, r, an...
9     [my, gout, is, hurting, me, reports, bilateral...
10    [out, of, my, mental, health, pills, and, im, ...
11      [umbearable, right, footankle, pain, pmh, gout]
12    [hospital, transfer, for, renal, transplant, p...
13    [hospital, reports, feeling, nervous, about, h...
14    [organization, dizziness, and, sob, recent, ht...
15    [organization, intermittent, cp, since, this, ...
16    [organization, right, sided, abdominal, pain, ...
17    [hospital, tx, neck, pain, and, right, arm

In [71]:
stemmer = PorterStemmer()

In [72]:
def word_stemmer(text):
    stem_text = " ".join([stemmer.stem(i) for i in text])
    return stem_text

In [73]:
re2019['Chief Complaint'] = re2019['Chief Complaint'].apply(lambda x: word_stemmer(x))

In [74]:
re2019['Chief Complaint']

0      been feel bad last 2 week switch bp medic last...
1      cant walk report onset at time orient x2 aorti...
2      dehydr chest hurt hip hurt cramp pmh hip repla...
3      gout flare up l arm swell x 1 week deni ani ot...
4      heart racingdyspnea and orthopnea that ha been...
                             ...                        
295    upper abdr side chest pain x1 month new onset ...
296    upper lip swell x one day pmh htn copd blind i...
297    walk outsid of a ga station and began be shock...
298    wa get prep for colonoscopi and wa sent to er ...
299    wa seen at hospit after an mvc pt state they r...
Name: Chief Complaint, Length: 300, dtype: object

In [82]:
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michelleide/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [83]:
lemmatizer = WordNetLemmatizer()

In [84]:
def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text

In [85]:
re2019['Chief Complaint'].apply(lambda x: word_lemmatizer(x))

0      [b, e, e, n,  , f, e, e, l,  , b, a, d,  , l, ...
1      [c, a, n, t,  , w, a, l, k,  , r, e, p, o, r, ...
2      [d, e, h, y, d, r,  , c, h, e, s, t,  , h, u, ...
3      [g, o, u, t,  , f, l, a, r, e,  , u, p,  , l, ...
4      [h, e, a, r, t,  , r, a, c, i, n, g, d, y, s, ...
                             ...                        
295    [u, p, p, e, r,  , a, b, d, r,  , s, i, d, e, ...
296    [u, p, p, e, r,  , l, i, p,  , s, w, e, l, l, ...
297    [w, a, l, k,  , o, u, t, s, i, d,  , o, f,  , ...
298    [w, a,  , g, e, t,  , p, r, e, p,  , f, o, r, ...
299    [w, a,  , s, e, e, n,  , a, t,  , h, o, s, p, ...
Name: Chief Complaint, Length: 300, dtype: object