In [164]:
# Sample two-sentence passage passed to the spaCy, NLTK and scikit-learn cells in this notebook.
Example_Sentence = "Patients who in late middle age have smoked 20 cigarettes a day since their teens constitute an at-risk group. One thing they’re clearly at risk for is the acute sense of guilt that a clinician can incite, which immediately makes a consultation tense."

In [171]:
# Import spaCy in the cell that first needs it so this cell also runs on a
# fresh kernel (Restart & Run All) instead of failing with a NameError.
import spacy

# Small English pipeline; spacy_process reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')


# Spacy

In [172]:
def spacy_process(text):
    """Run the spaCy pipeline on ``text`` and print the cleaned lemma list.

    Two results are printed: the lemma of every token, and the same list
    after stop words and punctuation tokens have been removed. The function
    uses the module-level ``nlp`` pipeline.

    Args:
        text: Raw text to process.

    Returns:
        None. All results are written to stdout.
    """
    doc = nlp(text)

    # Tokenization and lemmatization are both done by the spaCy pipeline call.
    lemma_list = [token.lemma_ for token in doc]
    print("Tokenize+Lemmatize:")
    print(lemma_list)

    # Filter out lemmas whose vocabulary entry is flagged as a stop word.
    filtered_sentence = [word for word in lemma_list if not nlp.vocab[word].is_stop]

    # Remove punctuation. A new list is built instead of calling remove()
    # inside a loop over the same list: mutating a list while iterating it
    # makes the iterator skip the element that follows each removal, so
    # back-to-back punctuation tokens (e.g. "?", "!") were not all removed.
    punctuations = "?:!.,;"
    filtered_sentence = [word for word in filtered_sentence if word not in punctuations]
    print(" ")
    print("Remove stopword & punctuation: ")
    print(filtered_sentence)


In [168]:
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

# Module-level Porter stemmer instance; nltk_process calls p_stemmer.stem() on every token.
p_stemmer = PorterStemmer()
# Snowball stemmer alternative, currently disabled:
#s_stemmer = SnowballStemmer(language='english')

def nltk_process(text):
    """Run the NLTK pipeline on ``text`` and print the cleaned token list.

    Steps: word tokenization, Porter stemming (module-level ``p_stemmer``),
    WordNet lemmatization of the stems, stop-word removal, and punctuation
    removal. The stemmed-and-lemmatized list and the final filtered list are
    both printed.

    Args:
        text: Raw text to process.

    Returns:
        None. All results are written to stdout.
    """
    # Tokenization
    nltk_tokenList = word_tokenize(text)

    # Stemming
    nltk_stemedList = [p_stemmer.stem(word) for word in nltk_tokenList]

    # Lemmatization
    # NOTE(review): the lemmatizer is fed already-stemmed tokens, which turns
    # stems such as 'sens'/'tens' into 'sen'/'ten'. Confirm that stacking
    # both steps is intended rather than lemmatizing the raw tokens.
    wordnet_lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = [wordnet_lemmatizer.lemmatize(word) for word in nltk_stemedList]

    print("Stemming + Lemmatization")
    print(nltk_lemmaList)

    # Filter stop words
    nltk_stop_words = set(stopwords.words("english"))
    filtered_sentence = [w for w in nltk_lemmaList if w not in nltk_stop_words]

    # Remove punctuation. A new list is built instead of calling remove()
    # inside a loop over the same list: mutating a list while iterating it
    # makes the iterator skip the element that follows each removal, so
    # back-to-back punctuation tokens were not all removed.
    punctuations = "?:!.,;"
    filtered_sentence = [word for word in filtered_sentence if word not in punctuations]
    print(" ")
    print("Remove stopword & Punctuation")
    print(filtered_sentence)


In [173]:
%%time

# Run the NLTK pipeline on the sample text; the %%time magic above reports this cell's wall time.
nltk_process(Example_Sentence)

Stemming + Lemmatization
['patient', 'who', 'in', 'late', 'middl', 'age', 'have', 'smoke', '20', 'cigarett', 'a', 'day', 'sinc', 'their', 'teen', 'constitut', 'an', 'at-risk', 'group', '.', 'one', 'thing', 'they', '’', 're', 'clearli', 'at', 'risk', 'for', 'is', 'the', 'acut', 'sen', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incit', ',', 'which', 'immedi', 'make', 'a', 'consult', 'ten', '.']
 
Remove stopword & Punctuation
['patient', 'late', 'middl', 'age', 'smoke', '20', 'cigarett', 'day', 'sinc', 'teen', 'constitut', 'at-risk', 'group', 'one', 'thing', '’', 'clearli', 'risk', 'acut', 'sen', 'guilt', 'clinician', 'incit', 'immedi', 'make', 'consult', 'ten']
Wall time: 1.99 ms


In [174]:
%%time
# Run the spaCy pipeline on the same sample text so its output and wall time can be compared with the NLTK run.
spacy_process(Example_Sentence)

Tokenize+Lemmatize:
['patient', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoke', '20', 'cigarette', 'a', 'day', 'since', '-PRON-', 'teen', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'one', 'thing', '-PRON-', 'be', 'clearly', 'at', 'risk', 'for', 'be', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & punctuation: 
['patient', 'late', 'middle', 'age', 'smoke', '20', 'cigarette', 'day', '-PRON-', 'teen', 'constitute', '-', 'risk', 'group', 'thing', '-PRON-', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'consultation', 'tense']
Wall time: 15 ms


### About spaCy's custom pronoun lemma for English
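spaCy 2.x (the version that produced the outputs in this notebook) adds a special case for English pronouns: all personal pronouns are lemmatized to the special token `-PRON-`. Unlike verbs and common nouns, there’s no clear base form of a personal pronoun. Should the lemma of “me” be “I”, or should we normalize person as well, giving “it” — or maybe “he”? spaCy’s solution is to introduce a novel symbol, `-PRON-`, which is used as the lemma for all personal pronouns. This is why “their” and “they” show up as `-PRON-` in the lemma lists above. Note that spaCy 3.x no longer uses `-PRON-`, so re-running this notebook with a newer spaCy release will produce different lemmas for pronouns.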

# My Own Reference:

In [119]:
%%time
# Import spaCy and load the language library.
# Note: this rebinds the global `nlp` to the medium model ('en_core_web_md'),
# which is a different model from the 'en_core_web_sm' pipeline loaded near the
# top of the notebook; whichever cell ran last decides what `nlp` refers to.
import spacy
nlp = spacy.load('en_core_web_md')

# Create a Doc object from the sample text; later cells iterate over `doc`.
doc = nlp(Example_Sentence)

# Print each token separately: surface text, POS tag, dependency label, lemma.
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

Patients NOUN ROOT patient
who PRON nsubj who
in ADP prep in
late ADJ amod late
middle ADJ amod middle
age NOUN pobj age
have AUX aux have
smoked VERB relcl smoke
20 NUM nummod 20
cigarettes NOUN dobj cigarette
a DET det a
day NOUN npadvmod day
since SCONJ mark since
their DET poss -PRON-
teens NOUN nsubj teen
constitute VERB advcl constitute
an DET det an
at ADP nmod at
- PUNCT punct -
risk NOUN pobj risk
group NOUN dobj group
. PUNCT punct .
One NUM nummod one
thing NOUN npadvmod thing
they PRON nsubj -PRON-
’re VERB csubj be
clearly ADV advmod clearly
at ADP prep at
risk NOUN pobj risk
for ADP prep for
is AUX ROOT be
the DET det the
acute ADJ amod acute
sense NOUN attr sense
of ADP prep of
guilt NOUN pobj guilt
that SCONJ mark that
a DET det a
clinician NOUN nsubj clinician
can VERB aux can
incite VERB relcl incite
, PUNCT punct ,
which DET nsubj which
immediately ADV advmod immediately
makes VERB relcl make
a DET det a
consultation NOUN nsubj consultation
tense ADJ ccomp tense
. PU

In [117]:
# Show the default stop-word set of the loaded spaCy pipeline, then its size.
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

{'wherein', 'used', '’ve', 'call', 'since', 'so', 'upon', 'onto', 'whereby', 'first', 'four', 'herself', 'beforehand', "'d", 'i', 'becoming', 'show', 'even', 'ever', 'same', 'always', 'top', 'and', 'meanwhile', 'been', 'another', 'rather', 'seems', 'that', 'whereas', 'these', 'somewhere', 'only', 'mine', 'amount', 'other', 'you', 'never', 'due', 'keep', 'does', 'be', 'nor', 'seeming', 'he', 'former', 'put', 'others', 'ours', 'were', '’re', 'than', 'although', 'less', 'against', 'afterwards', "'m", 'quite', 'regarding', 'thereafter', 'what', 'toward', 'nine', 'however', 'whose', 'us', 'his', 'fifty', 'thence', 'fifteen', 'eight', 'across', 'move', 'twelve', 'almost', 'thereupon', '’ll', 'me', 'nevertheless', 'up', 'would', 'hereupon', 'whoever', 'during', 'whenever', 'then', '’m', 'below', '’d', 'we', 'did', 'if', 'why', 'here', 'again', 'nowhere', 'seemed', 'amongst', 'further', 'along', 'otherwise', 'somehow', 'she', '‘m', 'but', 'noone', 'one', 'between', 'indeed', 'must', 'neither',

In [97]:
# Tokenizer: gather the surface text of every token in the parsed Doc.
token_listSC = [token.text for token in doc]

print(token_listSC)

['Patients', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoked', '20', 'cigarettes', 'a', 'day', 'since', 'their', 'teens', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'One', 'thing', 'they', '’re', 'clearly', 'at', 'risk', 'for', 'is', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'makes', 'a', 'consultation', 'tense', '.']


In [98]:

# Lemmatizer: gather the lemma of every token in the parsed Doc.
lemma_list = [token.lemma_ for token in doc]

print(lemma_list)

['patient', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoke', '20', 'cigarette', 'a', 'day', 'since', '-PRON-', 'teen', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'one', 'thing', '-PRON-', 'be', 'clearly', 'at', 'risk', 'for', 'be', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']


In [99]:
# Keep only the lemmas whose vocabulary entry is not flagged as a stop word.
filtered_sentence = [
    lemma for lemma in lemma_list
    if not nlp.vocab[lemma].is_stop
]

# Show the list before and after filtering.
print(lemma_list)
print(filtered_sentence)

['patient', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoke', '20', 'cigarette', 'a', 'day', 'since', '-PRON-', 'teen', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'one', 'thing', '-PRON-', 'be', 'clearly', 'at', 'risk', 'for', 'be', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
['patient', 'late', 'middle', 'age', 'smoke', '20', 'cigarette', 'day', '-PRON-', 'teen', 'constitute', '-', 'risk', 'group', '.', 'thing', '-PRON-', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', ',', 'immediately', 'consultation', 'tense', '.']


In [100]:
# Removing punctuation.
# A new list is built instead of calling remove() inside a loop over the same
# list: mutating a list while iterating it makes the iterator skip the element
# that follows each removal, so back-to-back punctuation tokens were not all
# removed.
punctuations = "?:!.,;"
filtered_sentence = [word for word in filtered_sentence if word not in punctuations]

print(filtered_sentence)

['patient', 'late', 'middle', 'age', 'smoke', '20', 'cigarette', 'day', '-PRON-', 'teen', 'constitute', '-', 'risk', 'group', 'thing', '-PRON-', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'consultation', 'tense']


# NLTK

In [136]:
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

In [104]:
# Porter stemmer instance used by the stemming cell below.
p_stemmer = PorterStemmer()
# Snowball stemmer alternative, currently disabled:
#s_stemmer = SnowballStemmer(language='english')

In [137]:
# Split the sample text into word and punctuation tokens with NLTK's word_tokenize.
nltk_tokenList = word_tokenize(Example_Sentence) 


In [141]:
# Stem every token with the Porter stemmer (the Snowball variant stays disabled).
nltk_stemedList = [p_stemmer.stem(token) for token in nltk_tokenList]

print(nltk_stemedList)



['patient', 'who', 'in', 'late', 'middl', 'age', 'have', 'smoke', '20', 'cigarett', 'a', 'day', 'sinc', 'their', 'teen', 'constitut', 'an', 'at-risk', 'group', '.', 'one', 'thing', 'they', '’', 're', 'clearli', 'at', 'risk', 'for', 'is', 'the', 'acut', 'sens', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incit', ',', 'which', 'immedi', 'make', 'a', 'consult', 'tens', '.']


In [142]:
# WordNet-based lemmatizer instance used by the lemmatization cell below.
wordnet_lemmatizer = WordNetLemmatizer()


In [143]:
# Lemmatize each stem produced by the stemming cell.
nltk_lemmaList = [wordnet_lemmatizer.lemmatize(stem) for stem in nltk_stemedList]

print(nltk_lemmaList)

['patient', 'who', 'in', 'late', 'middl', 'age', 'have', 'smoke', '20', 'cigarett', 'a', 'day', 'sinc', 'their', 'teen', 'constitut', 'an', 'at-risk', 'group', '.', 'one', 'thing', 'they', '’', 're', 'clearli', 'at', 'risk', 'for', 'is', 'the', 'acut', 'sen', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incit', ',', 'which', 'immedi', 'make', 'a', 'consult', 'ten', '.']


In [151]:
# English stop words from NLTK, held in a set for fast membership checks.
nltk_stop_words = set(stopwords.words("english"))

# Keep only the lemmas that are not stop words.
filtered_sentence = [lemma for lemma in nltk_lemmaList if lemma not in nltk_stop_words]

print(filtered_sentence)


['patient', 'late', 'middl', 'age', 'smoke', '20', 'cigarett', 'day', 'sinc', 'teen', 'constitut', 'at-risk', 'group', '.', 'one', 'thing', '’', 'clearli', 'risk', 'acut', 'sen', 'guilt', 'clinician', 'incit', ',', 'immedi', 'make', 'consult', 'ten', '.']


In [152]:
# Removing punctuation.
# A new list is built instead of calling remove() inside a loop over the same
# list: mutating a list while iterating it makes the iterator skip the element
# that follows each removal, so back-to-back punctuation tokens were not all
# removed.
punctuations = "?:!.,;"
filtered_sentence = [word for word in filtered_sentence if word not in punctuations]

print(filtered_sentence)



['patient', 'late', 'middl', 'age', 'smoke', '20', 'cigarett', 'day', 'sinc', 'teen', 'constitut', 'at-risk', 'group', 'one', 'thing', '’', 'clearli', 'risk', 'acut', 'sen', 'guilt', 'clinician', 'incit', 'immedi', 'make', 'consult', 'ten']


# SK-Learn

In [33]:
from sklearn.feature_extraction import text
# Alias scikit-learn's built-in English stop-word collection for the cells below.
stopwordsSK = text.ENGLISH_STOP_WORDS
# Number of entries in the collection (shown as the cell's output).
len(stopwordsSK)

318

In [34]:
# sorted() accepts any iterable and returns a new list, so no list() wrapper is needed.
print(sorted(stopwordsSK))

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give

In [35]:
# Tokenize the sample text with NLTK.
word_tokens = word_tokenize(Example_Sentence)

# Drop scikit-learn stop words. The stop-word list is all lowercase, so each
# token is lowercased for the membership test only; otherwise capitalised stop
# words (e.g. a sentence-initial "One") would never be filtered out. The
# tokens that are kept retain their original casing.
filtered_sentence = [w for w in word_tokens if w.lower() not in stopwordsSK]

print(word_tokens)
print(filtered_sentence)

['Tesla', 'is', 'looking', 'at', 'buying', 'U.S.', 'startup', 'for', '$', '6', 'million', '.']
['Tesla', 'looking', 'buying', 'U.S.', 'startup', '$', '6', 'million', '.']
