### Tokenization and Lemmatization/Stemming in Python
 
- The goal of this notebook is to demonstrate the word-stemming and lemmatization capabilities of the NLTK and spaCy packages.


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import porter, WordNetLemmatizer


In [None]:
# Load the custom stop-word list (one entry per line).
# NOTE(review): this is an absolute, machine-specific path and will not resolve on other hosts.
with open('/Users/nmiles/PACMan_dist/libs/stopwords.txt', 'r') as test_file:
    text = test_file.readlines()
# Drop the newline characters surrounding each entry.
stop_words = [line.strip('\n') for line in text]


In [None]:
# Fetch the NLTK data used below: the WordNet corpus, the Punkt tokenizer
# models and the averaged-perceptron POS tagger.
for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

Read in some example text to play with.

In [None]:
# Read the example document line by line, dropping the newline characters.
with open('./0896.pdf.txtx', 'r') as test_file:
    text = [line.strip('\n') for line in test_file.readlines()]
# text = ' '.join(text) 

In [None]:
# Strip newline characters from every line (a no-op when the loading cell has already done so).
text = [line.strip('\n') for line in text]
# text = ' '.join(text) 

In [None]:
# Inspect the newline-stripped lines read from the example file.
text

In [None]:
# Keep only the text before the first space of every non-empty line.
lexicon = [line.partition(' ')[0] for line in text if line != '']

In [None]:
# Peek at the first ten entries to sanity-check the extraction.
for entry in lexicon[:10]:
    print(entry)

In [None]:
# Tokenize every non-empty entry; each element of `lexicon` becomes a list of tokens.
# NOTE: this rebinds `lexicon` from itself, so the cell should be run once per fresh `lexicon`.
lexicon = [word_tokenize(entry) for entry in lexicon if entry]

In [None]:
def nltk2wn_tag(nltk_tag):
    """Map an NLTK part-of-speech tag onto the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb. Tags starting with ``N`` and every
    unrecognised tag map to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    # Nouns and anything unclear share the same default.
    return wordnet.NOUN

In [None]:
# Determine the part-of-speech tag for the leading token of every lexicon entry
# and convert it from the NLTK tag set to its WordNet equivalent.
final_lexicon = []
for lex in lexicon:
    # Tokenization can yield an empty list (e.g. for an entry holding only a
    # form feed or carriage return); indexing [0] below would raise IndexError.
    if not lex:
        continue
    tagged = nltk.pos_tag(lex)
    # Only the first token of each entry is kept, paired with its WordNet POS.
    final_lexicon.append((lex[0], nltk2wn_tag(tagged[0][1])))

In [None]:
# Spot-check the first (token, WordNet POS) pair.
final_lexicon[0]

In [None]:
# Instantiate NLTK's WordNet lemmatizer; it is called below with each token's WordNet POS tag.
lemmatizer = WordNetLemmatizer()

In [None]:
# Instantiate NLTK's Porter stemmer so its stems can be compared against the lemmas.
stemmer = porter.PorterStemmer()

In [None]:
# Show the lemma and the stem side by side for the first ten entries.
for lex in final_lexicon[:10]:
    token, wn_pos = lex
    lemma = lemmatizer.lemmatize(token, pos=wn_pos)
    stem = stemmer.stem(token)
    print(f'Word: {lex} \nLemma: {lemma}\nStem: {stem}\n')

<hr>

Perform the same steps using a class-based approach with spaCy.

- spaCy is different in that it prefers to receive the abstract in a single chunk 

In [None]:
import spacy
from spacy.lang.en import English
import string

In [None]:
# Load spaCy's "en_core_web_sm" English pipeline (the model package must be installed).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Number of stop words in spaCy's default set for this pipeline.
print(len(nlp.Defaults.stop_words))

In [None]:
# Number of entries in the custom stop-word list read from stopwords.txt.
print(len(stop_words))

In [None]:
# Build sets from both stop-word collections so they can be compared with set operations.
spacy_stop = set(nlp.Defaults.stop_words)
custom_stop = set(stop_words)


In [None]:
# Custom stop words that are absent from spaCy's default set.
missing_stop_words = custom_stop - spacy_stop
print(len(missing_stop_words))

In [None]:
# Merge the extra custom entries into spaCy's default stop-word set, in place.
# NOTE(review): lexemes already cached in the vocab may keep their previous
# is_stop flag — confirm against the installed spaCy version.
nlp.Defaults.stop_words.update(missing_stop_words)

In [None]:
# Stop-word count after the custom entries have been merged in.
print(len(nlp.Defaults.stop_words))

In [None]:
# Re-read the example document, this time as one space-joined string.
with open('./0896.pdf.txtx', 'r') as test_file:
    raw_lines = test_file.readlines()
text = ' '.join(raw_line.strip('\n') for raw_line in raw_lines)

In [None]:
# Run the loaded spaCy pipeline over the joined text and report the length of the result.
abstract = nlp(text)
print(len(abstract))

In [None]:
# Exploratory: list the attributes available on the result and show its second element.
print(dir(abstract))
print(abstract[1])

In [None]:
# Containers for the tokens that survive stop-word removal and for those flagged as stop words.
trim_stop_words, autogen_stop_words = [], []

In [None]:
# Split the tokens into stop words and everything else.
# Both lists are rebuilt from scratch here so that re-running this cell does
# not append duplicates and distort the kept-token ratio computed afterwards.
autogen_stop_words = [token for token in abstract if token.is_stop]
trim_stop_words = [token for token in abstract if not token.is_stop]

In [None]:
# Fraction of tokens that remain once stop words are removed.
kept_fraction = len(trim_stop_words) / len(abstract)
print(kept_fraction)

In [None]:
# ASCII punctuation characters to filter out (string.punctuation is a single str, not a list)
punctuations = string.punctuation

# Instantiate the bare English language class.
# NOTE(review): unlike spacy.load(...), English() does not appear to load a trained
# tagger, parser, NER or word vectors — presumably tokenizer only; confirm against the spaCy docs.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(text, return_type='str'):
    """Tokenize ``text`` into lower-cased lemmas, dropping stop words and punctuation.

    Relies on the module-level ``parser`` (tokenization), ``stop_words``
    (custom stop-word collection) and ``punctuations`` (punctuation characters).

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    return_type : str, optional
        Currently unused; retained so existing callers keep working.

    Returns
    -------
    list of str
        The surviving lemmas, in document order. Also prints the share of
        input tokens that survived the filtering.
    """
    doc = parser(text)
    num_tokens = len(doc)

    # Lemmatize each token and standardize the capitalization to lower case.
    lemmas = []
    for word in doc:
        lemma = word.lemma_
        if lemma == "-PRON-":
            # Pronoun placeholder: use the lower-cased surface form instead.
            lemmas.append(word.lower_)
        elif not lemma:
            # No lemma available (presumably a pipeline without a lemmatizer —
            # confirm for the installed spaCy version): fall back to the
            # surface form rather than silently losing the token.
            lemmas.append(word.lower_.strip())
        else:
            lemmas.append(lemma.lower().strip())

    # Set lookup avoids scanning the whole stop-word collection per token.
    stop_set = set(stop_words)
    # A token counts as punctuation when it is empty or consists solely of
    # punctuation characters; this also catches runs such as "..." or "--",
    # which a plain substring test against the punctuation string misses.
    mytokens = [
        token for token in lemmas
        if token not in stop_set
        and not all(char in punctuations for char in token)
    ]

    # Guard against empty input and format the ratio as a true percentage.
    kept_fraction = len(mytokens) / num_tokens if num_tokens else 0.0
    print(f"Processed text represents {kept_fraction:.2%} of the input text")

    return mytokens

In [None]:
# Tokenize the joined example text with the custom tokenizer defined above.
tokens = spacy_tokenizer(text)