In [None]:
"""
Michael Ramsey
Date Created: 2/4/19
Last Edited: 2/4/19

This is a jupyter notebook to practice NLP. The work performed here is from the book:
    "Applied Natural Language Processing with Python : 
        Implementing Machine Learning and Deep Learning Algorithms for Natural Language Processing"
"""

In [25]:
"""
Load necessary packages
"""
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [28]:
"""
Tokenization and Stop Words
"""

# Create a string for the sample text
sample_text = "I am a student from the University of Alabama. I \
was born in Ontario, Canada and I am a huge fan of the United States. \
I am going to get a degree in Philosophy to improve \
my chances of becoming a Philosophy professor. I have been \
working towards this goal for 4 years. I am currently enrolled \
in a PhD program. It is very difficult, but I am confident that \
it will be a good decision"

# Tokenize by word
sample_word_tokens = word_tokenize(sample_text)

# Tokenize by sentence
sample_sent_tokens = sent_tokenize(sample_text)

# Generate stop words in uppercase so the comparison below is case-insensitive
stop_words = [word.upper() for word in stopwords.words('english')]

# Set copy of the stop words: membership tests in the filter below are
# constant-time instead of scanning the whole list for every token
stop_word_lookup = set(stop_words)

# Extract the word tokens that are not stop words
content_tokens = [word for word in sample_word_tokens if word.upper() not in stop_word_lookup]

# Remove grammatical characters. The regex is applied to each token on its own
# rather than to str(list_of_tokens): the string form of a list is a repr, so
# escape sequences it may contain would be picked up as spurious tokens.
tokenizer = RegexpTokenizer(r'\w+')
word_tokens = [
    piece.lower()
    for token in content_tokens
    for piece in tokenizer.tokenize(token)
]

In [32]:
"""
The Bag-of-Words Model (BoW)
"""

# Create function for the bag of words model
def bag_of_words(word_list):
    """
    Count how many times each word appears in a list of words

    INPUTS:
    word_list: A list of strings (words)

    OUTPUTS:
    word_dict: A dictionary whose keys are the unique words and whose
               values are the number of times each word occurs
    """

    # Running tally of occurrences, keyed by word
    tally = {}

    # A word seen for the first time starts counting from zero
    for token in word_list:
        tally[token] = tally.get(token, 0) + 1

    return tally

# Apply the bag-of-words model to the cleaned tokens: maps each unique token
# to the number of times it occurs
word_tokens_bow = bag_of_words(word_tokens)
# Bare last expression so the notebook displays the resulting dictionary
word_tokens_bow

{'student': 1,
 'university': 1,
 'alabama': 1,
 'born': 1,
 'ontario': 1,
 'canada': 1,
 'huge': 1,
 'fan': 1,
 'united': 1,
 'states': 1,
 'going': 1,
 'get': 1,
 'degree': 1,
 'philosophy': 2,
 'improve': 1,
 'chances': 1,
 'becoming': 1,
 'professor': 1,
 'working': 1,
 'towards': 1,
 'goal': 1,
 '4': 1,
 'years': 1,
 'currently': 1,
 'enrolled': 1,
 'phd': 1,
 'program': 1,
 'difficult': 1,
 'confident': 1,
 'good': 1,
 'decision': 1}

In [None]:
# Implemen