# Import Libraries

In [193]:
import numpy as np
import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.util import trigrams
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import ssl

In [177]:
# NOTE(security): the block below swaps ssl's default HTTPS context factory for
# the unverified one, so code in this kernel that relies on that default stops
# checking TLS certificates. It is kept as a best-effort workaround for
# certificate errors during nltk.download; prefer fixing the local certificate
# store and deleting this block.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl build has no unverified-context factory; leave the default alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Every NLTK resource the notebook reads. 'stopwords' is required by the
# stop-word removal cells and was previously never downloaded, so a fresh
# environment failed there with a LookupError.
NLTK_RESOURCES = (
    'vader_lexicon',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'stopwords',
)
for _resource in NLTK_RESOURCES:
    nltk.download(_resource)

# Small English spaCy pipeline; it must already be installed in the environment.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lenguyen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lenguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lenguyen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lenguyen/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Import Data

Project Gutenberg: https://www.gutenberg.org/


We load in the texts Alice in Wonderland, Moby Dick, and Romeo and Juliet

In [22]:
def loadTextLines(path):
    """Read a plain-text file into a 1-D NumPy string array, one entry per line.

    np.loadtxt is a numeric-table parser and is a poor fit for prose: by default
    it treats '#' as the start of a comment (silently cutting lines such as
    "[eBook #11]"), and newer NumPy releases reject a newline delimiter
    (NOTE(review): believed to be 1.23+ - confirm against the release notes).
    Reading the file directly avoids both problems.

    Blank lines are dropped, as np.loadtxt also skipped them, so line indices
    stay comparable with the earlier approach.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    numpy.ndarray
        Array of str with one element per non-blank line, line endings removed.
    """
    # 'utf-8-sig' also strips a leading byte-order mark if one is present.
    # Assumes the downloaded texts are UTF-8 encoded - TODO confirm.
    with open(path, encoding="utf-8-sig") as textFile:
        lines = [line.rstrip("\r\n") for line in textFile]
    return np.array([line for line in lines if line.strip()], dtype=str)


aliceData = loadTextLines('AliceInWonderland.txt')
mobyData = loadTextLines('MobyDick.txt')
romeoData = loadTextLines('RomeoAndJuliet.txt')

In [25]:
# Peek at ten consecutive lines (indices 55-64) of the loaded Alice text
aliceData[55:65]

array(['large rabbit-hole under the hedge.',
       'In another moment down went Alice after it, never once considering how',
       'in the world she was to get out again.',
       'The rabbit-hole went straight on like a tunnel for some way, and then',
       'dipped suddenly down, so suddenly that Alice had not a moment to think',
       'about stopping herself before she found herself falling down a very',
       'deep well.',
       'Either the well was very deep, or she fell very slowly, for she had',
       'plenty of time as she went down to look about her and to wonder what',
       'was going to happen next. First, she tried to look down and make out'],
      dtype='<U82')

# Clean Data

## Tokenize

First we must tokenize the data: each line of text is one long string, and we split it into individual words, which become our tokens.

In [96]:
def tokenizeLines(lines):
    """Turn each line into a list of tokens.

    Only '.' and ',' are removed; any other punctuation stays attached to its
    word. Lines are split on single spaces, so consecutive spaces produce
    empty-string tokens.
    """
    tokenized = []
    for line in lines:
        withoutPunctuation = line.replace('.', '').replace(',', '')
        tokenized.append(withoutPunctuation.split(" "))
    return tokenized


tokenAlice = tokenizeLines(aliceData)
tokenMoby = tokenizeLines(mobyData)
tokenRomeo = tokenizeLines(romeoData)

In [97]:
# Tokens produced for line 55 of Alice, as a quick check of the tokenizer
tokenAlice[55]

['large', 'rabbit-hole', 'under', 'the', 'hedge']

## Stop Word Removal

Next, we remove stop words from our sentences: very common words such as "is", "and", and "the", which will not be helpful in our analysis.

In [79]:
# Display NLTK's English stop-word list (the words filtered out in the next step)
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [80]:
# Build the stop-word lookup once. The earlier version called
# stopwords.words("english") for every single token and searched the resulting
# list linearly; a set built up front gives the same filtering with
# constant-time membership tests.
englishStopWords = set(stopwords.words("english"))


def removeStopWords(sentences, stopWords=englishStopWords):
    """Lower-case every token and drop those found in ``stopWords``.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenized lines.
    stopWords : collection of str, optional
        Lower-case words to discard; defaults to NLTK's English list.

    Returns
    -------
    list of list of str
        One list per input line (possibly empty), tokens lower-cased.
    """
    cleaned = []
    for sentence in sentences:
        lowered = (token.lower() for token in sentence)
        cleaned.append([token for token in lowered if token not in stopWords])
    return cleaned


cleanAlice = removeStopWords(tokenAlice)
cleanMoby = removeStopWords(tokenMoby)
cleanRomeo = removeStopWords(tokenRomeo)
    

In [81]:
# Same ten Alice lines (indices 55-64) after lower-casing and stop-word removal
cleanAlice[55:65]

[['large', 'rabbit-hole', 'hedge'],
 ['another', 'moment', 'went', 'alice', 'never', 'considering'],
 ['world', 'get'],
 ['rabbit-hole', 'went', 'straight', 'like', 'tunnel', 'way'],
 ['dipped', 'suddenly', 'suddenly', 'alice', 'moment', 'think'],
 ['stopping', 'found', 'falling'],
 ['deep', 'well'],
 ['either', 'well', 'deep', 'fell', 'slowly'],
 ['plenty', 'time', 'went', 'look', 'wonder'],
 ['going', 'happen', 'next', 'first', 'tried', 'look', 'make']]

## Lemmatization and Stemming

Lemmatization and stemming both reduce inflected words to a base form. Stemming is the more brute-force method: it chops off word endings by rule and can leave a non-word (e.g. "studies" becomes "studi"), while lemmatization replaces the word with its dictionary root ("studies" becomes "study").

In [82]:
# Shared lemmatizer instance; the stemmer is only needed for this comparison.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Run the same word through both reducers so the outputs can be compared.
stemmed = stemmer.stem("studies")
lemmatized = lemmatizer.lemmatize("studies")

print("Stemming:")
print("studies:", stemmed + "\n")
print("Lemmatization")
print("studies:", lemmatized)

Stemming:
studies: studi

Lemmatization
studies: study


In [83]:
# Lemmatize every token of every cleaned line, keeping the line-by-line nesting.
# lemmatize() is called without a part-of-speech argument.
lemmaAlice = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in cleanAlice]
lemmaMoby = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in cleanMoby]
lemmaRomeo = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in cleanRomeo]
    

# Analysis

## Term Frequency

In [115]:
def getWordCount(words):
    """Count how often each word occurs and rank the words by frequency.

    Parameters
    ----------
    words : iterable of str
        Flat sequence of tokens.

    Returns
    -------
    list of (word, count) tuples
        Sorted by count, highest first; words with equal counts keep the order
        in which they were first seen. The empty-string token is excluded.
    """
    word_count = {}
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1
    # Empty tokens are tokenizer artefacts. Supplying a default keeps this from
    # raising KeyError when the input happens to contain none.
    word_count.pop('', None)
    return sorted(word_count.items(), key=lambda x: x[1], reverse=True)

In [136]:
def flattenSentences(sentences):
    """Join a list of token lists into one flat list of tokens, order preserved."""
    return [word for sentence in sentences for word in sentence]


# Flatten in plain Python instead of np.concatenate. Lines left empty by
# stop-word removal become empty float arrays under NumPy, so concatenating
# them with string arrays depends on float-to-string dtype promotion; the
# counted keys also come back as NumPy string scalars rather than str.
tfAlice = getWordCount(flattenSentences(lemmaAlice))
tfMoby = getWordCount(flattenSentences(lemmaMoby))
tfRomeo = getWordCount(flattenSentences(lemmaRomeo))

In [150]:
# Ten most frequent tokens in Alice in Wonderland
tfAlice[0:10]

[('said', 450),
 ('alice', 351),
 ('little', 124),
 ('“i', 119),
 ('one', 87),
 ('work', 85),
 ('project', 83),
 ('went', 83),
 ('like', 77),
 ('could', 75)]

In [151]:
# Ten most frequent tokens in Moby Dick
tfMoby[0:10]

[('whale', 1067),
 ('one', 883),
 ('like', 569),
 ('upon', 564),
 ('old', 438),
 ('would', 424),
 ('ship', 417),
 ('time', 412),
 ('sea', 397),
 ('ahab', 371)]

In [152]:
# Ten most frequent tokens in Romeo and Juliet
tfRomeo[0:10]

[('romeo', 281),
 ('thou', 275),
 ('juliet', 175),
 ('thy', 170),
 ('capulet', 139),
 ('nurse', 134),
 ('thee', 130),
 ('love', 120),
 ('come', 120),
 ('shall', 112)]

## Part of Speech Tagging

Look up all part of speech tags in NLTK: https://www.guru99.com/pos-tagging-chunking-nltk.html

In [186]:
# Tag the distinct Moby Dick words in frequency order. The tagger receives the
# ranked vocabulary as one sequence, not the original sentences.
mobyTags = nltk.pos_tag([word for word, _count in tfMoby])

In [191]:
# Keep only words tagged "VB" and show the first ten; the order is the
# frequency ranking inherited from tfMoby, no extra sorting happens here
[(word, tag) for word, tag in mobyTags if tag == "VB"][:10]

[('ship', 'VB'),
 ('little', 'VB'),
 ('say', 'VB'),
 ('u', 'VB'),
 ('thought', 'VB'),
 ('let', 'VB'),
 ('tell', 'VB'),
 ('mate', 'VB'),
 ('keep', 'VB'),
 ('talk', 'VB')]

## Embeddings



In [195]:
# Token-count vectorizer created with scikit-learn's default settings
vectorizer = CountVectorizer()

In [198]:
# NOTE(review): no visible cell defines `x` (execution counts jump from 195 to
# 198), so this cell fails on a fresh kernel. It is reconstructed here as a
# document-term count matrix with one row per lemmatized Alice line - confirm
# that this matches the cell that was removed.
x = vectorizer.fit_transform([" ".join(tokens) for tokens in lemmaAlice])

# Dense count vector for the first line
x.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])