# NLTK features and Spelling Recommender

Use nltk to explore the Herman Melville novel Moby Dick and create Spelling Recommenders.

## Setup

In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
# Download the NLTK 'book' collection (the data that `nltk.book` loads below).
# quiet=True suppresses the downloader's progress output.
nltk.download('book', quiet=True)

True

In [3]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## Explore NLTK Features

In [4]:
# text1 (loaded by nltk.book) is Moby Dick, already word-tokenized, as an nltk.text.Text object
moby = text1
type(moby)

nltk.text.Text

In [5]:
# Total number of tokens in the text; punctuation symbols count as tokens alongside words
len(moby)

260819

In [6]:
# Number of distinct tokens (words and punctuation); the comparison is case-sensitive,
# so 'Whale' and 'whale' are counted as two different tokens
len(set(moby))

19317

In [7]:
# Reduce every token to its WordNet base form, treating each token as a verb (pos='v')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lambda token: lemmatizer.lemmatize(token, pos='v'), moby))

# Number of distinct tokens left once inflected verb forms have been collapsed
len(set(lemmatized))

15363

In [8]:
# Lexical diversity of the text: the ratio of distinct tokens to the total number of tokens
# (a lower value means the same tokens are repeated more often)
len(set(moby)) / len(moby)

0.07406285585022564

In [9]:
# Percentage of all tokens that are exactly 'whale' or 'Whale'
# (other forms, such as 'whales', are not included in the count)
((moby.count('whale') + moby.count('Whale')) / len(moby)) * 100

0.455488288813315

In [10]:
# The 20 most common distinct tokens paired with their counts, most frequent first
from nltk import FreqDist

mobyDist = FreqDist(moby)
# With no argument, most_common() returns every (token, count) pair ordered by descending count
sorted_x = mobyDist.most_common()
sorted_x[:20]

[(',', 18713),
 ('the', 13721),
 ('.', 6862),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4072),
 ('in', 3916),
 ('that', 2982),
 ("'", 2684),
 ('-', 2552),
 ('his', 2459),
 ('it', 2209),
 ('I', 2124),
 ('s', 1739),
 ('is', 1695),
 ('he', 1661),
 ('with', 1659),
 ('was', 1632)]

In [11]:
# Tokens longer than 5 characters that occur more than 150 times,
# shown in sorted order (capitalized tokens sort before lowercase ones)
freqWords = [token for token, count in mobyDist.items() if len(token) > 5 and count > 150]
sorted(freqWords)

['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales',
 'without']

In [12]:
# Longest token in the text together with its length.
# max() keeps the first token of maximal length, and default='' covers an empty text.
longestWord = max(moby, key=len, default='')
(longestWord, len(longestWord))

('uninterpenetratingly', 20)

In [13]:
# Purely alphabetic tokens that occur more than 2000 times, as (count, token) pairs,
# ordered from most to least frequent
freqWords = [(count, token) for token, count in mobyDist.items() if token.isalpha() and count > 2000]
sorted(freqWords, key=lambda pair: pair[0], reverse=True)

[(13721, 'the'),
 (6536, 'of'),
 (6024, 'and'),
 (4569, 'a'),
 (4542, 'to'),
 (3916, 'in'),
 (2982, 'that'),
 (2459, 'his'),
 (2209, 'it'),
 (2124, 'I')]

In [14]:
# Tag every token with its part of speech, then tally how often each tag occurs
posTagList = nltk.pos_tag(moby)

result = {}
for token, tag in posTagList:
    result[tag] = result.get(tag, 0) + 1

# The five most frequent tags with their counts, most common first
sorted_x = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
sorted_x[:5]

[('NN', 36099), ('IN', 28964), ('DT', 25826), ('JJ', 18811), (',', 18713)]

## Spelling Recommender

Create different spelling recommenders, each of which takes a list of misspelled words and recommends a correctly spelled word for each one.

For every misspelled word, the recommender will find the word in `correct_spellings` that has the shortest distance to it (using one of the distance measures listed below) and starts with the same letter as the misspelled word, and return that word as a recommendation.

The recommenders use different distance measures, based on [1] and [2]:

[1] [Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index)

[2] [Edit distance on the two words with transpositions.](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)

In [15]:
# Reference vocabulary searched by the recommenders below: the word list of the NLTK `words` corpus
from nltk.corpus import words
correct_spellings = words.words()

In [16]:
# Jaccard distance on the trigrams of the two words.
def jaccard_trigram_spellrec(entries):
    """Recommend a spelling for each entry using Jaccard distance on character trigrams.

    For every word in `entries`, the candidates are the words in
    `correct_spellings` that start with the same letter. The candidate whose
    trigram set has the smallest Jaccard distance to the entry's trigram set is
    recommended; on ties the candidate that appears first in
    `correct_spellings` wins.

    Parameters
    ----------
    entries : iterable of str
        The (possibly misspelled) words to look up.

    Returns
    -------
    list
        One recommendation per entry, in the same order. An item is None when
        no candidate shares at least one trigram with the entry (this includes
        entries shorter than 3 characters, which have no trigrams at all).
    """
    result = []
    for word in entries:
        # Built once per entry rather than once per candidate.
        word_trigrams = set(nltk.ngrams(word, n=3))
        # Only a distance strictly below 1 (at least one shared trigram) counts as a match.
        best_word, best_distance = None, 1
        # An entry without trigrams can never score below 1, and comparing it with an
        # equally short candidate would make nltk.jaccard_distance divide by zero
        # (both sets empty), so such entries are not searched at all.
        if word_trigrams:
            first_letter = word[0]
            for item in correct_spellings:
                if not item.startswith(first_letter):
                    continue
                distance = nltk.jaccard_distance(word_trigrams, set(nltk.ngrams(item, n=3)))
                if distance < best_distance:
                    best_word, best_distance = item, distance
        result.append(best_word)
    return result

jaccard_trigram_spellrec(['cormulent', 'incendenece', 'validrate'])

['corpulent', 'indecence', 'validate']

In [17]:
# Jaccard distance on the 4-grams of the two words.
def jaccard_4gram_spellrec(entries):
    """Recommend a spelling for each entry using Jaccard distance on character 4-grams.

    For every word in `entries`, the candidates are the words in
    `correct_spellings` that start with the same letter. The candidate whose
    4-gram set has the smallest Jaccard distance to the entry's 4-gram set is
    recommended; on ties the candidate that appears first in
    `correct_spellings` wins.

    Parameters
    ----------
    entries : iterable of str
        The (possibly misspelled) words to look up.

    Returns
    -------
    list
        One recommendation per entry, in the same order. An item is None when
        no candidate shares at least one 4-gram with the entry (this includes
        entries shorter than 4 characters, which have no 4-grams at all).
    """
    result = []
    for word in entries:
        # Built once per entry rather than once per candidate.
        word_4grams = set(nltk.ngrams(word, n=4))
        # Only a distance strictly below 1 (at least one shared 4-gram) counts as a match.
        best_word, best_distance = None, 1
        # An entry without 4-grams can never score below 1, and comparing it with an
        # equally short candidate would make nltk.jaccard_distance divide by zero
        # (both sets empty), so such entries are not searched at all.
        if word_4grams:
            first_letter = word[0]
            for item in correct_spellings:
                if not item.startswith(first_letter):
                    continue
                distance = nltk.jaccard_distance(word_4grams, set(nltk.ngrams(item, n=4)))
                if distance < best_distance:
                    best_word, best_distance = item, distance
        result.append(best_word)
    return result

jaccard_4gram_spellrec(['cormulent', 'incendenece', 'validrate'])

['cormus', 'incendiary', 'valid']

In [18]:
# Edit distance on the two words with transpositions.
def edit_distance_spellrec(entries=['cormulent', 'incendenece', 'validrate']):
    """Recommend a spelling for each entry using edit distance with transpositions.

    For every word in `entries`, the candidates are the words in
    `correct_spellings` that start with the same letter. The candidate with the
    smallest edit distance to the entry is recommended, where swapping two
    adjacent characters counts as a single edit; on ties the candidate that
    appears first in `correct_spellings` wins.

    Parameters
    ----------
    entries : iterable of str
        The (possibly misspelled) words to look up. The default list is only
        read, never modified.

    Returns
    -------
    list
        One recommendation per entry, in the same order. An item is None when
        the entry is empty or no candidate starts with the same letter.
    """
    result = []
    for word in entries:
        best_word, best_distance = None, None
        # An empty entry has no first letter to match candidates against.
        if word:
            first_letter = word[0]
            for item in correct_spellings:
                if not item.startswith(first_letter):
                    continue
                # transpositions=True must be requested explicitly; nltk.edit_distance
                # computes plain Levenshtein distance by default.
                distance = nltk.edit_distance(word, item, transpositions=True)
                if best_distance is None or distance < best_distance:
                    best_word, best_distance = item, distance
        result.append(best_word)
    return result

edit_distance_spellrec()

['corpulent', 'intendence', 'validate']