## Text Analysis

In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy as sp
import re
import requests

import nltk

from nltk.corpus import webtext

from nose.tools import (
    assert_equal,
    assert_is_instance,
    assert_almost_equal,
    assert_true,
    assert_false
    )

### Tokenize the text

In [92]:
def tokenize(corpus, fileID):
    '''
    Lower-cases the tokens of one corpus file and discards punctuation tokens.

    Every token returned by corpus.words(fileID) is converted to lower case.
    A token is then dropped when its first character is neither a word
    character (letter, digit or underscore) nor whitespace. Only the first
    character is inspected; the remainder of the token is not checked.

    Parameters
    ----------
    corpus: An NLTK corpus (any object exposing a words(fileID) method)
    fileID: A string

    Returns
    -------
    words: a list of strings
    '''

    starts_with_punctuation = re.compile(r'[^\w\s]').match
    words = []
    for token in corpus.words(fileID):
        lowered = token.lower()
        # match() is anchored at the start of the string, so this looks at
        # the first character of the token only.
        if not starts_with_punctuation(lowered):
            words.append(lowered)
    return words

In [97]:
# --- Monty Python and the Holy Grail ---
monty = tokenize(webtext, 'grail.txt')
assert_is_instance(monty, list)
assert_equal(len(monty), 11602)

# Each token is a string, has no upper-case character,
# and contains at least one alphanumeric character.
assert_true(all(isinstance(token, str) for token in monty))
assert_false(any(char.isupper() for token in monty for char in token))
assert_true(all(any(char.isalnum() for char in token) for token in monty))

assert_equal(monty[8:13], ['whoa', 'there', 'clop', 'clop', 'clop'])
assert_equal(
    monty[20:45],
    [
        'it', 'is', 'i', 'arthur', 'son', 'of', 'uther', 'pendragon',
        'from', 'the', 'castle', 'of', 'camelot', 'king', 'of', 'the',
        'britons', 'defeator', 'of', 'the', 'saxons', 'sovereign', 'of', 'all', 'england',
    ]
)

# --- Pirates of the Caribbean ---
pirates = tokenize(webtext, 'pirates.txt')
assert_is_instance(pirates, list)
assert_equal(len(pirates), 17143)

assert_true(all(isinstance(token, str) for token in pirates))
assert_false(any(char.isupper() for token in pirates for char in token))
assert_true(all(any(char.isalnum() for char in token) for token in pirates))

assert_equal(
    pirates[100:110],
    ['the', 'left', 'in', 'the', 'barn', 'where', 'the', 'marines', 'enter', 'liz']
)
assert_equal(
    pirates[-10:],
    ['left', 'shoulder', 'faces', 'the', 'camera', 'and', 'snarls', 'scene', 'end', 'credits']
)

### Count words

In [98]:
def count_words(word_ls):
    '''
    Computes the number of distinct strings, the total number of strings,
    and the ratio between the two for a list of words.

    Note on naming: the first element of the returned tuple ("num_tokens")
    is the number of *distinct* strings, and the second ("num_words") is the
    total length of the list. This order is the function's contract.

    Parameters
    ----------
    word_ls: A list of strings.

    Returns
    -------
    A 3-tuple of (num_tokens, num_words, lex_div)
    num_tokens: An int. The number of distinct strings in "word_ls".
    num_words: An int. The total number of strings in "word_ls".
    lex_div: A float. num_words / num_tokens, i.e. how many times each
             distinct string occurs on average. 0.0 if "word_ls" is empty.
    '''

    num_words = len(word_ls)
    # Only the number of distinct entries is needed, so a set is enough;
    # there is no need to build a full frequency distribution.
    num_tokens = len(set(word_ls))

    # An empty list has no distinct entries; return 0.0 rather than
    # dividing by zero.
    if num_tokens == 0:
        return num_tokens, num_words, 0.0

    lex_div = num_words / num_tokens
    return num_tokens, num_words, lex_div

In [105]:
# --- Monty Python and the Holy Grail ---
monty_tokens, monty_words, mld = count_words(monty)
assert_true(all(isinstance(count, int) for count in (monty_tokens, monty_words)))
assert_is_instance(mld, float)
assert_equal((monty_tokens, monty_words), (1823, 11602))
assert_almost_equal(mld, 6.364234777838727)

# --- Pirates of the Caribbean ---
pirate_tokens, pirate_words, pld = count_words(pirates)
assert_true(all(isinstance(count, int) for count in (pirate_tokens, pirate_words)))
assert_is_instance(pld, float)
assert_equal((pirate_tokens, pirate_words), (2731, 17143))
assert_almost_equal(pld, 6.277187843280849)

### Most common words

In [113]:
def most_common(words, num_top_words):
    '''
    Builds a frequency distribution of "words" and returns its
    "num_top_words" most frequent entries together with their counts.

    Parameters
    ----------
    words: A list of strings (e.g. the output of tokenize)
    num_top_words:  An int. How many (word, count) tuples to return.

    Returns
    -------
    top_words:  A list of tuples, each holding a word and the number of
                times it occurs, as produced by FreqDist.most_common.
    '''

    return nltk.FreqDist(words).most_common(num_top_words)

In [114]:
# --- Pirates of the Caribbean ---
yarr = most_common(pirates, 5)

assert_is_instance(yarr, list)
assert_true(all(isinstance(pair, tuple) for pair in yarr))
# Each tuple is (word: str, count: int).
assert_true(all(isinstance(word, str) and isinstance(count, int) for word, count in yarr))

# The requested number of entries is honoured.
assert_equal([len(most_common(pirates, n)) for n in (10, 20)], [10, 20])
assert_equal(
    yarr,
    [('the', 1073), ('jack', 470), ('a', 434), ('to', 372), ('of', 285)]
)

# --- Monty Python and the Holy Grail ---
shrubbery = most_common(monty, 5)

assert_is_instance(shrubbery, list)
assert_true(all(isinstance(pair, tuple) for pair in shrubbery))
assert_true(all(isinstance(word, str) and isinstance(count, int) for word, count in shrubbery))

assert_equal([len(most_common(monty, n)) for n in (15, 37)], [15, 37])
assert_equal(
    shrubbery,
    [('the', 334), ('you', 265), ('arthur', 261), ('i', 260), ('a', 238)]
)

### Hapaxes

In [116]:
def hapax(words):
    '''
    Returns the hapaxes of "words", as reported by the hapaxes() method of
    an NLTK frequency distribution built from the list.

    Parameters
    ----------
    words: A list of strings

    Returns
    -------
    hapax: A list of strings

    '''

    return nltk.FreqDist(words).hapaxes()

In [120]:
# --- Monty Python and the Holy Grail ---
assert_is_instance(hapax(monty), list)
assert_false(any(not isinstance(word, str) for word in hapax(monty)))
assert_equal(len(hapax(monty)), 977)
assert_equal(
    sorted(hapax(monty))[-5:],
    ['zhiv', 'zone', 'zoo', 'zoop', 'zoosh']
)

# --- Pirates of the Caribbean ---
assert_is_instance(hapax(pirates), list)
assert_false(any(not isinstance(word, str) for word in hapax(pirates)))
assert_equal(len(hapax(pirates)), 1433)
assert_equal(
    sorted(hapax(pirates))[-5:],
    ['yeah', 'yep', 'yours', 'yourselves', 'zooming']
)

### Long words

In [121]:
def long_words(words, length=10):
    '''
    Collects every entry of "words" that has strictly more than "length"
    characters. Order and duplicates are kept as they appear in "words".

    Parameters
    ----------
    words: A list of strings.
    length: An int. Default: 10

    Returns
    -------
    A list of strings.
    '''

    selected = []
    for candidate in words:
        # Strict comparison: a word of exactly "length" characters is excluded.
        if len(candidate) > length:
            selected.append(candidate)
    return selected

In [123]:
# --- Monty Python and the Holy Grail ---
monty_l = long_words(monty, 12)
assert_is_instance(monty_l, list)
assert_true(all(isinstance(w, str) for w in monty_l))
assert_equal(len(monty_l), 6)
assert_equal(
    set(monty_l),
    set(['unfortunately', 'understanding', 'oooohoohohooo', 'indefatigable', 'camaaaaaargue', 'automatically'])
    )
assert_equal(len(long_words(monty,10)), 68)
assert_equal(len(long_words(monty,11)), 37)


# --- Pirates of the Caribbean ---
pirate_l = long_words(pirates, 13)
assert_is_instance(pirate_l, list)
# Check the element types of pirate_l itself (this line previously
# re-checked monty_l, leaving pirate_l unverified).
assert_true(all(isinstance(w, str) for w in pirate_l))
assert_equal(len(pirate_l), 5)
assert_equal(
    set(pirate_l),
    set(['simultanenously', 'responsibility', 'reconciliatory', 'incapacitorially', 'enthusiastically']))
assert_equal(len(long_words(pirates,10)), 107)
assert_equal(len(long_words(pirates,12)), 29)
