In [19]:
from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Accessing Text from Web or Disk

In [7]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read, instead of whenever it is garbage-collected.
with request.urlopen(url) as response:
    # Decoding as plain UTF-8 keeps the leading byte-order mark ('\ufeff'),
    # which is why it appears at the start of `raw` and of the first token.
    raw = response.read().decode('utf8')
raw[:75]

'\ufeffThe Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r'

In [10]:
tokens = word_tokenize(raw)
print(tokens[:10])

['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']


In [11]:
# Wrap the token list in an nltk.Text to get the corpus-exploration helpers.
text = nltk.Text(tokens)
print(type(text))
# A Text can be sliced like a list of tokens.
print(text[1024:1062])
# NOTE(review): collocations() prints its results itself; wrapping it in
# print() additionally prints its return value, which is the trailing "None"
# in this cell's output.
print(text.collocations())

<class 'nltk.text.Text'>
['an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.', 'He', 'had', 'successfully']
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Ilya Petrovitch; Project
Gutenberg; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens
None


In [13]:
# locate the start and end markers of the book's actual content
print(raw.find('PART I'))
print(raw.rfind("End of Project Gutenberg's Crime"))

5336
-1


In [14]:
# Locate the body of the novel by its markers instead of pasting offsets from
# an earlier cell's output, so the cell keeps working if the download changes
# and can be re-run without trimming more text each time.
start = raw.find('PART I')
end = raw.rfind("End of Project Gutenberg's Crime")
# find()/rfind() return -1 for a missing marker. Used directly as a slice
# bound, -1 would start at the last character or silently drop it, so fall
# back to the corresponding end of the string instead.
if start == -1:
    start = 0
if end == -1:
    end = len(raw)
raw = raw[start:end]
raw.find('PART I')

0

In [15]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Close the HTTP connection deterministically once the page has been read.
with request.urlopen(url) as response:
    html = response.read().decode('utf8')
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [20]:
# get text out of html
from bs4 import BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the extracted text can differ between machines
# (and it emits a warning, which the global warnings filter hides here).
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
print(tokens[:20])

['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'NEWS', 'SPORT', 'WEATHER', 'WORLD', 'SERVICE', 'A-Z', 'INDEX', 'SEARCH']


In [21]:
text = nltk.Text(tokens)
text.concordance('gene')

Displaying 7 of 7 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin
er's Polio campaign launched in Iraq Gene defect explains high blood pressure 
er's Polio campaign launched in Iraq Gene defect explains high blood pressure 


### Processing RSS Feeds

In [23]:
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']

'Language Log'

In [27]:
post = llog.entries[2]
post.title

'On beyond the (International Phonetic) Alphabet'

In [32]:
# The post body is an HTML fragment; strip the markup before tokenizing.
content = post.content[0].value
# Explicit parser so the result does not depend on which parsers are installed.
raw = BeautifulSoup(content, 'html.parser').get_text()
print(word_tokenize(raw)[:20])

['The', 'International', 'Phonetic', 'Alphabet', 'is', 'a', 'useful', 'invention', ',', 'which', 'everyone', 'interested', 'in', 'speech', 'sounds', 'should', 'learn', '.', 'But', 'it']


## The NLP Pipeline

![pipeline](http://www.nltk.org/images/pipeline1.png)

## Useful String Methods

|Method|Functionality|
|------|-------------|
|s.find(t)|index of first instance of string t inside s (-1 if not found)|
|s.rfind(t)|index of last instance of string t inside s (-1 if not found)|
|s.index(t)|like s.find(t) except it raises ValueError if not found|
|s.rindex(t)|like s.rfind(t) except it raises ValueError if not found|
|s.join(text)|combine the words of the text into a string using s as the glue|
|s.split(t)|split s into a list wherever a t is found (whitespace by default)|
|s.splitlines()|split s into a list of strings, one per line|
|s.lower()|a lowercased version of the string s|
|s.upper()|an uppercased version of the string s|
|s.title()|a titlecased version of the string s|
|s.strip()|a copy of s without leading or trailing whitespace|
|s.replace(t, u)|replace instances of t with u inside s|

# Text Processing with Unicode
Some encodings (such as ASCII and Latin-2) use a single byte per code point, so they can only support a small subset of Unicode, enough for a single language. Other encodings (such as UTF-8) use multiple bytes and can represent the full range of Unicode characters.

![unicode](http://www.nltk.org/images/unicode.png)

In [34]:
path = nltk.data.find('../nltk_data/corpora/unicode_samples/polish-lat2.txt')
# The file is Latin-2 encoded; open() decodes it to str as it is read.
# The with-block closes the handle once the loop finishes (or fails).
with open(path, encoding='latin2') as f:
    for line in f:
        line = line.strip()
        print(line)

Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


In [35]:
ord('ń')

324

In [39]:
a=hex(324)
a

'0x144'

In [42]:
"\u0144"

'ń'

In [44]:
import unicodedata
# Read all lines inside a with-block so the file handle is closed right away.
with open(path, encoding='latin2') as f:
    lines = f.readlines()
line = lines[2]
print(line)
print()
# unicode_escape shows every non-ASCII character as a \xXX / \uXXXX escape.
print(line.encode('unicode_escape'))

Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały


b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'


In [46]:
for c in line:
    if ord(c) > 127:
        print('{} U+{:4x} {}'.format(c,ord(c),unicodedata.name(c)))

ó U+  f3 LATIN SMALL LETTER O WITH ACUTE
ś U+ 15b LATIN SMALL LETTER S WITH ACUTE
Ś U+ 15a LATIN CAPITAL LETTER S WITH ACUTE
ą U+ 105 LATIN SMALL LETTER A WITH OGONEK
ł U+ 142 LATIN SMALL LETTER L WITH STROKE


## Regular Expressions

In [49]:
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
#find words ending with `ed` 
print([w for w in wordlist if re.search('ed$',w)][:20])

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded', 'absorbed', 'abstracted', 'abstricted', 'accelerated', 'accepted', 'accidented', 'accoladed', 'accolated', 'accomplished', 'accosted']


In [51]:
#find an 8-letter word with j as its third letter and t as its sixth letter
print([w for w in wordlist if re.search('^..j..t..$',w)])

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']


In [53]:
#Text on 9 keys
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)])

['gold', 'golf', 'hold', 'hole']


In [56]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
print([w for w in chat_words if re.search('^m+i+n+e+$',w)])

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']


### Basic Regular Expression Meta-Characters

|Operator|Behavior|
|--------|--------|
|.|Wildcard, matches any character|
|^abc|Matches some pattern abc at the start of a string|
|abc$|Matches some pattern abc at the end of a string|
|[abc]|Matches one of a set of characters|
|[A-Z0-9]|Matches one of a range of characters|
|ed&#124;ing&#124;s|Matches one of the specified strings (disjunction)|
|*|Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)|
|+|One or more of previous item, e.g. a+, [a-z]+|
|?|Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?|
|{n}|Exactly n repeats where n is a non-negative integer|
|{n,}|At least n repeats|
|{,n}|No more than n repeats|
|{m,n}|At least m and no more than n repeats|
|a(b&#124;c)+|Parentheses that indicate the scope of the operators|


## Extracting word pieces

In [59]:
## find vowels
word = 'supercalifragilisticexpialidocious'
print(re.findall(r'[aeiou]', word))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']


In [61]:
wsj = sorted(nltk.corpus.treebank.words())
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}',word))
fd.most_common(12)

[('io', 2533),
 ('ea', 2228),
 ('ou', 2110),
 ('ai', 1555),
 ('ie', 1107),
 ('ee', 1056),
 ('ia', 995),
 ('oo', 537),
 ('ue', 502),
 ('ei', 448),
 ('au', 383),
 ('ui', 360)]

In [69]:
#The regular expression in our next example matches initial vowel sequences, final vowel sequences, and all consonants;
pattern = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    """Return `word` with its word-internal vowels removed.

    The pieces kept, in their original order, are: a run of vowels at the very
    start, a run of vowels at the very end, and every non-vowel character.
    """
    kept = (match.group(0) for match in re.finditer(pattern, word))
    return ''.join(kept)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [74]:
#extract all consonant-vowel sequences from the words
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]',w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [75]:
cv_word_pairs = [(cv,w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]',w)]
cv_index = nltk.Index(cv_word_pairs)
cv_index['su']

['kasuari']

## finding word stems

In [81]:
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['ing']

Here, re.findall() just gave us the suffix even though the regular expression matched the entire word. This is because the parentheses have a second function, to select substrings to be extracted. If we want to use the parentheses to specify the scope of the disjunction, but not to select the material to be output, we have to add ?:, which is just one of many arcane subtleties of regular expressions. Here's the revised version.

In [82]:
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['processing']

In [83]:
#split the word into stem and suffix
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

`.*` is greedy: it consumes as much of the input as possible.
`.*?` is non-greedy: it consumes as little of the input as possible.

In [84]:
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))

[('processe', 's')]
[('process', 'es')]


In [87]:
#allow empty suffix
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')

[('language', '')]

### Searching Tokenized Text
The angle brackets are used to mark token boundaries, and any whitespace between the angle brackets is ignored

In [98]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
# Search for "a <word> man"; only the parenthesised middle token is reported.
# NOTE(review): findall() prints its matches itself; the outer print() is what
# adds the trailing "None" to this cell's output.
print(moby.findall(r"<a>(<.*>)<man>"))

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave
None


In [100]:
# finds three-word phrases ending with the word bro
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*><.*><bro>")

you rule bro; telling you bro; u twizted bro


In [101]:
# finds sequences of three or more words starting with the letter l
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [102]:
# discover hypernyms
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies','learned']))
hobbies_learned.findall(r"<\w*><and><other><\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


# Normalizing Text
## Stemmers

In [103]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
     is no basis for a system of government.  Supreme executive power derives from
     a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)

In [106]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(t) for t in tokens])
print()
print([lancaster.stem(t) for t in tokens])

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


In [108]:
class IndexedText(object):
    """Concordance over a token sequence in which words are matched by stem.

    Every token of `text` is stemmed (and lowercased) once at construction
    time and its position recorded under that stem, so a later lookup finds
    all inflected forms that share the query word's stem.
    """

    def __init__(self, stemmer, text):
        """Build the stem -> positions index.

        stemmer -- object with a ``stem(word)`` method returning a string
        text    -- sliceable sequence of string tokens
        """
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        """Print one line per token whose stem equals the stem of `word`.

        Each line shows up to ``width`` characters of context to the left of
        the match and up to ``width`` characters starting at the match, each
        side padded to exactly ``width`` columns.
        """
        key = self._stem(word)
        wc = int(width/4)                # words of context
        for i in self._index[key]:
            # Clamp the start at 0: for a match within the first `wc` tokens a
            # negative start would be counted from the end of the text, which
            # yields an empty (or wrong) left context.
            lcontext = ' '.join(self._text[max(0, i-wc):i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        """Return the lowercased stem of `word` (the index key)."""
        return self._stemmer.stem(word).lower()
    
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter,grail)
text.concordance('lie')

r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no
 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of
       Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !   
doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well  
ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which 
   you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog --
h it and lived ! Bones of full fifty men lie strewn about its lair . So , brave k
not stop our fight ' til each one of you lies dead , and the Holy Grail returns t


## Lemmatization

In [109]:
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t) for t in tokens])

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


# Regular Expressions for Tokenizing Text

In [110]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
     though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
     well without--Maybe it's always pepper that makes people hot-tempered,'..."""
# split by space
print(re.split(r' ',raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone\n', '', '', '', '', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very\n', '', '', '', '', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [112]:
# split by spaces, tabs, or newlines 
print(re.split(r'[ \t\n]+',raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [113]:
#split the input on anything other than a word character:
#\w equivalent to [a-zA-Z0-9_]
print(re.split(r'\W+',raw))

['', 'When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered', '']


In [117]:
print(re.findall(r'\w+|\S\w*',raw))

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'", '.', '.', '.']


In [118]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


### Regular Expressions Symbols

|Symbol|Function|
|------|--------|
|\b|Word boundary (zero width)|
|\d|Any decimal digit (equivalent to [0-9])|
|\D|Any non-digit character (equivalent to [^0-9])|
|\s|Any whitespace character (equivalent to [ \t\n\r\f\v])|
|\S|Any non-whitespace character (equivalent to [^ \t\n\r\f\v])|
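|\w|Any word character: a letter, digit or underscore (equivalent to [a-zA-Z0-9_] for ASCII text; in Python 3 non-ASCII letters and digits also match unless re.ASCII is set)|
|\W|Any non-word character (equivalent to [^a-zA-Z0-9_] for ASCII text; the complement of \w)|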
|\t|The tab character|
|\n|The newline character|

# Segmentation

In [124]:
# Sentence segmentation
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])

['"Nonsense!"',
 'said Gregory, who was very rational when anyone else\nattempted paradox.',
 '"Why do all the clerks and navvies in the\n'
 'railway trains look so sad and tired, so very sad and tired?',
 'I will\ntell you.',
 'It is because they know that the train is going right.',
 'It\n'
 'is because they know that whatever place they have taken a ticket\n'
 'for that place they will reach.',
 'It is because after they have\n'
 'passed Sloane Square they know that the next station must be\n'
 'Victoria, and nothing but Victoria.',
 'Oh, their wild rapture!',
 'oh,\n'
 'their eyes like stars and their souls again in Eden, if the next\n'
 'station were unaccountably Baker Street!"',
 '"It is you who are unpoetical," replied the poet Syme.']


In [125]:
def segment(text, segs):
    """Split `text` into words according to the boundary markers in `segs`.

    A '1' at index i of `segs` means a word ends after character i of `text`.
    Whatever follows the last marked boundary is returned as the final word.
    """
    # Offsets in `text` at which a word ends (one past each '1').
    cuts = [pos + 1 for pos, flag in enumerate(segs) if flag == '1']
    starts = [0] + cuts
    words = [text[begin:end] for begin, end in zip(starts, cuts)]
    # Remainder after the final boundary (the whole text if there were none).
    words.append(text[starts[-1]:])
    return words

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
print(segment(text, seg1))
print(segment(text, seg2))

['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']


In [126]:
def evaluate(text, segs):
    """Score the segmentation of `text` described by `segs`.

    The score is the number of words in the segmented text plus, for every
    distinct word, its length plus one.
    """
    words = segment(text, segs)
    lexicon = set(words)
    # Each distinct word costs its length plus one.
    lexicon_cost = sum(len(entry) for entry in lexicon) + len(lexicon)
    return len(words) + lexicon_cost

from random import randint

def flip(segs, pos):
    """Return a copy of `segs` with the bit at index `pos` inverted ('0' <-> '1')."""
    head = segs[:pos]
    toggled = str(1 - int(segs[pos]))
    tail = segs[pos+1:]
    return head + toggled + tail

def flip_n(segs, n):
    """Apply `n` successive single-bit flips at random positions to `segs`.

    Positions are drawn independently, so the same position may be flipped
    more than once. Returns the resulting string; `segs` itself is unchanged.
    """
    flipped = segs
    for _ in range(n):
        pos = randint(0, len(flipped) - 1)
        flipped = flip(flipped, pos)
    return flipped

def anneal(text, segs, iterations, cooling_rate, seed=None):
    """Search for a low-scoring segmentation of `text` by random bit flips.

    Starting from `segs`, each round tries `iterations` random variations of
    the current segmentation (flipping about `temperature` bits each) and
    keeps the best-scoring one according to evaluate(). The temperature starts
    at len(segs) and is divided by `cooling_rate` after every round; the
    search stops once it is no longer above 0.5, so `cooling_rate` must be
    greater than 1 for the loop to terminate.

    text         -- the unsegmented string
    segs         -- initial boundary string of '0'/'1' characters
    iterations   -- number of candidates tried per round
    cooling_rate -- divisor applied to the temperature after each round
    seed         -- optional; when given, the `random` module's global
                    generator is seeded with it first so the run is
                    reproducible. The default (None) leaves it untouched.

    Prints the score and segmentation after every round, then a blank line,
    and returns the best boundary string found.
    """
    if seed is not None:
        import random
        random.seed(seed)
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, round(temperature))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        # `best` already is evaluate(text, segs); no need to recompute it.
        print(best, segment(text, segs))
    print()
    return segs

anneal(text,seg1,5000,1.2)


64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
60 ['doyouseethekitty', 'se', 'ethedoggy', 'doyouli', 'kethekitt', 'ylik', 'ethedoggy']
60 ['doyouseethekitty', 'se', 'ethedoggy', 'doyouli', 'kethekitt', 'ylik', 'ethedoggy']
59 ['doyouseet', 'hekitt', 'y', 'se', 'ethedoggy', 'doyou', 'liket', 'hekitt', 'ylik', 'ethedoggy']
57 ['doyou', 's', 'eet', 'hekitt', 'y', 's', 'e', 'ethedoggy', 'doyou', 'liket', 'hekitt', 'ylik', 'ethedoggy']
57 [

'0000101000000001010000000010000100100000000100100000000'

# Formatting: From lists to Strings

In [127]:
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
' '.join(silly)

'We called him Tortoise because he taught us .'

### Lining things up

In [130]:
# padding to control width of output
print('{:6}'.format(41))
# '<' for left-justified
print('{:<6}'.format(41))
# '>' for right-justified
print('{:>6}'.format(41))

    41
41    
    41


In [132]:
import math
'{:.4f}'.format(math.pi)

'3.1416'

The string formatting is smart enough to know that if you include a '%' in your format specification, then you want to represent the value as a percentage; there's no need to multiply by 100.

In [133]:
count, total = 3205, 9375
"accuracy for {} words: {:.4%}".format(total, count / total)

'accuracy for 9375 words: 34.1867%'

In [135]:
def tabulate(cfdist, words, categories):
    """Print a table of counts with one row per category and one column per word.

    cfdist     -- mapping indexed as cfdist[category][word]
    words      -- column keys, also used as the column headings
    categories -- row keys, also used as the row headings
    """
    def cell(spec, value):
        # Emit one formatted cell followed by a single separating space.
        print(spec.format(value), end=' ')

    # Heading row: label column, then the right-aligned words.
    cell('{:16}', 'Category')
    for heading in words:
        cell('{:>6}', heading)
    print()

    # One row per category: row label, then the count for each word.
    for row_key in categories:
        cell('{:16}', row_key)
        for column_key in words:
            cell('{:6}', cfdist[row_key][column_key])
        print()

from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
           (genre, word)
           for genre in brown.categories()
           for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)

Category            can  could    may  might   must   will 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
hobbies             268     58    131     22     83    264 
science_fiction      16     49      4     12      8     16 
romance              74    193     11     51     45     43 
humor                16     30      8      8      9     13 


In [137]:
# specify the width of a field using a variable
'{:{width}}'.format('Monty Python',width=15)

'Monty Python   '

### writing results to a file

In [140]:
# Build the vocabulary first so no file is created if loading the corpus fails.
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
# The with-block flushes and closes the file even if a write raises.
with open('output.txt', 'w') as output_file:
    for word in sorted(words):
        print(word, file=output_file)