# 3. 加工原料文本
1. 我们怎样才能编写程序访问本地和网络上的文件，从而获得无限的语言材料？
2. 我们如何把文档分割成单独的词和标点符号，这样我们就可以开始像前面章节中在文本语料上做的那样的分析？
3. 我们怎样编写程序产生格式化的输出，并把结果保存在一个文件中？

## 3.2 字符串：最底层的文本处理

### 字符串的基本操作

In [1]:
monty = 'Monty Python' # 基本字符串
print(monty)
print("================================================================================")
circus = 'Monty Python\'s Flying Circus' # 转义字符
print(circus)
print("================================================================================")
couplet = "Shall I compare thee to a Summer's day?"\
 "Thou are more lovely and more temperate:" # 多行
print(couplet)
print("================================================================================")
couplet2 = """Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:""" # 分行
print(couplet2)
print("================================================================================")
v = 'very' + 'very' + 'very' # 加号连接
print(v)

Monty Python
Monty Python's Flying Circus
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:
veryveryvery


In [2]:
# 索引
print(monty[0])
print(monty[-1])
print(monty[0:5])

M
n
Monty


In [3]:
# 字符串匹配
monty.find('Python')

6

In [4]:
# 字符计数
import nltk
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
fdist.keys()

dict_keys(['m', 'o', 'b', 'y', 'd', 'i', 'c', 'k', 'h', 'e', 'r', 'a', 'n', 'l', 'v', 't', 'g', 's', 'u', 'p', 'w', 'x', 'q', 'f', 'j', 'z'])

In [5]:
# 链表与字符串差异
query = 'Who knows?'
beatles = ['John', 'Paul', 'George', 'Ringo']
# 链表的可变性
print(beatles)
beatles[0] = 'me'
print(beatles)

['John', 'Paul', 'George', 'Ringo']
['me', 'Paul', 'George', 'Ringo']


## 3.4 使用正则表达式检测词组搭配

### 从文件中提取已编码文本

In [6]:
import nltk
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [7]:
# 以ed结尾的词
print([w for w in wordlist if re.search('ed$', w)][10:16])

['absorbed', 'abstracted', 'abstricted', 'accelerated', 'accepted', 'accidented']


In [8]:
# 第3个字母是j，第6个字母是t，长度为8的单词
print([w for w in wordlist if re.search('^..j..t..$', w)])

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']


In [9]:
# 中括号[]表示其中字母的一个
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)])

['gold', 'golf', 'hold', 'hole']


In [10]:
# +表示前面的字符重复一次或多次；*表示重复零次或多次
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
print([w for w in chat_words if re.search('^m+i+n+e+$', w)])

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']


In [11]:
# Escaping a metacharacter: "\." matches a literal dot instead of "any character".
# The pattern is written as a raw string so Python hands the backslash to the
# regex engine untouched rather than treating "\." as an invalid string escape.
wsj = sorted(set(nltk.corpus.treebank.words()))
print([w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][10:16])

['0.50', '0.54', '0.56', '0.60', '0.7', '0.82']


In [12]:
# {}大括号表示指定重复次数，可以是一个数字，也可以是一个上下限范围
print([w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][10:16])

['20-point', '20-stock', '21-month', '237-seat', '240-page', '27-year']


In [13]:
# 小括号(|)表示|前面或后面的内容
print([w for w in wsj if re.search('(ed|ing)$', w)][10:16])

['Arbitrage-related', 'Arbitraging', 'Asked', 'Assuming', 'Atlanta-based', 'Baking']


建议使用原始字符串 r'string' 来书写正则表达式，这样其中的反斜杠不会被 Python 当作字符串转义来处理。

## 3.5 正则表达式的有益应用

### 提取字符块

In [14]:
word = 'supercalifragilisticexpialidocious'
print(re.findall(r'[aeiou]', word))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']


In [15]:
# 使用正则表达式提取连续两个或更多元音字母组成的序列
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
    for vs in re.findall(r'[aeiou]{2,}', word))
print(dict(fd.items()))

{'ea': 476, 'oi': 65, 'ou': 329, 'io': 549, 'ee': 217, 'ie': 331, 'ui': 95, 'ua': 109, 'ai': 261, 'ue': 105, 'ia': 253, 'ei': 86, 'iai': 1, 'oo': 174, 'au': 106, 'eau': 10, 'oa': 59, 'oei': 1, 'oe': 15, 'eo': 39, 'uu': 1, 'eu': 18, 'iu': 14, 'aii': 1, 'aiia': 1, 'ae': 11, 'aa': 3, 'oui': 6, 'ieu': 3, 'ao': 6, 'iou': 27, 'uee': 4, 'eou': 5, 'aia': 1, 'uie': 3, 'iao': 1, 'eei': 2, 'uo': 8, 'uou': 5, 'eea': 1, 'ueui': 1, 'ioa': 1, 'ooi': 1}


### 在字符块上做更多事情

In [16]:
# Three alternatives: a run of vowels at the start of the word, a run of vowels
# at the end of the word, or any single character that is not an ASCII vowel.
# Vowels in the middle of a word match none of them and are therefore skipped.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Return *word* with its word-internal vowels removed.

    Leading and trailing vowel runs and all non-vowel characters are kept,
    in their original order.
    """
    kept = (match.group() for match in re.finditer(regexp, word))
    return ''.join(kept)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [17]:
# 提取单词中所有“辅音(p/t/k/s/v/r)+元音”的组合，并统计辅音与元音的搭配频率
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [18]:
cv_word_pairs = [(cv, w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs) # 对key重复的字典，将同一key下的所有values放到一个列表中

In [19]:
print(cv_index['ka'][10:16])

['kaakaoko', 'kaakasi', 'kaakasi', 'kaakau', 'kaakau', 'kaakauko']


### 查找词干

In [20]:
print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')) # 找到词缀(如果添加括号，那么只显示括号中的部分)
print(re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')) # 找到整个单词，注意?:的形式
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')) # 找到词干和词缀(两个括号，会返回两个匹配)

['ing']
['processing']
[('process', 'ing')]


In [21]:
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')) # Greedy: ".*" grabs as much as it can, so only the final "s" is left for the suffix group
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')) # Non-greedy: the "?" after "*" makes ".*?" match as little as possible, so the suffix group gets "es"

[('processe', 's')]
[('process', 'es')]


In [22]:
def stem(word):
    """Strip one suffix from *word* using a single regular expression.

    Returns the part of the word captured before the suffix, or the whole
    word when none of the listed suffixes is present.
    """
    # The lazy "(.*?)" keeps the stem as short as possible, so the suffix that
    # starts earliest is chosen ("es" rather than "s" in "processes"). The
    # trailing "?" makes the suffix group optional so every word still matches.
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    first_match = re.findall(pattern, word)[0]
    return first_match[0]

raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government. Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print([stem(t) for t in tokens][10:16])

['sword', 'i', 'no', 'basi', 'for', 'a']


### 搜索已分词文本

In [23]:
# 匹配两个<>中间的单词部分(如果添加括号，那么只显示括号中的部分)
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [24]:
# 匹配<bro>之前的两个单词(每个<.*>匹配一个完整的词)
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [25]:
# 连续三个或更多以l开头的单词
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [26]:
# 查找X and other Ys形式
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


## 3.6 规范化文本

### 词干提取器

In [27]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government. Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
# 两种词干提取器
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [28]:
print([porter.stem(t) for t in tokens])
print([lancaster.stem(t) for t in tokens])

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']
['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


In [29]:
# Concordance over stemmed words
class IndexedText(object):
    """Text wrapper that looks words up by stem and prints their context.

    Every token is stemmed and lower-cased once at construction time and
    indexed by that key, so differently inflected forms of a word share
    one index entry.
    """

    def __init__(self, stemmer, text):
        """Store the text and build the stem index.

        stemmer: object with a ``stem(word)`` method that returns a string.
        text: sequence of tokens; ``concordance`` slices it by position.
        """
        self._text = text
        self._stemmer = stemmer
        # Pairs of (stemmed token, position) collected into an nltk.Index.
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        """Print one context line for every token whose stem equals *word*'s.

        word: the word to look up; it is stemmed before the lookup.
        width: number of characters shown on each side of the split point.
        """
        key = self._stem(word)
        wc = int(width/4)                # number of context tokens per side
        for i in self._index[key]:
            # Clamp the start at 0: for a hit within the first wc tokens,
            # i-wc is negative and would be taken as an offset from the end
            # of the text, which loses the left context.
            lcontext = ' '.join(self._text[max(0, i-wc):i])
            rcontext = ' '.join(self._text[i:i+wc])
            # Right-align the left context and left-align the right context
            # so the matched token starts at the same column on every line.
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        """Return the lower-cased stem of *word*."""
        return self._stemmer.stem(word).lower()
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie', 20)

sten , strange women lying in ponds distr
etreat . ROBIN : All lies ! MINSTREL : [ 
    . Come . You may lie here . Oh ,     
     , no , please ! Lie down . [ clap   
 for beyond the cave lies the Gorge of Et
: To the north there lies a cave -- the  
es of full fifty men lie strewn about its
 til each one of you lies dead , and the 


### 词形归并

In [30]:
# 词形归并不同于词干提取，它会把词还原为其在字典中的形式。例如lying因为在字典中存在所以不会被处理，但名词复数会被还原为单数
wnl = nltk.WordNetLemmatizer()
[(wnl.lemmatize(t),t) for t in tokens][:10]

[('DENNIS', 'DENNIS'),
 (':', ':'),
 ('Listen', 'Listen'),
 (',', ','),
 ('strange', 'strange'),
 ('woman', 'women'),
 ('lying', 'lying'),
 ('in', 'in'),
 ('pond', 'ponds'),
 ('distributing', 'distributing')]

## 3.7 用正则表达式为文本分词

### 分词的简单方法(匹配或分割)

In [31]:
# 利用分隔符的简单方法
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
    though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
    well without--Maybe it's always pepper that makes people hot-tempered,'..."""
print(re.split(r'[ :\t\n]+', raw)) # 输入分隔符

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [32]:
print(re.split(r'\W+', raw)) # 排除所有非字母形式

['', 'When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered', '']


In [33]:
print(re.findall(r'\w+|\S\w*', raw)) # 找到全部为字母，或者以非空白形式开头+字母的形式

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'", '.', '.', '.']


In [34]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw)) # 匹配连接符形式、'、多个字符、或者以非空白形式开头+字母的形式

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


### NLTK 的正则表达式分词器

In [35]:
pattern = r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*"
print(nltk.regexp_tokenize(raw, pattern))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


## 3.8 分割

### 断句

In [36]:
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
for i in sents[79:85]: 
    print(i)

"Nonsense!"
said Gregory, who was very rational when anyone else
attempted paradox.
"Why do all the clerks and navvies in the
railway trains look so sad and tired, so very sad and tired?
I will
tell you.
It is because they know that the train is going right.
It
is because they know that whatever place they have taken a ticket
for that place they will reach.


### 分词

In [37]:
# Split a string into words according to a segmentation bit-string
def segment(text, segs):
    """Cut *text* after every position whose flag in *segs* is '1'.

    text: the string to split.
    segs: sequence of '0'/'1' flags; a '1' at index i ends a word after text[i].

    Returns the list of pieces, always ending with whatever follows the
    last cut (possibly an empty string).
    """
    cut_points = [pos + 1 for pos, flag in enumerate(segs) if flag == '1']
    boundaries = [0] + cut_points + [len(text)]
    return [text[start:stop] for start, stop in zip(boundaries, boundaries[1:])]

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
segment(text, seg1)

# Objective function for a candidate segmentation
def evaluate(text, segs):
    """Return the score of segmenting *text* with *segs*.

    The score is the number of word tokens plus the lexicon size, where
    every distinct word costs its length plus one.
    """
    tokens = segment(text, segs)
    lexicon_cost = sum(1 + len(entry) for entry in set(tokens))
    return len(tokens) + lexicon_cost
evaluate(text, seg1)

# 模拟退火算法
from random import randint

# Toggle the 0/1 flag at a single position
def flip(segs, pos):
    """Return a copy of *segs* in which the flag at *pos* is inverted."""
    toggled = str(1 - int(segs[pos]))
    return segs[:pos] + toggled + segs[pos + 1:]

# Toggle n randomly chosen positions (the caller derives n from the temperature)
def flip_n(segs, n):
    """Apply *n* single-position flips to *segs* and return the result.

    Positions are drawn independently with randint, so the same position
    may be picked more than once.
    """
    for _ in range(n):
        target = randint(0, len(segs) - 1)
        segs = flip(segs, target)
    return segs

# Simulated annealing search
def anneal(text, segs, iterations, cooling_rate): 
    '''
    Search for a low-scoring segmentation of text by simulated annealing.

    text: the unsegmented text
    segs: segmentation bit-string, of length len(text)-1
    iterations: number of guesses tried at each temperature
    cooling_rate: divisor applied to the temperature after every round; the
        larger it is, the faster the temperature falls and the fewer rounds
        are run. A value in (0, 1] never brings the temperature down to 0.5,
        so the loop would not end.

    Returns the bit-string kept after the last round. Each time a guess
    improves on the best score, that score and its segmentation are printed.
    '''
    temperature = float(len(segs)) # the initial temperature is the length of the bit-string
    while temperature > 0.5: 
        best_segs, best = segs, evaluate(text, segs) # best segmentation so far at this temperature
        for i in range(iterations): # try this many guesses at the current temperature
            guess = flip_n(segs, round(temperature)) # flip round(temperature) random positions of the round's starting bit-string
            score = evaluate(text, guess) # score the guess
            if score < best:
                best, best_segs = score, guess # the guess scores lower, so it becomes the new best
                print(evaluate(text, best_segs), segment(text, best_segs))
        score, segs = best, best_segs # round finished: the next round starts from the best bit-string found
        temperature = temperature / cooling_rate # cool down before the next round
    return segs
anneal(text, seg1, 5000, 1.2)

62 ['doyo', 'useet', 'hek', 'itty', 'seeth', 'edoggy', 'doyouliket', 'hek', 'itty', 'liketh', 'edoggy']
59 ['doyou', 'seet', 'he', 'k', 'itty', 'seethedoggy', 'doyou', 'likethe', 'k', 'itty', 'likethe', 'doggy']
57 ['do', 'you', 'seet', 'he', 'k', 'itty', 'seet', 'he', 'doggy', 'doyou', 'liket', 'he', 'k', 'itty', 'liket', 'he', 'doggy']
55 ['doy', 'ou', 'seet', 'he', 'k', 'itty', 'seet', 'he', 'doggy', 'doy', 'ou', 'l', 'iket', 'he', 'k', 'itty', 'l', 'iket', 'he', 'doggy']
52 ['do', 'you', 'seet', 'he', 'k', 'itty', 'seet', 'he', 'doggy', 'do', 'you', 'liket', 'he', 'k', 'itty', 'liket', 'he', 'doggy']
49 ['do', 'you', 'seet', 'he', 'kitty', 'seet', 'he', 'doggy', 'do', 'you', 'liket', 'he', 'kitty', 'liket', 'he', 'doggy']
46 ['doyou', 'seet', 'he', 'kitty', 'seet', 'he', 'doggy', 'doyou', 'liket', 'he', 'kitty', 'liket', 'he', 'doggy']


'0000100010100001000101000010000100001010000100001010000'

## 3.9 格式化：从链表到字符串

In [38]:
# 从链表到字符串
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print(' '.join(silly))

We called him Tortoise because he taught us .


In [39]:
# 字符串格式化表达式
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print(word, '->', fdist[word], end='; ') # 原始方法
print(' ')
for word in sorted(fdist):
    print('{}->{};'.format(word, fdist[word]), end=' ') # format形式

cat -> 3; dog -> 4; snake -> 1;  
cat->3; dog->4; snake->1; 

In [40]:
print('from {1} to {0}'.format('A', 'B')) # 设置索引
print('{0:4}'.format(41)) # 设置宽度
count, total = 3205, 9375
print("accuracy for {} words: {:.4%}".format(total, count / total)) # 设置数据类型

from B to A
  41
accuracy for 9375 words: 34.1867%


In [41]:
# Write the vocabulary to a plain text file, one word per line
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
# The with-block closes the file on exit, so buffered lines are flushed to
# disk even if printing fails part-way through.
with open('output.txt', 'w') as output_file:
    for word in sorted(words):
        print(word, file=output_file)

In [42]:
# Wrapping long output
from textwrap import fill
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',',
            'more', 'is', 'said', 'than', 'done', '.']
# Named "template" rather than "format" so that the built-in format()
# function is not shadowed for the rest of the session.
template = '%s (%d),'
pieces = [template % (word, len(word)) for word in saying]
output = ' '.join(pieces)
print(output) # printed as one long line
wrapped = fill(output)
print(wrapped) # printed after textwrap.fill has broken it into lines

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more (4), is (2), said (4), than (4), done (4), . (1),
After (5), all (3), is (2), said (4), and (3), done (4), , (1), more
(4), is (2), said (4), than (4), done (4), . (1),


In [43]:
# Tabulated output
def tabulate(cfdist, words, categories):
    """Print a table with one row per category and one column per word.

    cfdist: mapping indexed as cfdist[category][word] to obtain each cell.
    words: column labels, in display order.
    categories: row labels, in display order.

    Every cell, including the last one on a line, is followed by a space.
    """
    # Header row: left-aligned title column, right-aligned word columns.
    header = ['{:16}'.format('Category')]
    header.extend('{:>6}'.format(word) for word in words)
    print(' '.join(header) + ' ')
    # One line per category: its name followed by the value for each word.
    for category in categories:
        row = ['{:16}'.format(category)]
        row.extend('{:6}'.format(cfdist[category][word]) for word in words)
        print(' '.join(row) + ' ')
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
           (genre, word)
           for genre in brown.categories()
           for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)

Category            can  could    may  might   must   will 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
hobbies             268     58    131     22     83    264 
science_fiction      16     49      4     12      8     16 
romance              74    193     11     51     45     43 
humor                16     30      8      8      9     13 
