In [1]:
# classify part of speech based on sentence context
import nltk
from nltk.corpus import brown

# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):    
    features = {"suffix(1)": sentence[i][-1:],
		    "suffix(2)": sentence[i][-2:],
		    "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features 

In [2]:
# look at features of a specific word in a specific sentence
# first sentence of brown corpus
sentence0 = brown.sents()[0]
print(sentence0)
# word 8 of sentence 0
sentence0[8]

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


'investigation'

In [3]:
pos_features(sentence0, 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [4]:
tagged_sents = brown.tagged_sents(categories='news')
tag_sent0 = tagged_sents[0]
tag_sent0

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [5]:
nltk.tag.untag(tag_sent0)
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [6]:
featuresets = []
for tagged_sent in tagged_sents:
	untagged_sent = nltk.tag.untag(tagged_sent)
	for i, (word, tag) in enumerate(tagged_sent):
		featuresets.append( (pos_features(untagged_sent, i), tag) )

# look at the feature sets of the first 10 words
for f in featuresets[:10]:
	print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [7]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
len(train_set)
len(test_set)

10055

In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy (this will take a little while)
nltk.classify.accuracy(classifier, test_set)
# the result should be 0.78915962207856782, which is reasonable for features without the previous tag

0.7891596220785678

In [2]:
### sentence segmentation
sents = nltk.corpus.treebank_raw.sents()
len(sents)
for sent in sents[:10]:
    print (sent)

['.', 'START']
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.']
['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START']
['Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.']
['.', 'START']
['A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.']
['The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even'

In [3]:
# initialize an empty token list, an empty boundaries set and offset as the integer 0
tokens = [ ]
boundaries = set()
offset = 0
# make a list of tokens with sentence boundaries
#   the offset is set to the index of a sentence boundary
for sent in nltk.corpus.treebank_raw.sents():
      tokens.extend(sent)
      offset += len(sent)
      boundaries.add(offset - 1)

In [4]:
# look at tokens and boundaries
print(tokens[:40])
print(len(boundaries))
0 in boundaries
1 in boundaries
19 in boundaries
20 in boundaries
for num, tok in enumerate(tokens[:40]):
     print (num, tok, '\t', num in boundaries)

['.', 'START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', '.', 'START', 'Rudolph']
4193
0 . 	 False
1 START 	 True
2 Pierre 	 False
3 Vinken 	 False
4 , 	 False
5 61 	 False
6 years 	 False
7 old 	 False
8 , 	 False
9 will 	 False
10 join 	 False
11 the 	 False
12 board 	 False
13 as 	 False
14 a 	 False
15 nonexecutive 	 False
16 director 	 False
17 Nov 	 False
18 . 	 False
19 29 	 False
20 . 	 True
21 Mr 	 False
22 . 	 False
23 Vinken 	 False
24 is 	 False
25 chairman 	 False
26 of 	 False
27 Elsevier 	 False
28 N 	 False
29 . 	 False
30 V 	 False
31 ., 	 False
32 the 	 False
33 Dutch 	 False
34 publishing 	 False
35 group 	 False
36 . 	 True
37 . 	 False
38 START 	 True
39 Rudolph 	 False


In [5]:
# feature extraction function
# token is a list of words and we get the features of the token at offset i
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
        'prevword': tokens[i-1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1}

# feature dictionary for the period at index 20
tokens[20]

'.'

In [6]:
punct_features(tokens,20)

{'next-word-capitalized': True,
 'prev-word-is-one-char': False,
 'prevword': '29',
 'punct': '.'}

In [7]:
# Define featuresets of all candidate punctuation
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
      for i in range(1, len(tokens) - 1)
      if tokens[i] in '.?!']

# look at the feature sets of the first 10 punctuation symbols
for sf in Sfeaturesets[:10]:
	print (sf)


({'next-word-capitalized': False, 'prevword': 'nov', 'punct': '.', 'prev-word-is-one-char': False}, False)
({'next-word-capitalized': True, 'prevword': '29', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': 'mr', 'punct': '.', 'prev-word-is-one-char': False}, False)
({'next-word-capitalized': True, 'prevword': 'n', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': False, 'prevword': 'group', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': False, 'prevword': 'conglomerate', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': True, 'prevword': 'reported', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword

In [9]:

# separate into training and test sets and build classifier
size = int(len(Sfeaturesets) * 0.1)
size

594

In [10]:
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)
nltk.classify.accuracy(Sclassifier, Stest_set)

0.936026936026936

In [11]:
# this is the . after Nov
Sclassifier.classify(punct_features(tokens, 18))
# this is the . after 29, which should be true!
Sclassifier.classify(punct_features(tokens, 20))
# this is the . after group
Sclassifier.classify(punct_features(tokens, 36))

True

In [12]:

# define function to use the trained classifier to label sentences
def segment_sentences(words):
      start = 0
      sents = []
      for i, word in enumerate(words):
          if word in '.?!' and i < len(words) - 1 and Sclassifier.classify(punct_features(words, i)) == True:
              sents.append(words[start:i+1])
              start = i+1
      if start < len(words):
          sents.append(words[start:])
      return sents

In [13]:

print(len(tokens))
tokens[:50]
tinytokens = tokens[:1000]

for s in segment_sentences(tinytokens):
    print (s)

101797
['.']
['START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', '.', 'START', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.']
['The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'ev

In [14]:
# compare to NLKT default sentence tokenizer, which works on raw text instead of tokens
from nltk.tokenize import sent_tokenize

rawtext = 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.  Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.'
sents = nltk.sent_tokenize(rawtext)
sents

['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
 'Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.']