Commit

Added a regression test for SimpleGoodTuringProbDist to probability.doctest. Fixed another doctest. All tests in probability.doctest now pass
desilinguist committed Sep 15, 2012
1 parent 1684063 commit b19c49b
Showing 1 changed file with 24 additions and 10 deletions: nltk/test/probability.doctest
@@ -13,7 +13,7 @@ FreqDist

>>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
>>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']

>>> fd1 = nltk.FreqDist(text1)
>>> fd1.items()
[('!', 1), ('a', 1), ('anywhere', 1), ('fish', 1), ('goes', 1), ('good', 1), ('no', 1), ('porpoise', 1), ('without', 1)]
@@ -31,7 +31,7 @@ Note that items are sorted in order of decreasing frequency; two items of the sa
True
>>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
True

>>> fd2 = nltk.FreqDist(text2)
>>> fd1.update(fd2)
>>> fd1 == both
Expand All @@ -41,12 +41,12 @@ Note that items are sorted in order of decreasing frequency; two items of the sa
>>> fd1.update(text2)
>>> fd1 == both
True

>>> fd1 = nltk.FreqDist(text1)
>>> fd2 = nltk.FreqDist(fd1)
>>> fd2 == fd1
True
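The update and copy behaviour exercised above can be sketched with `collections.Counter` from the standard library. This is an assumed stand-in used only for illustration, not `FreqDist` itself:

```python
# Hypothetical stand-in: collections.Counter mimics the FreqDist
# update/copy semantics tested in the doctest above.
from collections import Counter

fd1 = Counter(['no', 'good', 'fish'])
fd2 = Counter(['fish', 'fish', 'no'])

both = fd1 + fd2      # a new combined distribution; fd1 is unchanged
fd1.update(fd2)       # in-place merge of fd2's counts into fd1
print(fd1 == both)    # True

fd3 = Counter(fd1)    # copy construction from an existing distribution
print(fd3 == fd1)     # True
```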

Testing some HMM estimators
---------------------------

@@ -65,16 +65,16 @@ from the whole corpus, not just the training corpus
>>> symbols = list(set([word for sent in corpus for (word,tag) in sent]))
>>> print len(symbols)
1464
>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

We divide the corpus into 90% training and 10% testing

>>> train_corpus = []
>>> test_corpus = []
>>> for i in range(len(corpus)):
...     if i % 10:
...         train_corpus += [corpus[i]]
...     else:
...         test_corpus += [corpus[i]]
>>> print len(train_corpus)
450
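The modulo-based 90/10 split above can be checked in isolation; this sketch uses dummy integer "sentences" in place of the tagged corpus:

```python
# Minimal sketch of the i % 10 split: every 10th item (i == 0, 10, 20, ...)
# goes to the test set, the remaining nine out of ten to the training set.
corpus = list(range(500))   # dummy stand-in for 500 tagged sentences
train_corpus, test_corpus = [], []
for i in range(len(corpus)):
    if i % 10:
        train_corpus.append(corpus[i])
    else:
        test_corpus.append(corpus[i])
print(len(train_corpus), len(test_corpus))  # 450 50
```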
@@ -107,7 +107,7 @@ Expected Likelihood Estimation (= Lidstone with gamma==0.5)
Lidstone Estimation, for gamma==0.1, 0.5 and 1
(the latter two should be exactly equal to the ELE and Laplace estimates above)

>>> def lidstone(gamma):
... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
>>> train_and_test(lidstone(0.1))
82.51%
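For reference, the Lidstone estimate is simple enough to sketch directly. This is an assumed restatement of the standard formula p(w) = (c(w) + gamma) / (N + B*gamma), not NLTK's `LidstoneProbDist` code:

```python
def lidstone_prob(count, total, bins, gamma):
    """Lidstone-smoothed probability: (c + gamma) / (N + B * gamma)."""
    return (count + gamma) / (total + bins * gamma)

# gamma == 0.5 gives Expected Likelihood Estimation, gamma == 1 gives
# Laplace smoothing, and as gamma -> 0 it approaches the MLE count / total.
print(lidstone_prob(3, 10, 5, 0.5))  # (3 + 0.5) / (10 + 2.5) = 0.28
```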
@@ -131,9 +131,9 @@ Good Turing Estimation

>>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd)
>>> train_and_test(gt)
0.17%

Remains to be added:
- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist

Squashed bugs
@@ -174,3 +174,17 @@ Issue 351: fix fileids method of CategorizedCorpusReader to inadvertently add er
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default; otherwise any unseen events get a probability of zero, i.e., they are not smoothed

>>> from nltk import SimpleGoodTuringProbDist, FreqDist
>>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
>>> p = SimpleGoodTuringProbDist(fd)
>>> p.prob('a')
0.017649766667026317
>>> p.prob('o')
0.08433050215340411
>>> p.prob('z')
0.022727272727272728
>>> p.prob('foobar')
0.022727272727272728
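The regression values above come from NLTK's SGT implementation, but the underlying Good-Turing idea can be checked independently. Assuming the standard re-estimate c* = (c + 1) * N_{c+1} / N_c (with N_c the number of event types seen exactly c times), the total mass reserved for unseen events is N_1 / N, which for the distribution above is 2/88 and matches p.prob('z'):

```python
# Sketch of the Good-Turing unseen mass for the frequency distribution
# used in the regression test above (not NLTK's SGT fitting code).
from collections import Counter

counts = {'a': 1, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4,
          'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7,
          'o': 8, 'p': 9, 'q': 10}
freq_of_freqs = Counter(counts.values())   # N_c: number of types with count c
N = sum(counts.values())                   # 88 tokens in total

# Total probability mass Good-Turing reserves for unseen events: N_1 / N.
unseen_mass = freq_of_freqs[1] / float(N)
print(unseen_mass)  # 0.022727272727272728 -- matches p.prob('z') above
```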
