In [1]:
import re

from autosuggest import AutoSuggester, AutoSuggesterFitter, Corpus

In [2]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
class NLTKCorpus(Corpus):
    
    def __init__(self, texts):
        self.texts = texts
        
    def __iter__(self):
        for tokens in self.iter_tokens():
            yield tokens
        
    def iter_tokens(self):
        alpha_rgx = re.compile(".*[a-z]+.*")
        sentence_terminators = {".", "?", "!"}
        for text in self.texts:
            tokens = []
            for token in text:
                token = token.lower()
                if token in sentence_terminators:
                    if tokens:
                        yield tokens
                    tokens = []
                elif alpha_rgx.match(token):
                    tokens.append(token)

            if tokens:
                yield tokens

In [4]:
test_examples = [
    "I am going to the",
    "He is going to the",
    "going to the",
    "to the",
    "the",
    "we need",
    "i think you should meet",
    "should meet",
    "meet",
    "Do you have any",
    "They all have different"
]

Test Out of Distribution Learning

In [5]:
text1_corpus = NLTKCorpus([text1])
text2_corpus = NLTKCorpus([text2])
text3_corpus = NLTKCorpus([text3])
text4_corpus = NLTKCorpus([text4])
text5_corpus = NLTKCorpus([text5])
train_corpus = NLTKCorpus([text1, text2, text3, text4, text5])
test_corpus = NLTKCorpus([text6, text7, text8, text9])

In [6]:
fitter = AutoSuggesterFitter(5, 2)

In [7]:
suggester = fitter.fit([text1_corpus, text2_corpus, text3_corpus, text4_corpus, text5_corpus])

loading folds
getting grid
getting grid values for n-grams of size 3
getting grid values for n-grams of size 4
getting grid values for n-grams of size 5
Fitting 8 grid points
weights = [1, 1, 1, 1]
score = 0.18629198011141268
weights = [1, 1, 1, 4469.0]
score = 0.18646808921935928
weights = [1, 1, 4469.0, 1]
score = 0.1862736206897046
weights = [1, 1, 4469.0, 4469.0]
score = 0.18627006583278602
weights = [1, 4469.0, 1, 1]
score = 0.1850302903466768
weights = [1, 4469.0, 1, 4469.0]
score = 0.18509750755917803
weights = [1, 4469.0, 4469.0, 1]
score = 0.18499563364123423
weights = [1, 4469.0, 4469.0, 4469.0]
score = 0.1849748789910675


In [8]:
suggester.weights

[1, 1, 1, 4469.0]

In [9]:
print("Baseline training set score: {}".format(fitter.baseline_score(suggester, train_corpus)))
print("Training set score: {}".format(fitter.score(suggester, train_corpus)))
print("Baseline test set score: {}".format(fitter.baseline_score(suggester, test_corpus)))
print("Test set score: {}".format(fitter.score(suggester, test_corpus)))

Baseline training set score: 0.17130657332158974
Training set score: 0.9308323541781792
Baseline test set score: 0.15489645393938567
Test set score: 0.18523769921880326


In [10]:
for sentence in test_examples:
    print("Sentence: {}".format(sentence))
    print("Suggestions: {}".format(suggester.suggest(sentence.lower().split(), 5)))
    print()

Sentence: I am going to the
Suggestions: [('world', 426), ('whale', 421), ('same', 388), ('people', 319), ('most', 273)]

Sentence: He is going to the
Suggestions: [('world', 426), ('whale', 421), ('same', 388), ('people', 319), ('most', 273)]

Sentence: going to the
Suggestions: [('world', 426), ('whale', 421), ('same', 388), ('people', 319), ('most', 273)]

Sentence: to the
Suggestions: [('world', 426), ('whale', 421), ('same', 388), ('people', 319), ('most', 273)]

Sentence: the
Suggestions: [('world', 410), ('whale', 407), ('same', 381), ('people', 299), ('most', 268)]

Sentence: we need
Suggestions: [('to', 24), ('not', 23), ('of', 16), ('a', 14), ('for', 7)]

Sentence: i think you should meet
Suggestions: [('the', 21), ('him', 11), ('you', 7), ('with', 6), ('them', 6)]

Sentence: should meet
Suggestions: [('the', 21), ('him', 11), ('you', 7), ('with', 6), ('them', 6)]

Sentence: meet
Suggestions: [('the', 21), ('him', 10), ('you', 7), ('with', 6), ('them', 6)]

Sentence: Do you h

Test In Distribution Learning

In [11]:
class NLTKSampleCorpus(NLTKCorpus):
    
    def __init__(self, texts, hash_select_func):
        self.hash_select_func = hash_select_func
        super().__init__(texts)
        
    def __iter__(self):
        for tokens in self.iter_tokens():
            if self.hash_select_func(hash(tuple(tokens))):
                yield tokens

In [12]:
all_texts = [text1, text2, text3, text4, text5, text6, text7, text8, text9]
train_folds_corpi = [
    NLTKSampleCorpus(all_texts, lambda h: 16 * i <= h % 100 < 16 * (i+1))
    for i in range(5)
]
train_corpus = NLTKSampleCorpus(all_texts, lambda h: 0 <= h % 100 < 80)
test_corpus = NLTKSampleCorpus(all_texts, lambda h: 80 <= h % 100 < 100)

In [13]:
fitter = AutoSuggesterFitter(5, 2)

In [14]:
suggester = fitter.fit(train_folds_corpi)

loading folds
getting grid
getting grid values for n-grams of size 3
getting grid values for n-grams of size 4
getting grid values for n-grams of size 5
Fitting 8 grid points
weights = [1, 1, 1, 1]
score = 0.6197858988556663
weights = [1, 1, 1, 858.0]
score = 0.9387043189368771
weights = [1, 1, 858.0, 1]
score = 0.952390180878553
weights = [1, 1, 858.0, 858.0]
score = 0.9545588778146918
weights = [1, 858.0, 1, 1]
score = 0.8769564414913253
weights = [1, 858.0, 1, 858.0]
score = 0.9193613879660392
weights = [1, 858.0, 858.0, 1]
score = 0.9198320413436691
weights = [1, 858.0, 858.0, 858.0]
score = 0.9380490956072352


In [15]:
suggester.weights

[1, 1, 858.0, 858.0]

In [16]:
print("Baseline training set score: {}".format(fitter.baseline_score(suggester, train_corpus)))
print("Training set score: {}".format(fitter.score(suggester, train_corpus)))
print("Baseline test set score: {}".format(fitter.baseline_score(suggester, test_corpus)))
print("Test set score: {}".format(fitter.score(suggester, test_corpus)))

Baseline training set score: 0.16802194699733278
Training set score: 0.3706936261580867
Baseline test set score: 0.16669433836958325
Test set score: 0.2216431216568372


In [17]:
for sentence in test_examples:
    print("Sentence: {}".format(sentence))
    print("Suggestions: {}".format(suggester.suggest(sentence.lower().split(), 5)))
    print()

Sentence: I am going to the
Suggestions: [('world', 380), ('same', 320), ('most', 310), ('other', 310), ('whale', 290)]

Sentence: He is going to the
Suggestions: [('world', 380), ('same', 320), ('most', 310), ('other', 310), ('whale', 290)]

Sentence: going to the
Suggestions: [('world', 380), ('same', 320), ('most', 310), ('other', 310), ('whale', 290)]

Sentence: to the
Suggestions: [('world', 380), ('same', 320), ('most', 310), ('other', 310), ('whale', 290)]

Sentence: the
Suggestions: [('world', 365), ('same', 310), ('most', 300), ('other', 280), ('whale', 275)]

Sentence: we need
Suggestions: [('not', 30), ('of', 20), ('to', 20), ('some', 15), ('a', 15)]

Sentence: i think you should meet
Suggestions: [('you', 15), ('with', 10), ('him', 10), ('the', 10), ('his', 5)]

Sentence: should meet
Suggestions: [('you', 15), ('with', 10), ('him', 10), ('the', 10), ('his', 5)]

Sentence: meet
Suggestions: [('you', 15), ('with', 10), ('him', 10), ('the', 10), ('his', 5)]

Sentence: Do you h