# Naive Bayes Example

In [3]:
from nltk.corpus import inaugural
from naive_bayes import Model, tokenize

Generate the data set

In [9]:
# Class labels: index 0 is Obama, index 1 is Trump.
cls = ['Obama', 'Trump']

# One inaugural address per speaker, read sentence by sentence. Each sentence
# is a list of token strings (see the sample printed at the end of the cell).
obama_sentences = inaugural.sents('2009-Obama.txt')
trump_sentences = inaugural.sents('2017-Trump.txt')

# Tag every sentence with its speaker, giving (tokens, label) pairs.
labelled_obama = [(sentence, cls[0]) for sentence in obama_sentences]
labelled_trump = [(sentence, cls[1]) for sentence in trump_sentences]

# Obama's pairs come first, so the last element is the final Trump sentence.
labelled_data = [*labelled_obama, *labelled_trump]

print(
    "This is an example of a labelled data point: {}".format(labelled_data[-1])
)

This is an example of a labelled data point: (['God', 'bless', 'America', '.'], 'Trump')


Initialize and train the model.

In [10]:
# Build the classifier from the (tokens, label) pairs and the list of class
# names. train() is called without arguments, so it presumably fits on the
# data handed to the constructor — confirm in naive_bayes.Model.
model = Model(labelled_data, cls)
model.train()

Log Priors

In [11]:
# Show the log prior of each class.
# NOTE(review): the printed values look like base-2 logs
# (2**-0.851 + 2**-1.166 is about 1) — confirm in naive_bayes.Model.
model.display_log_prior()

{'Obama': -0.8508565606941906, 'Trump': -1.16635838642212}


Vocabulary (display first 10 words...)

In [12]:
# Show 10 vocabulary entries; the argument limits how many are listed.
# NOTE(review): the order is neither alphabetical nor order of appearance. If
# the vocabulary is stored in a set, these 10 words may change between kernel
# sessions — confirm.
model.display_vocab(10)

['inventive', 'drawn', 'emanates', 'lives', 'broken', 'qualities', 'helps', 'forth', 'proclaim', 'body']


Words per class with repetition (i.e. every word said by Trump or Obama, duplicates included)

In [13]:
# Show the first 10 tokens of each class's concatenated document (all tokens
# attributed to that speaker, duplicates kept).
model.display_big_doc(10)

Obama ['My', 'fellow', 'citizens', ':', 'I', 'stand', 'here', 'today', 'humbled', 'by']
Trump ['Chief', 'Justice', 'Roberts', ',', 'President', 'Carter', ',', 'President', 'Clinton', ',']


Calculate the log likelihoods for each class (i.e. the log probability of a word given that it was said by Trump/Obama)

In [15]:
# Show (word, log likelihood) pairs for 10 vocabulary words, once per class.
# NOTE(review): the values are consistent with base-2 logs and add-one
# smoothing (words differ by exactly 1.0 where one count doubles) — confirm.
model.display_log_likelihood(10)

Obama [('inventive', -10.96108788999733), ('drawn', -10.96108788999733), ('emanates', -10.96108788999733), ('lives', -11.961087889997332), ('broken', -10.96108788999733), ('qualities', -10.96108788999733), ('helps', -10.96108788999733), ('forth', -10.376125389276176), ('proclaim', -10.96108788999733), ('body', -11.961087889997332)]
Trump [('inventive', -11.52845411076479), ('drawn', -11.52845411076479), ('emanates', -11.52845411076479), ('lives', -10.52845411076479), ('broken', -11.52845411076479), ('qualities', -11.52845411076479), ('helps', -11.52845411076479), ('forth', -11.52845411076479), ('proclaim', -11.52845411076479), ('body', -10.52845411076479)]


Initialize some test documents:

In [28]:
# Pre-tokenised sample sentences, one per speaker, laid out clause by clause.
# NOTE(review): the Obama sample opens exactly like the Obama training text
# (see the big-doc preview), and the Trump sample looks like a training
# sentence too — TODO confirm. Treat these as smoke tests, not held-out data.
trump_test = [
    'We', ',', 'the', 'citizens', 'of', 'America', ',',
    'are', 'now', 'joined', 'in', 'a', 'great', 'national', 'effort',
    'to', 'rebuild', 'our', 'country',
    'and', 'restore', 'its', 'promise',
    'for', 'all', 'of', 'our', 'people', '.',
]

obama_test = [
    'I', 'stand', 'here', 'today',
    'humbled', 'by', 'the', 'task', 'before', 'us', ',',
    'grateful', 'for', 'the', 'trust', 'you', 'have', 'bestowed', ',',
    'mindful', 'of', 'the', 'sacrifices', 'borne', 'by', 'our', 'ancestors', '.',
]


Ask the model to classify our test documents:

In [29]:
# Classify the Trump sample; test_doc returns the predicted class label.
model.test_doc(trump_test)

'Trump'

In [31]:
# Classify the Obama sample; test_doc returns the predicted class label.
model.test_doc(obama_test)

'Obama'

Interactive AI

In [50]:
# Interactive step: input() blocks until the user types something, so this
# cell cannot run unattended and the result depends on what was typed.
input_string = input("Please input a string: ")

Please input a string: humbled


In [51]:
# Convert the raw string with the project's tokenize helper before classifying;
# test_doc was given lists of token strings in the earlier cells.
input_doc = tokenize(input_string)
model.test_doc(input_doc)

'Obama'