In [2]:
import nltk
import os

In [4]:
corpus_files = sorted(os.listdir('corpora'))


In [5]:
def tokenise(corpus):
    return [s.split(' ') for s in corpus.split("\n")]

def get_unigrams(corpus):
    unigrams = {}
    for sentence in corpus:
        for word in sentence:
            word = word.lower()
            if word not in unigrams:
                unigrams[word] = 0
            unigrams[word] += 1
    return unigrams

def sort_dict(d, reverse = True):
    return sorted(d.items(), key = lambda x : x[1], reverse = reverse)

In [6]:
corpora = {f : tokenise(open('corpora/'+f).read()) for f in corpus_files}
all_unigrams = {author : get_unigrams(corpus) for author, corpus in corpora.items()}

for author, corpus in corpora.items():
    print(author, len(corpus), len(all_unigrams[author]))


twain.txt 12329 13838
tolstoy.txt 66056 40132
dickens.txt 20423 21142
doyle.txt 13053 14544
austen.txt 13428 13128


In [7]:
def merge_unigrams(unigrams):
    all_words = set([word for author in unigrams for word in unigrams[author]])
    for word in all_words:
        for author in unigrams:
            if word not in unigrams[author]:
                unigrams[author][word] = 0
    print("Total unique words", len(all_words))
    return all_words

In [8]:
from ipy_table import *
all_words = merge_unigrams(all_unigrams)
utable = [(word, [all_unigrams[author][word] for author in all_unigrams]) for word in all_words]
sorted_utable = sorted(utable, key = lambda x: sum(x[1]), reverse=True)
make_table(sorted_utable[:10])

Total unique words 66481


0,1
the,"[4862, 8239, 5704, 4479, 34258]"
and,"[6107, 6746, 2882, 3398, 21396]"
to,"[2911, 5063, 2719, 4169, 16500]"
,"[2736, 4456, 3146, 2777, 16769]"
of,"[1746, 4481, 2756, 3680, 14904]"
a,"[2949, 4022, 2648, 1982, 10388]"
in,"[1387, 2948, 1757, 1890, 8733]"
i,"[2476, 5674, 2533, 1740, 3226]"
he,"[1444, 2026, 1278, 1248, 9298]"
was,"[1942, 2691, 1370, 1797, 7200]"


In [9]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)
sorted_utable = sorted_utable[:500]
authors = list(all_unigrams.keys())
get_v_for_auth = lambda x : list(zip(*list(zip(*sorted_utable))[1]))[authors.index(x)]

iplot([ 
    Bar({"x" : list(zip(*sorted_utable))[0], "y": get_v_for_auth(author)}, name=author) for author in all_unigrams
])

In [10]:
def convert_to_probs(unigrams):
    uprobs = {}
    for author in unigrams:
        V = len(list(unigrams[author].keys()))
        N = sum(list(unigrams[author].values()))
        uprobs[author] = {}
        for word in unigrams[author]:
            #uprobs[author][word] = unigrams[author][word] / float(N)
            uprobs[author][word] = (unigrams[author][word] + 1)/ (float(N) + V)
    return uprobs
        

In [11]:
uprobs = convert_to_probs(all_unigrams)
uptable = [(word, [uprobs[author][word] for author in uprobs]) for word in all_words]
sorted_uptable = sorted(uptable, key = lambda x: sum(x[1]), reverse=True)
make_table(sorted_uptable[:10])

0,1
the,"[0.026510897652561684, 0.052741937222022944, 0.031893235073269285, 0.03220252878753669, 0.023110652566417332]"
and,"[0.03329807996336557, 0.03294081061150719, 0.026114521485357752, 0.016273425152404607, 0.017534175909208152]"
to,"[0.01587491958960716, 0.025403389068583453, 0.019600405632407242, 0.01535335290133213, 0.021511477946866135]"
,"[0.014920897979654808, 0.025817516191754715, 0.017250988922519566, 0.017763603522239782, 0.014330668042300748]"
of,"[0.009523861443352922, 0.022946337438169588, 0.017347752378445748, 0.015562203657710544, 0.018988908950219242]"
a,"[0.016082078567768245, 0.015993928188201535, 0.015571175327641061, 0.014952585233687062, 0.010229558937322672]"
i,"[0.013503494444868454, 0.004967985972021017, 0.02196530449524311, 0.0143034545044028, 0.008981171008511736]"
in,"[0.00756675425493638, 0.013446045701776129, 0.011414217261052321, 0.00992323323549334, 0.009754965179262316]"
was,"[0.010592365646499558, 0.011085982951510179, 0.01041948893413118, 0.0077387672160758635, 0.009275212793396957]"
he,"[0.007877492722178005, 0.014315866611039183, 0.007845581006494764, 0.007219462632648453, 0.006443126128449833]"


In [14]:
sorted_uptable = sorted_uptable[:500]
authors = list(all_unigrams.keys())
get_v_for_auth = lambda x : list(zip(*list(zip(*sorted_uptable))[1]))[authors.index(x)]

iplot([ 
    Bar({"x" : list(zip(*sorted_uptable))[0], "y": get_v_for_auth(author)}, name=author) for author in all_unigrams
])

In [15]:
def sentence_prob(sentence):
    probs = []
    for author in uprobs:
        p = 1.0
        for word in sentence:
            p *= uprobs[author][word]
        probs += [(author, p)]
    return probs

In [16]:
sentence = 'this is a bright day'.split(' ')
sprobs = sentence_prob(sentence)
print(sprobs)
iplot([ 
    Scatter({"x" : list(zip(*sprobs))[0], "y": list(zip(*sprobs))[1]})
])

iplot([ 
    Bar({"x" : sentence, "y": [uprobs[author][w] for w in sentence]}, name=author) for author in all_unigrams
])

[('twain.txt', 2.2651895590185623e-16), ('tolstoy.txt', 1.072347368624062e-14), ('dickens.txt', 4.255603163224632e-15), ('doyle.txt', 4.698404021611878e-15), ('austen.txt', 3.930048527783163e-16)]


## We have
$$ P(word\,|\,class) = \frac{c(word\,in\,class)}{c(all\,words\,in\,that\,class)}$$
## What we need
$$ P(class\,|\,word) $$
$$ \rightarrow \frac{P(word\,|\,class) * P(class)}{P(word)} $$
## How to choose a class ?
$$class = argmax_i\,P(class_i\,|\,word)$$
$$ \rightarrow argmax_i \, \frac{P(word\,|\,class_i) * P(class_i)}{P(word)} $$
$$ \rightarrow argmax_i \, P(word\,|\,class_i) * P(class_i) $$

In [17]:
def class_prob(corpora):
    probs = {}
    for author, corpus in corpora.items():
        probs[author] = sum([len(x) for x in corpus])
    M = sum(list(probs.values()))
    return {a : p/float(M) for a,p in probs.items()}

In [18]:
cprobs = class_prob(corpora)

In [19]:
sorted_cprobs = sort_dict(cprobs)
iplot([ 
    Scatter({"x" : list(zip(*sorted_cprobs))[0], "y": list(zip(*sorted_cprobs))[1]})
])


In [22]:
def nbestimate(sentence):
    probs = []
    sprob = 0
    for author in uprobs:
        p = 1.0
        for word in sentence:
            p *= uprobs[author][word] 
        sprob += p
        probs += [(author, p * cprobs[author])]
    return probs, sprob

In [27]:
nbprobs, sprob = nbestimate(sentence)

iplot([ 
    Bar({"x" : list(zip(*sprobs))[0], "y": list(zip(*sprobs))[1]}),
    Bar({"x" : list(zip(*nbprobs))[0], "y": [i for i in list(zip(*nbprobs))[1]]})
])
nbmax = max(nbprobs, key= lambda x : x[1])
smax = max(sprobs, key= lambda x : x[1])
print(nbmax, smax)

('tolstoy.txt', 5.533489318228795e-15) ('tolstoy.txt', 1.072347368624062e-14)


In [159]:
import random
nprobs = {}
for auth, c in corpora.items():
    clen = len(corpora[auth])
    rsentence = corpora[auth][random.randrange(clen)]
    rsentence = [w.lower() for w in rsentence]
    print(auth, rsentence)
    nbprobs = nbestimate(rsentence)
    nprobs[auth] = nbprobs
    iplot([ 
        Scatter({"x" : list(zip(*nbprobs))[0], "y": list(zip(*nbprobs))[1]}, name=auth) 
    ])
# iplot([ 
#     Bar({"x" : list(zip(*D))[0], "y": list(zip(*D))[1]}, name=auth) for auth, D in nprobs.items()
# ])

tolstoy.txt ['ask', 'forgiveness.']


doyle.txt ['"what!', 'a', 'murderous', 'attack?"']


twain.txt ['and', 'this', '‘n', '‘ll', 'come', 'out', 'all', 'right.', '\xa0what’s', 'the', 'matter', 'with', '‘em?”']


austen.txt ['their', 'journey', 'was', 'performed', 'without', 'much', 'conversation,', 'or', 'any', 'alarm;', 'and']


dickens.txt ['was', 'when', 'i', 'ascended', 'it.']


In [30]:
import numpy
import random

In [31]:
def generate():
    sentence_len = random.randrange(3, 15)
    class_counts = numpy.random.multinomial(sentence_len, list(cprobs.values()))
    word_classes = [i for i,x in enumerate(class_counts) for y in range(x)]
    random.shuffle(word_classes)
    print(class_counts, word_classes)
    for cls in word_classes:
        #print(cls, list(cprobs.keys()))
        cname = list(cprobs.keys())[cls - 1]        
        W = numpy.random.multinomial(10, list(uprobs[cname].values()))
        windex = numpy.argmax(W)
        wrd = list(uprobs[cname].keys())[windex]
        print("{wrd}/{cname} ".format(**locals()), end='')
    

In [32]:
generate()

[0 3 1 1 0] [3, 1, 2, 1, 1]
of/dickens.txt on/twain.txt dozen/tolstoy.txt come/twain.txt her/twain.txt 

In [None]:
#import os
import kenlm

model_dir = 'models/'
models = {m : kenlm.Model(model_dir + m) for m in os.listdir(model_dir) if m.endswith('.bin')}
starting_sentence = "this is a"

for author, model in models.items():
    print(author, '{0}-gram model'.format(model.order))
    sc = model.score(starting_sentence)
    #print(sc)
    desired_length = 20
    current_len = len(starting_sentence.split(' '))
    for i in range(desired_length - current_len):
        wds = list(all_unigrams[author[:-4] + '.txt'].keys())
        probs = []
        for word in wds:
            candidate_sentence = starting_sentence + " " + word
            cs = model.score(candidate_sentence)
            probs.append(cs)
        idx = numpy.argmax(probs)
        print(idx)
