In [None]:
from htrc_features import FeatureReader
import glob
from nltk import word_tokenize
import pandas as pd

In [None]:
paths = glob.glob('../data/classification/*bz2')
fr = FeatureReader(paths)

for vol in fr.volumes():
    print(vol.id, '\t', vol.language, '\t', vol.handle_url, '\t', vol.title[:40])

hvd.32044014292023 	 eng 	 http://hdl.handle.net/2027/hvd.32044014292023 	 Alice's adventures in Wonderland ; and, 
hvd.32044102860673 	 fre 	 http://hdl.handle.net/2027/hvd.32044102860673 	 Notre Dame de Paris. Abridged and edited
mdp.39015038910694 	 eng 	 http://hdl.handle.net/2027/mdp.39015038910694 	 Moby Dick,
pst.000029579440 	 eng 	 http://hdl.handle.net/2027/pst.000029579440 	 The adventures of Huckleberry Finn / by 
uiug.30112037882914 	 fre 	 http://hdl.handle.net/2027/uiug.30112037882914 	 Candide ou L'optimisme.
wu.89104415476 	 fre 	 http://hdl.handle.net/2027/wu.89104415476 	 Les liaisons dangereuses / Choderlos de 


Collect the token counts for french and english separately.

In [None]:
tl = vol.tokenlist(pages=False, pos=False).head().reset_index()
tl['language'] = vol.language
tl[['token', 'count', 'language']].head()

Unnamed: 0,token,count,language
0,!,573,fre
1,!..i,1,fre
2,!je,1,fre
3,"""",12,fre
4,"""de",1,fre


In [None]:
book_dfs = []
classes_count = {'eng': 0, 'fre': 0}

for vol in fr.volumes():
    tl = vol.tokenlist(pages=False, pos=False, case=False).reset_index()
    classes_count[vol.language] += 1
    tl['language'] = vol.language
    book_dfs.append(tl[['lowercase', 'count', 'language']])

In [None]:
corpus = (pd.concat(book_dfs)
            .groupby(by=['language', 'lowercase']).sum()
          )

In [None]:
# P(c)
p_c = pd.Series(classes_count) / len(paths)
p_c

eng    0.5
fre    0.5
dtype: float64

Next, we want to sum up the counts for the entire class, so each language x word only has one, total sum:

In [None]:
corpus['P(w|c)'] = corpus.groupby(level='language').transform(lambda word: word / word.sum())['count']
corpus.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,P(w|c)
language,lowercase,Unnamed: 2_level_1,Unnamed: 3_level_1
eng,!,2230,0.00657
eng,!',1,3e-06
eng,!1,1,3e-06
eng,!33,1,3e-06
eng,!«lm,1,3e-06


In [None]:
corpus.loc[('eng')].sort_values('count', ascending=False).head(20)

Unnamed: 0_level_0,count,P(w|c)
lowercase,Unnamed: 1_level_1,Unnamed: 2_level_1
",",22737,0.066985
the,15962,0.047025
and,11356,0.033456
.,10732,0.031617
"""",7446,0.021936
to,6611,0.019476
a,6529,0.019235
of,5857,0.017255
i,5581,0.016442
it,5077,0.014957


For estimating P(w|c), divide each per-class count by the total words in that class.

In [None]:
string_to_classify = "bonjour"
relevant_tokens = word_tokenize(string_to_classify.lower())
relevant_tokens

['bonjour']

In [None]:
classified = (corpus.loc[(slice(None), relevant_tokens),]
                    .groupby(level='language')['P(w|c)'].prod()
            )
classified

language
eng    7.588625e-19
fre    1.814643e-22
Name: P(w|c), dtype: float64

Now, though it doesn't matter when the same classes were seen equally, remember to multiple by P(c):

In [None]:
classified * p_c

language
eng    3.794313e-19
fre    9.073217e-23
dtype: float64

Sort that, to make it more clear:

In [None]:
(classified * p_c).sort_values(ascending=False)

language
eng    3.794313e-19
fre    9.073217e-23
dtype: float64