## Start by importing nltk and glob

In [4]:
import nltk, glob

## Creating two file name lists
* `glob.glob()` is used to create Bulgarian and Japanese essay file name lists. 

In [8]:
bfnames = glob.glob('corpus/B*.txt')
jfnames = glob.glob('corpus/J*.txt')

In [10]:
print(bfnames)
print(jfnames)

['corpus/BGSU1009.txt', 'corpus/BGSU1008.txt', 'corpus/BGSU1006.txt', 'corpus/BGSU1010.txt', 'corpus/BGSU1007.txt', 'corpus/BGSU1001.txt', 'corpus/BGSU1005.txt', 'corpus/BGSU1003.txt', 'corpus/BGSU1004.txt', 'corpus/BGSU1002.txt']
['corpus/JPSW1003.txt', 'corpus/JPSW1010.txt', 'corpus/JPSW1004.txt', 'corpus/JPSW1005.txt', 'corpus/JPSW1006.txt', 'corpus/JPSW1002.txt', 'corpus/JPSW1009.txt', 'corpus/JPSW1007.txt', 'corpus/JPSW1008.txt', 'corpus/JPSW1001.txt']


## Build a list of tokenized words, Bulgarian side

In [13]:
btokens = []
for f in bfnames:
    txt = open(f).read()
    tokens = nltk.word_tokenize(txt)
    btokens += tokens

In [15]:
len(btokens)

5800

In [16]:
btokens

['The',
 'world',
 'we',
 'live',
 'in',
 'is',
 'obsessed',
 'with',
 'the',
 'achievements',
 'of',
 'science',
 ',',
 'technology',
 'and',
 'industrialization',
 '.',
 'The',
 'modern',
 'civilization',
 'has',
 'created',
 'incommunicative',
 ',',
 'impatient',
 ',',
 'narrow-minded',
 'people',
 'that',
 'do',
 'not',
 'care',
 'for',
 'the',
 'history',
 'but',
 'only',
 'for',
 'their',
 'well-being',
 '.',
 'The',
 'fact',
 'is',
 'that',
 'we',
 'have',
 'ceased',
 'to',
 'be',
 'spiritual',
 'and',
 'simple',
 'beings',
 'and',
 'have',
 'turned',
 'into',
 'too',
 'practical',
 ',',
 'down-to-earth',
 'people',
 '.',
 'That',
 'is',
 'why',
 'there',
 'is',
 'no',
 'longer',
 'place',
 'for',
 'imagination',
 'and',
 'dreaming',
 'in',
 'our',
 'present',
 'life',
 '.',
 'According',
 'to',
 'me',
 ',',
 'the',
 'modern',
 'technologically',
 'dominated',
 'world',
 'has',
 'made',
 'us',
 'feel',
 'more',
 'isolated',
 'and',
 'more',
 'desperate',
 'because',
 'we',
 'hav

## Build a list of tokenized words, Japanese side

In [18]:
jtokens = []
for f in jfnames:
    txt = open(f).read()
    tokens = nltk.word_tokenize(txt)
    jtokens += tokens

In [20]:
len(jtokens)

4979

In [21]:
jtokens

['I',
 'think',
 'to',
 'master',
 'English',
 'as',
 'a',
 'second',
 'language',
 'is',
 'important',
 'for',
 'us',
 '.',
 'When',
 'I',
 'was',
 'child',
 'or',
 'junior',
 'high',
 'school',
 'student',
 ',',
 'I',
 'thought',
 'people',
 'who',
 'can',
 'speak',
 'English',
 'were',
 'wise',
 ',',
 'special',
 'people',
 '.',
 'but',
 'the',
 'society',
 'go',
 'today',
 'is',
 'running',
 'internationalization',
 '.',
 'To',
 'master',
 'English',
 'is',
 'common',
 ',',
 'English',
 'is',
 ',',
 'as',
 'it',
 'were',
 'common',
 'language',
 'in',
 'the',
 'world',
 '.',
 'There',
 'is',
 'a',
 'TV',
 'commercial',
 'of',
 'English',
 'conversation',
 'school',
 '.',
 'In',
 'the',
 'commercial',
 'an',
 'actors',
 'says',
 ',',
 '``',
 'if',
 'you',
 'can',
 'speak',
 'English',
 ',',
 'you',
 'can',
 'talk',
 'with',
 'a',
 'billion',
 'people',
 '.',
 "''",
 'the',
 'words',
 'moved',
 'me',
 '.',
 'Yes',
 ',',
 'that',
 "'s",
 'right',
 '!',
 'In',
 'Japan',
 ',',
 'Japanes

## Build word frequency dictionaries

In [23]:
bfreq = nltk.FreqDist(btokens)

* Once a FreqDist (frequency distribution) object is built, you can easily explore it. 
* `FreqDist.most_common(20)`   --> prints out top 20 samples + counts
* `FreqDist.N()` --> prints out total # of

In [24]:
bfreq.most_common(20)

[('.', 273),
 ('the', 264),
 ('to', 178),
 (',', 176),
 ('of', 171),
 ('and', 150),
 ('in', 109),
 ('is', 106),
 ('a', 92),
 ('that', 84),
 ('are', 75),
 ('not', 65),
 ('with', 65),
 ('they', 61),
 ('their', 59),
 ('for', 55),
 ('people', 40),
 ('have', 40),
 ('be', 37),
 ('it', 36)]

In [26]:
jfreq = nltk.FreqDist(jtokens)

In [28]:
jfreq.most_common(20)

[('.', 334),
 ('English', 259),
 (',', 228),
 ('to', 174),
 ('is', 134),
 ('I', 130),
 ('the', 112),
 ('we', 80),
 ('a', 79),
 ('and', 78),
 ('in', 69),
 ('Japanese', 67),
 ('of', 63),
 ('language', 56),
 ('people', 56),
 ('that', 56),
 ('master', 53),
 ("n't", 52),
 ('think', 50),
 ('be', 48)]

In [29]:
dir(jfreq)

['B',
 'N',
 'Nr',
 '__add__',
 '__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__init__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__missing__',
 '__module__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_cumulative_frequencies',
 '_keep_positive',
 'clear',
 'copy',
 'elements',
 'freq',
 'fromkeys',
 'get',
 'hapaxes',
 'items',
 'keys',
 'max',
 'most_common',
 'pformat',
 'plot',
 'pop',
 'popitem',
 'pprint',
 'r_Nr',
 'setdefault',
 'subtract',
 'tabulate',
 'unicode_repr',
 'update',
 'values']

In [37]:
help(jfreq.B)

Help on method B in module nltk.probability:

B() method of nltk.probability.FreqDist instance
    Return the total number of sample values (or "bins") that
    have counts greater than zero.  For the total
    number of sample outcomes recorded, use ``FreqDist.N()``.
    (FreqDist.B() is the same as len(FreqDist).)
    
    :rtype: int

