## Start by importing nltk and glob

In [1]:
import nltk, glob

## Create two file name lists
* `glob.glob()` is used to create Bulgarian and Japanese essay file name lists. 

In [2]:
# One glob pattern per group: Bulgarian essay files start with 'B', Japanese with 'J'.
# map() applies glob.glob to each pattern in turn; unpacking keeps the two lists separate.
bfnames, jfnames = map(glob.glob, ('corpus/B*.txt', 'corpus/J*.txt'))

In [3]:
# Show both file name lists, one per line (Bulgarian first, then Japanese).
print(bfnames, jfnames, sep='\n')

['corpus/BGSU1009.txt', 'corpus/BGSU1008.txt', 'corpus/BGSU1006.txt', 'corpus/BGSU1010.txt', 'corpus/BGSU1007.txt', 'corpus/BGSU1001.txt', 'corpus/BGSU1005.txt', 'corpus/BGSU1003.txt', 'corpus/BGSU1004.txt', 'corpus/BGSU1002.txt']
['corpus/JPSW1003.txt', 'corpus/JPSW1010.txt', 'corpus/JPSW1004.txt', 'corpus/JPSW1005.txt', 'corpus/JPSW1006.txt', 'corpus/JPSW1002.txt', 'corpus/JPSW1009.txt', 'corpus/JPSW1007.txt', 'corpus/JPSW1008.txt', 'corpus/JPSW1001.txt']


## Build a list of tokenized words, Bulgarian side

In [4]:
# Accumulate the tokens of every Bulgarian essay into one flat list.
btokens = []
for f in bfnames:
    # `with` closes each file as soon as it has been read, so no file handles leak.
    with open(f) as fh:
        txt = fh.read()
    # Split the raw essay text into tokens (punctuation marks become tokens too).
    tokens = nltk.word_tokenize(txt)
    btokens.extend(tokens)

In [5]:
# Total number of tokens collected from the Bulgarian essays
len(btokens)

5800

In [6]:
# Display the token list itself (the output is long)
btokens

['The',
 'world',
 'we',
 'live',
 'in',
 'is',
 'obsessed',
 'with',
 'the',
 'achievements',
 'of',
 'science',
 ',',
 'technology',
 'and',
 'industrialization',
 '.',
 'The',
 'modern',
 'civilization',
 'has',
 'created',
 'incommunicative',
 ',',
 'impatient',
 ',',
 'narrow-minded',
 'people',
 'that',
 'do',
 'not',
 'care',
 'for',
 'the',
 'history',
 'but',
 'only',
 'for',
 'their',
 'well-being',
 '.',
 'The',
 'fact',
 'is',
 'that',
 'we',
 'have',
 'ceased',
 'to',
 'be',
 'spiritual',
 'and',
 'simple',
 'beings',
 'and',
 'have',
 'turned',
 'into',
 'too',
 'practical',
 ',',
 'down-to-earth',
 'people',
 '.',
 'That',
 'is',
 'why',
 'there',
 'is',
 'no',
 'longer',
 'place',
 'for',
 'imagination',
 'and',
 'dreaming',
 'in',
 'our',
 'present',
 'life',
 '.',
 'According',
 'to',
 'me',
 ',',
 'the',
 'modern',
 'technologically',
 'dominated',
 'world',
 'has',
 'made',
 'us',
 'feel',
 'more',
 'isolated',
 'and',
 'more',
 'desperate',
 'because',
 'we',
 'hav

## Build a list of tokenized words, Japanese side
* YOUR TURN: Try it out! 

In [7]:
jtokens = []  # Exercise: fill this with the tokens of the Japanese essays listed in jfnames

## QUESTION: comparing two token lists
* Which group writes longer essays -- Bulgarian or Japanese?

## Build word frequency dictionaries
* `nltk.FreqDist()` takes a list of tokenized words, returns a frequency dictionary

In [8]:
# Build one frequency distribution per group from its token list.
bfreq, jfreq = (nltk.FreqDist(words) for words in (btokens, jtokens))

#### Once a FreqDist (frequency distribution) object is built, you can easily explore it. 
* `FreqDist.most_common(20)`   --> returns top 20 samples + counts
* `FreqDist.N()` --> returns the total # of sample outcomes, i.e., word tokens (`FreqDist.B()` returns the # of word types)
* `FreqDist['the']` --> raw frequency count of word 'the'
* `FreqDist.freq('the')` --> relativized frequency of word 'the'
* `FreqDist.hapaxes()` --> returns a list of 'hapaxes', i.e., words that occur only once
* `dir(FreqDist)` --> directory-lists all available methods
* `help(FreqDist.N)` --> prints out detailed info on a method

In [9]:
# The 20 most frequent tokens with their counts (punctuation tokens are counted too)
bfreq.most_common(20)

[('.', 273),
 ('the', 264),
 ('to', 178),
 (',', 176),
 ('of', 171),
 ('and', 150),
 ('in', 109),
 ('is', 106),
 ('a', 92),
 ('that', 84),
 ('are', 75),
 ('with', 65),
 ('not', 65),
 ('they', 61),
 ('their', 59),
 ('for', 55),
 ('people', 40),
 ('have', 40),
 ('be', 37),
 ('it', 36)]

In [10]:
# Look up the built-in documentation for the B() method
help(bfreq.B)

Help on method B in module nltk.probability:

B() method of nltk.probability.FreqDist instance
    Return the total number of sample values (or "bins") that
    have counts greater than zero.  For the total
    number of sample outcomes recorded, use ``FreqDist.N()``.
    (FreqDist.B() is the same as len(FreqDist).)
    
    :rtype: int



## Questions: comparing token frequencies
* Which group demonstrates the larger number of word types?
* What are the top words used by Bulgarian students? What are the top words by Japanese students? Any noticeable difference?
* Which group uses 'the' more frequently -- Bulgarian or Japanese? 
* How about 'a'?

In [11]:
# Number of distinct tokens (word types) in the Bulgarian essays; same as len(bfreq)
print(bfreq.B())

1378


## Compute average word length, Bulgarian side
* Can be done using list comprehension! 
* A list of tokenized words can be transformed into a list of each word's length. 
* `sum()` function takes a list of numbers; returns the sum. 

In [12]:
# Map every token to its character length, keeping the original token order.
btoken_lens = list(map(len, btokens))

In [13]:
# The token list again, for comparison with the list of lengths shown next
btokens

['The',
 'world',
 'we',
 'live',
 'in',
 'is',
 'obsessed',
 'with',
 'the',
 'achievements',
 'of',
 'science',
 ',',
 'technology',
 'and',
 'industrialization',
 '.',
 'The',
 'modern',
 'civilization',
 'has',
 'created',
 'incommunicative',
 ',',
 'impatient',
 ',',
 'narrow-minded',
 'people',
 'that',
 'do',
 'not',
 'care',
 'for',
 'the',
 'history',
 'but',
 'only',
 'for',
 'their',
 'well-being',
 '.',
 'The',
 'fact',
 'is',
 'that',
 'we',
 'have',
 'ceased',
 'to',
 'be',
 'spiritual',
 'and',
 'simple',
 'beings',
 'and',
 'have',
 'turned',
 'into',
 'too',
 'practical',
 ',',
 'down-to-earth',
 'people',
 '.',
 'That',
 'is',
 'why',
 'there',
 'is',
 'no',
 'longer',
 'place',
 'for',
 'imagination',
 'and',
 'dreaming',
 'in',
 'our',
 'present',
 'life',
 '.',
 'According',
 'to',
 'me',
 ',',
 'the',
 'modern',
 'technologically',
 'dominated',
 'world',
 'has',
 'made',
 'us',
 'feel',
 'more',
 'isolated',
 'and',
 'more',
 'desperate',
 'because',
 'we',
 'hav

In [14]:
# Each number is the length of the token at the same position in btokens
btoken_lens

[3,
 5,
 2,
 4,
 2,
 2,
 8,
 4,
 3,
 12,
 2,
 7,
 1,
 10,
 3,
 17,
 1,
 3,
 6,
 12,
 3,
 7,
 15,
 1,
 9,
 1,
 13,
 6,
 4,
 2,
 3,
 4,
 3,
 3,
 7,
 3,
 4,
 3,
 5,
 10,
 1,
 3,
 4,
 2,
 4,
 2,
 4,
 6,
 2,
 2,
 9,
 3,
 6,
 6,
 3,
 4,
 6,
 4,
 3,
 9,
 1,
 13,
 6,
 1,
 4,
 2,
 3,
 5,
 2,
 2,
 6,
 5,
 3,
 11,
 3,
 8,
 2,
 3,
 7,
 4,
 1,
 9,
 2,
 2,
 1,
 3,
 6,
 15,
 9,
 5,
 3,
 4,
 2,
 4,
 4,
 8,
 3,
 4,
 9,
 7,
 2,
 4,
 4,
 3,
 5,
 10,
 4,
 3,
 4,
 3,
 3,
 7,
 4,
 3,
 6,
 4,
 8,
 2,
 1,
 3,
 13,
 3,
 2,
 2,
 3,
 3,
 2,
 10,
 2,
 3,
 4,
 2,
 4,
 6,
 8,
 4,
 5,
 1,
 10,
 3,
 11,
 1,
 2,
 1,
 3,
 7,
 1,
 10,
 3,
 8,
 2,
 7,
 6,
 5,
 2,
 3,
 3,
 1,
 8,
 4,
 9,
 6,
 3,
 10,
 5,
 1,
 6,
 2,
 6,
 4,
 3,
 7,
 3,
 12,
 5,
 7,
 2,
 3,
 14,
 5,
 4,
 2,
 3,
 4,
 3,
 4,
 3,
 6,
 2,
 5,
 5,
 4,
 4,
 1,
 4,
 3,
 4,
 6,
 3,
 9,
 2,
 3,
 3,
 8,
 2,
 2,
 2,
 6,
 4,
 3,
 5,
 13,
 12,
 2,
 4,
 7,
 1,
 2,
 3,
 4,
 3,
 4,
 2,
 4,
 6,
 2,
 10,
 1,
 5,
 6,
 7,
 5,
 8,
 2,
 4,
 13,
 3,
 2,
 7,
 5,
 12,
 4,
 3,
 6,


In [15]:
# sum() adds up the numbers in a list
sum([1, 2, 3, 4, 5])

15

In [16]:
# Dividing the sum by the number of items gives the average
sum([1, 2, 3, 4, 5]) / 5

3.0

## Compute average word length, Japanese side
* YOUR TURN: try it out! 

In [17]:
# Exercise: replace the empty list with the length of every token in jtokens
jtoken_lens = [] 

## Question: comparing average word lengths
* Which group uses longer words on average?  

## Compute average sentence length, Bulgarian side
* How long are Bulgarian sentences in terms of \# of words?
* We will assume: every sentence ends with either '.', '?', or '!'. 
* Then, the total count of '.', '?' and '!' is the total \# of sentences.
* Finally, the average sentence length is computed as the total \# of tokens divided by the total \# of sentences. 

In [18]:
# Sentence count = total occurrences of the three sentence-final punctuation tokens.
bsent_count = sum(bfreq[mark] for mark in ('.', '!', '?'))
print(bsent_count)

284


In [19]:
# Average sentence length = total number of tokens / total number of sentences
len(btokens) / bsent_count

20.422535211267604

## Compute average sentence length, Japanese side
* YOUR TURN: try it out!

In [20]:
# Exercise: replace 0 with the count of '.', '!' and '?' tokens in jfreq
jsent_count = 0

## Question: comparing average sentence lengths
* Which group writes longer sentences -- Bulgarian or Japanese? 

## Summary: Bulgarian vs. Japanese EFL writing
* How do the two groups compare in terms of:
> 1. Average essay length
> 2. Average word length
> 3. Average sentence length
* Any other noticeable traits?