# 9. Diction



In [1]:
import os

# Collect the file names of both subcorpora in a single pass, leaving out
# the '.DS_Store' entry that may be present in each directory.
corpus1 = [name for name in os.listdir('Corpus') if name != '.DS_Store']
corpus2 = [name for name in os.listdir('Contemporary_authors') if name != '.DS_Store']

# Calculating frequencies

The code in the cell below reads in the full text of the files that are listed in `corpus1`. In this case, the first corpus consists of the six Jane Austen novels in the `Corpus` directory. Next, the text is tokenised, stopwords and non-alphanumeric tokens are removed, and the frequencies of all the remaining words are calculated. These frequencies are stored in a dictionary named `freq1`.

Once the first subcorpus has been processed, the code does the same for the texts in `corpus2`. The word frequencies are placed in a dictionary named `freq2`.

After running this code, the variable `full_text1` will contain the concatenated *complete* texts of all the files in `corpus1`. The dictionary named `freq1` will contain the frequencies of all the words in this full text, with the exception of stopwords.

The variables `full_text2` and `freq2` store the same type of information for the texts in `corpus2`.

In [2]:
from tdmh import *
from os.path import join
from nltk import word_tokenize

from nltk.corpus import stopwords

# Rebind the name `stopwords` from the imported NLTK corpus reader to the
# collection of English stopwords it returns; tokenise_remove_stopwords()
# reads this module-level name.
stopwords = stopwords.words('english')

# Directories that hold the files of the first and the second subcorpus
dir1 = 'Corpus'
dir2 = 'Contemporary_authors'


def tokenise_remove_stopwords(full_text):
    """Tokenise a text and keep only the lower-cased content words.

    The text is split into tokens with `word_tokenize`. Every token is
    lower-cased and stripped of surrounding whitespace. Tokens that are not
    fully alphanumeric (punctuation, hyphenated forms, etc.) and tokens that
    occur in the module-level `stopwords` collection are discarded.

    Parameters:
        full_text: the text to tokenise, as a single string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build the lookup set once per call, so that each membership test
    # does not have to scan the whole stopword collection
    stop_set = set(stopwords)

    new_list = []
    for w in word_tokenize(full_text):
        w = w.lower().strip()
        if w.isalnum() and w not in stop_set:
            new_list.append(w)
    return new_list


def _read_corpus(directory, filenames):
    """Return the contents of all `filenames` in `directory` as one string.

    A progress line is printed for every file, and a single space is
    appended after the contents of each file.
    """
    pieces = []
    for text in filenames:
        print('Reading ' + text + ' ... ')
        # NOTE(review): open() uses the platform's default encoding here;
        # confirm that this matches the encoding of all corpus files.
        with open( join( directory, text ) ) as file_handler:
            pieces.append( file_handler.read() + ' ' )
    return ''.join(pieces)


def _count_words(words):
    """Return a dictionary mapping each word in `words` to its frequency."""
    freq = dict()
    for w in words:
        freq[w] = freq.get(w, 0) + 1
    return freq


# First subcorpus: full text and word frequencies
full_text1 = _read_corpus(dir1, corpus1)
words = tokenise_remove_stopwords(full_text1)
freq1 = _count_words(words)

# Second (reference) subcorpus: full text and word frequencies
full_text2 = _read_corpus(dir2, corpus2)
words = tokenise_remove_stopwords(full_text2)
freq2 = _count_words(words)
    

Reading Mansfield_Park.txt ... 
Reading Sense_and_Sensibility.txt ... 
Reading Northanger_Abbey.txt ... 
Reading Persuasion.txt ... 
Reading Emma.txt ... 
Reading Pride_and_Prejudice.txt ... 
Reading 6593-0.txt ... 
Reading 3724-0.txt ... 
Reading 3409-8.txt ... 
Reading 27712.txt ... 
Reading 9539-readme.txt ... 
Reading 3719-0.txt ... 
Reading 12958.txt ... 
Reading 5826-8.txt ... 
Reading 15034-8.txt ... 
Reading 40619-8.txt ... 
Reading 3700-0.txt ... 
Reading 9525-readme.txt ... 
Reading 1147-0.txt ... 
Reading 19500-8.txt ... 
Reading 36256-8.txt ... 
Reading 7265-0.txt ... 
Reading 9534-readme.txt ... 
Reading 55212-0.txt ... 
Reading 5196-0.txt ... 
Reading 1865.txt ... 
Reading 9528-readme.txt ... 
Reading 8897.txt ... 
Reading 11936.txt ... 
Reading 619-8.txt ... 
Reading 58249-0.txt ... 
Reading 18645.txt ... 
Reading 12234.txt ... 
Reading 9531-readme.txt ... 
Reading 29000-8.txt ... 
Reading 2158.txt ... 
Reading 3767-0.txt ... 
Reading 27067.txt ... 
Reading 24103-8.txt .

In [3]:
# Print a short confirmation message; presumably used to check that the
# preceding cell has finished running
print('Done!')

Done!



##  Dunning's log likelihood

One of the statistical methods that can be used to find the words that are distinctive of a corpus is *Dunning's log likelihood*. In short, it analyses the distinctiveness of a word in one set of texts compared to the texts in a reference corpus, by calculating probabilities based on word frequencies. A good explanation of the formula can be found on the [wordHoard](https://wordhoard.northwestern.edu/userman/analysis-comparewords.html#loglike) website.

Using the frequencies that have been calculated above, the Dunning log likelihood scores are calculated in the cell below for all of the words that occur both in `corpus1` and in `corpus2`. The actual calculation takes place in a function named `log_likelihood()`. The scores that are calculated are all stored in a dictionary named `ll_scores`.

The formula that is implemented in the `log_likelihood()` function returns a number which can be either positive or negative. A positive score indicates that the word is relatively more frequent in the first corpus than in the second corpus. A negative score indicates that the occurrence of the word is more common in the second corpus. The tokens that are assigned the highest scores, in other words, are also the most distinctive of the first corpus.

The code below lists the 100 words with the highest positive log likelihood scores, i.e. the words that are most distinctive of the first corpus.

In [9]:
import math

def log_likelihood( word_count1 , word_count2, total1 , total_2 ):
    """Return the signed Dunning log likelihood (G2) score of a word.

    Parameters:
        word_count1: frequency of the word in the first corpus.
        word_count2: frequency of the word in the second corpus.
        total1: total number of tokens in the first corpus.
        total_2: total number of tokens in the second corpus.

    Returns:
        The G2 score as a float. The score is positive when the word is
        over-represented in the first corpus and negative when it is
        under-represented there (i.e. more typical of the second corpus).
        0.0 is returned when the word occurs in neither corpus.
    """
    a = word_count1
    b = word_count2
    c = total1
    # This has to be the parameter, not a module-level variable, so that the
    # function gives the right result for any pair of corpora
    d = total_2

    # A word that occurs in neither corpus is not distinctive of either
    if a + b == 0:
        return 0.0

    # Expected frequencies if the word were spread evenly over both corpora
    E1 = c*(a+b)/(c+d)
    E2 = d*(a+b)/(c+d)

    # By convention 0 * ln(0) counts as 0, which avoids math.log(0)
    term1 = a * math.log(a/E1) if a > 0 else 0.0
    term2 = b * math.log(b/E2) if b > 0 else 0.0
    G2 = 2*(term1 + term2)

    # The observed count in corpus 1 is below the expected count:
    # the word is more typical of corpus 2, so the score becomes negative
    if term1 < 0 or a == 0:
        G2 = -G2

    return G2



ll_scores = dict()

# Total number of counted tokens in each of the two corpora
total1 = sum( freq1.values() )
total2 = sum( freq2.values() )

# Scores are only calculated for the words that occur in both corpora
for word in freq1:
    if word in freq2:
        ll_scores[word] = log_likelihood( freq1[word] , freq2[word] , total1 , total2 )

# Number of words to print; named `max_words` so that the built-in
# function max() is not overwritten
max_words = 100

# Highest scores first: the words most distinctive of the first corpus
for i, word in enumerate( sortedByValue(ll_scores , ascending = False ) , start = 1 ):
    print( word , ll_scores[word] )
    if i == max_words:
        break

emma 4134.042121534741
crawford 3119.821241648213
marianne 2768.615054174358
elizabeth 2377.4497501492524
harriet 2251.1263099370685
catherine 2215.965992593432
elinor 2105.956509392492
anne 2058.618311234011
jane 2024.4417685894775
elton 1978.9507151852765
could 1941.0539434461912
weston 1922.9396082615553
edmund 1885.4112711323166
fanny 1685.9947335049712
woodhouse 1565.1085790505601
bertram 1379.91562154255
elliot 1365.2143203398275
bingley 1200.3737595082744
jennings 1151.4434134133244
tilney 1114.8439131072666
fairfax 1104.9407725940644
norris 983.2512245559708
wickham 957.6162553281514
sister 919.049837553523
bennet 886.7744885919558
must 878.6634980331087
mansfield 855.6334922815815
feelings 840.5327587472831
thomas 821.2384005075502
edward 814.4726521507177
willoughby 811.9225731382959
isabella 800.6203522768513
churchill 800.039603634581
collins 749.1563067824953
morland 748.5563347843636
thorpe 715.9695143861669
soon 713.4528647798059
brandon 691.4021327625734
lydia 677.64934

Words with negative log likelihood scores are more likely to appear in the reference corpus (i.e. the second corpus) than in the first corpus. 

The code below lists the 100 words with the most strongly negative scores.

In [10]:
# Number of words to print; named `max_words` so that the built-in
# function max() is not overwritten
max_words = 100

# Lowest scores first: the words most distinctive of the second corpus
for i, word in enumerate( sortedByValue(ll_scores ) , start = 1 ):
    print( word , ll_scores[word] )
    if i == max_words:
        break

lord -698.1556430971129
upon -542.5912241396121
pen -491.6974240497365
thou -473.3400254776525
says -469.7952064197819
old -455.70473376902623
king -431.3928879484596
cries -361.3240279370904
cecilia -352.92091719867125
thy -319.0388343844559
doctor -314.84138013692973
amelia -306.0648701810853
madame -303.12631218735066
harry -295.58331812878777
george -288.828522844759
queen -286.1196607849365
camilla -283.70029070455854
jones -276.8773515145193
de -270.200649245787
prince -269.40795848802827
french -260.29246394239703
madam -257.63997386971243
arthur -253.8320525385486
gentleman -202.3464406613511
men -200.5650638737169
reader -194.2371291469019
boy -192.6068291360287
english -185.9888479649926
man -179.0260632696793
master -177.35092028224742
duke -161.24449219029145
money -160.65950961569075
ellis -156.8358490018687
fellow -154.99964303940024
god -151.45244034936212
answered -150.779083252045
life -146.3837399841712
art -146.2936672967848
scarce -145.43715355262813
arms -138.93770

## Mann Whitney formula

In a [blogpost on identifying literary diction](https://tedunderwood.com/2011/11/09/identifying-the-terms-that-characterize-an-author-or-genre-why-dunnings-may-not-be-the-best-method/), Ted Underwood argues that Dunning's log likelihood function also has a number of disadvantages. It is sensitive to outliers, for example. He explains that the Mann Whitney ranks test can be a good alternative. 

To perform the Mann-Whitney ranks test, we first need to find all the words the two corpora to be compared have in common. Next, we need to divide the full texts of the two corpora to be compared into smaller chunks, all of the same size. These can be fragments of 500 words, for instance. Next, we need to count the number of times each word occurs in these chunks. Using these counts, we can determine whether the word is more frequent in corpus 1 than in corpus 2 (or vice versa). As a final step, we determine the total number of fragments in which the word is most frequent, both in the first and the second corpus. If it is found, using these steps, that a word is much more common in one of the two corpora, this word can be identified as a distinctive word. The Mann-Whitney ranks test really looks at occurrences across the whole corpus, and it neutralises the effect of exceptionally high counts in one or two of these chunks.

The Mann Whitney test can be performed in Python using the `mannwhitneyu()` method from the `scipy.stats` module. 

In [11]:
from scipy.stats import mannwhitneyu

## Make a list of all the words in each corpus: the full texts are
## tokenised, lower-cased and stripped of stopwords and non-alphanumeric
## tokens by tokenise_remove_stopwords()
words1 = tokenise_remove_stopwords(full_text1)
words2 = tokenise_remove_stopwords(full_text2)

def divide_into_chunks(words, length):
    """Split a list of words into consecutive chunks and count their words.

    Each chunk covers `length` consecutive words; the final chunk holds
    whatever words remain and may therefore be shorter.

    Returns a list with one dictionary per chunk, in which every word of
    that chunk is mapped to the number of times it occurs in the chunk.
    """
    chunks = []
    for start in range(0, len(words), length):
        tally = {}
        for token in words[start:start + length]:
            tally[token] = tally.get(token, 0) + 1
        chunks.append(tally)
    return chunks


# Size of the chunks, in words
length = 500
chunks1 = divide_into_chunks(words1, length)
chunks2 = divide_into_chunks(words2, length)

# all_words holds the union of the terms in both corpora; each term is
# mapped to the number of chunks in which it occurs
all_words = dict()
for chunk in chunks1 + chunks2:
    for word in chunk:
        all_words[word] = all_words.get(word, 0) + 1

# Midpoint of the range of the test statistic: half the number of
# (chunk from corpus 1, chunk from corpus 2) pairs
midpoint = len(chunks1) * len(chunks2) * 0.5

rho = dict()
for word in all_words:
    # Counts of the word in every chunk of corpus 1 and of corpus 2
    counts1 = [chunk.get(word, 0) for chunk in chunks1]
    counts2 = [chunk.get(word, 0) for chunk in chunks2]

    stat, pval = mannwhitneyu(counts1, counts2, alternative="two-sided")

    # The p-value is stored with a sign: it is made negative when the
    # statistic does not exceed the midpoint
    rho[word] = pval if stat > midpoint else 0 - pval


The words that are most distinctive in corpus 1 have a positive value. The smaller this value, the more distinctive the word.

In [16]:
print( "The following words are most distinctive in corpus 1:" )

# Number of words to print; named `max_words` so that the built-in
# function max() is not overwritten
max_words = 500
shown = 0

# Ascending order, so that the smallest positive values are listed first
for word in sortedByValue( rho ):
    if rho[word] > 0:
        print( f'{word}\t{rho[word]:.22f}' )
        shown += 1
        if shown == max_words:
            break

The following words are most distinctive in corpus 1:
woodhouse	0.0000000000000000000000
knightley	0.0000000000000000000000
bertram	0.0000000000000000000000
dashwood	0.0000000000000000000000
marianne	0.0000000000000000000000
darcy	0.0000000000000000000000
edmund	0.0000000000000000000000
elton	0.0000000000000000000000
mansfield	0.0000000000000000000000
hartfield	0.0000000000000000000000
weston	0.0000000000000000000000
jane	0.0000000000000000000000
feelings	0.0000000000000000000000
highbury	0.0000000000000000000000
fairfax	0.0000000000000000000000
could	0.0000000000000000000000
harriet	0.0000000000000000000000
rushworth	0.0000000000000000000000
norris	0.0000000000000000000000
jennings	0.0000000000000000000000
tilney	0.0000000000000000000000
catherine	0.0000000000000000000000
barton	0.0000000000000000000000
wickham	0.0000000000000000000000
longbourn	0.0000000000000000000000
isabella	0.0000000000000000000000
brandon	0.0000000000000000000000
wentworth	0.0000000000000000000000
sister	0.00000

The words that are most distinctive in corpus 2 have a negative value. 

In [17]:
print( "The following words are most distinctive in corpus 2:"  )

# Number of words to print; named `max_words` so that the built-in
# function max() is not overwritten
max_words = 500
shown = 0

# Descending order, so that the negative values closest to zero are listed first
for word in sortedByValue( rho , ascending = False ) :
    if rho[word] < 0:
        print( f'{word}: {rho[word]:.22f}' )
        shown += 1
        if shown == max_words:
            break

The following words are most distinctive in corpus 2:
upon: -0.0000000000000000000000
lord: -0.0000000000000000000000
old: -0.0000000000000000000000
says: -0.0000000000000000000000
ca: -0.0000000000000000000000
men: -0.0000000000000000000001
hath: -0.0000000000000000000006
reader: -0.0000000000000000000007
king: -0.0000000000000000000012
french: -0.0000000000000000000022
gentleman: -0.0000000000000000000207
cries: -0.0000000000000000000629
english: -0.0000000000000000001106
arms: -0.0000000000000000001969
master: -0.0000000000000000002874
fellow: -0.0000000000000000005172
man: -0.0000000000000000005436
life: -0.0000000000000000013460
boy: -0.0000000000000000017363
order: -0.0000000000000000017621
thou: -0.0000000000000000045887
scarce: -0.0000000000000000072362
thus: -0.0000000000000000115488
honest: -0.0000000000000000157644
wo: -0.0000000000000001474998
hand: -0.0000000000000002253913
art: -0.0000000000000003638055
royal: -0.0000000000000006033914
thee: -0.0000000000000008099392
heav

## Bibliography

* Dunning, Ted, 'Accurate Methods for the Statistics of Surprise and Coincidence', in *Computational Linguistics*, 19:1 (1993).
* Rayson, P. and Garside, R., 'Comparing corpora using frequency profiling', in *Proceedings of the workshop on Comparing Corpora, held in conjunction with the 38th annual meeting of the Association for Computational Linguistics (ACL 2000)* (2000)
* H. Mann and D. Whitney, 'On a Test of Whether one of Two Random Variables is Stochastically Larger than the Other', in *Ann. Math. Statist.*, 18:1 (1947). <https://doi.org/10.1214/aoms/1177730491>
* Adam Kilgarriff, *Comparing Corpora*, in *International Journal of Corpus Linguistics*, 6:1 (2001). <https://doi.org/10.1075/ijcl.6.1.05kil>

# Exercises

## Exercise 9.1

Can you compare the diction of *Pride and Prejudice* to that of *Ulysses* using the Mann-Whitney test?

In [None]:
# Directory from which both texts are read.
# NOTE(review): the name `dir` hides the built-in dir() function for the
# rest of the session.
dir = 'Corpus'

# Each corpus consists of a single text file in this exercise.
# NOTE(review): both files must be present in `dir`; the output earlier in
# this notebook lists the novel as 'Pride_and_Prejudice.txt' -- confirm the
# file names before running.
corpus1 = [ 'PrideandPrejudice.txt' ]
corpus2 = [ 'Ulysses.txt' ]


def tokenise_remove_stopwords(full_text):
    """Tokenise a text and keep only the lower-cased content words.

    The text is split into tokens with `word_tokenize`. Every token is
    lower-cased and stripped of surrounding whitespace. Tokens that are not
    fully alphanumeric (punctuation, hyphenated forms, etc.) and tokens that
    occur in the module-level `stopwords` collection are discarded.

    Parameters:
        full_text: the text to tokenise, as a single string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build the lookup set once per call, so that each membership test
    # does not have to scan the whole stopword collection
    stop_set = set(stopwords)

    new_list = []
    for w in word_tokenize(full_text):
        w = w.lower().strip()
        if w.isalnum() and w not in stop_set:
            new_list.append(w)
    return new_list


# Read the files of corpus1 first and those of corpus2 second; the contents
# of every file are followed by a single space
texts = []
for corpus in ( corpus1 , corpus2 ):
    pieces = []
    for text in corpus:
        print('Reading ' + text + ' ... ')
        with open( join( dir,text) ) as file_handler:
            pieces.append( file_handler.read() + ' ' )
    texts.append( ''.join(pieces) )

full_text1, full_text2 = texts

from scipy.stats import mannwhitneyu

## Make a list of all the words in each corpus: the full texts are
## tokenised, lower-cased and stripped of stopwords and non-alphanumeric
## tokens by tokenise_remove_stopwords()
words1 = tokenise_remove_stopwords(full_text1)
words2 = tokenise_remove_stopwords(full_text2)

def divide_into_chunks(words, length):
    """Split a list of words into consecutive chunks and count their words.

    Each chunk covers `length` consecutive words; the final chunk holds
    whatever words remain and may therefore be shorter.

    Returns a list with one dictionary per chunk, in which every word of
    that chunk is mapped to the number of times it occurs in the chunk.
    """
    chunks = []
    for start in range(0, len(words), length):
        tally = {}
        for token in words[start:start + length]:
            tally[token] = tally.get(token, 0) + 1
        chunks.append(tally)
    return chunks


# Size of the chunks, in words
length = 500
chunks1 = divide_into_chunks(words1, length)
chunks2 = divide_into_chunks(words2, length)

# all_words holds the union of the terms in both corpora; each term is
# mapped to the number of chunks in which it occurs
all_words = dict()
for chunk in chunks1 + chunks2:
    for word in chunk:
        all_words[word] = all_words.get(word, 0) + 1

# Midpoint of the range of the test statistic: half the number of
# (chunk from corpus 1, chunk from corpus 2) pairs
midpoint = len(chunks1) * len(chunks2) * 0.5

rho = dict()
for word in all_words:
    # Counts of the word in every chunk of corpus 1 and of corpus 2
    counts1 = [chunk.get(word, 0) for chunk in chunks1]
    counts2 = [chunk.get(word, 0) for chunk in chunks2]

    stat, pval = mannwhitneyu(counts1, counts2, alternative="two-sided")

    # The p-value is stored with a sign: it is made negative when the
    # statistic does not exceed the midpoint
    rho[word] = pval if stat > midpoint else 0 - pval
    
# Number of words to print per corpus; named `max_words` so that the
# built-in function max() is not overwritten
max_words = 25

print( f"\nThe following words are most distinctive in {corpus1}" )

shown = 0
# Ascending order, so that the smallest positive values are listed first
for word in sortedByValue( rho ):
    if rho[word] > 0:
        print( f'{word}' )
        shown += 1
        if shown == max_words:
            break


print( f"\nThe following words are most distinctive in {corpus2}" )

shown = 0
# Descending order, so that the negative values closest to zero are listed first
for word in sortedByValue( rho , ascending = False ) :
    if rho[word] < 0:
        print( f'{word}' )
        shown += 1
        if shown == max_words:
            break