In [2]:
import nltk
from nltk.corpus import gutenberg
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import gutenberg




In [3]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

KingJames = gutenberg.raw("bible-kjv.txt")

tokenizer = TreebankWordTokenizer()
#I still have trouble with the other tokenizer. I think I'll probably stick with this tokenizer for the year.


tokens = tokenizer.tokenize(KingJames)

tok_lower= [token.lower() for token in tokens]

# Filter out punctuation and stopwords we dont want them influencing the counts later on
stop_words = set(stopwords.words('english'))
filt_tok = [word for word in tok_lower if word.isalpha() and word not in stop_words]


In [5]:
# Bigram Finder This basically does a computation P(x,y)/(P(X)*P(y))
bigram_finder = BigramCollocationFinder.from_words(filt_tok)

# frequency filter wont consider any with less than 3
bigram_finder.apply_freq_filter(3)

# Raw Frequency Score This one is biagrams that occur most frequently
raw_freq_bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 20)  

# PMI Score This one uses the formula from earlier and goes from highest probaility of occuring together
# vs just the highest frequency biagram

pmi_bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, 20) 


In [6]:
print("Top 20 Bigrams raw frequency:")
for bigram in raw_freq_bigrams:
    print(bigram)

print("\nTop 20 bigrams by PMI:")
for bigram in pmi_bigrams:
    print(bigram)

Top 20 Bigrams raw frequency:
('said', 'unto')
('thou', 'shalt')
('lord', 'god')
('ye', 'shall')
('thou', 'hast')
('saith', 'lord')
('children', 'israel')
('unto', 'lord')
('came', 'pass')
('thus', 'saith')
('shall', 'come')
('unto', 'thee')
('say', 'unto')
('lord', 'thy')
('thy', 'god')
('lord', 'hath')
('thou', 'art')
('lord', 'shall')
('every', 'one')
('thee', 'thou')

Top 20 bigrams by PMI:
('halah', 'habor')
('hena', 'ivah')
('ikkesh', 'tekoite')
('alpha', 'omega')
('chancellor', 'shimshai')
('zophar', 'naamathite')
('bildad', 'shuhite')
('blasting', 'mildew')
('geshurites', 'maachathites')
('sepharvaim', 'hena')
('abishag', 'shunammite')
('chastised', 'whips')
('hammedatha', 'agagite')
('nepheg', 'japhia')
('sardius', 'topaz')
('ahinoam', 'jezreelitess')
('cornet', 'flute')
('doeg', 'edomite')
('grain', 'mustard')
('alabaster', 'box')


For the first part, the raw frequencies make sense. For example, phrases like "Children of Israel" occur frequently in the Bible, but "Children" does not always occur with "Israel." You might also see "Children of Abraham" or "Children of ..." in other contexts. However, PMI highlights the strength of association between specific words, giving us higher probabilities for words that are strongly connected, such as "blasting" (crops) and "mildew" (crops). These words often occur together in the context of a specific punishment, and they rarely appear alone or in unrelated contexts.

PMI provides insight into the likelihood of certain tokens (words) appearing together. This concept forms the foundation of generative language models (LLMs), which generate text based on the probabilities of word combinations. The model predicts the next token by evaluating the likelihood of it appearing alongside the previous tokens.