In [2]:
import nltk
from nltk import FreqDist, bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import gutenberg, stopwords
import string

# Download necessary corpora
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
kjv_text = gutenberg.raw('bible-kjv.txt')
caesar_text = gutenberg.raw('shakespeare-caesar.txt')

In [9]:


import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# Preprocess function to tokenize using TreebankWordTokenizer, lowercase, and remove punctuation and stopwords
def preprocess(text):
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text.lower())  # Use Treebank tokenizer
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in tokens if word not in stop_words]
    return filtered_words

# Example: Assuming kjv_text and caesar_text are already loaded
kjv_words = preprocess(kjv_text)
caesar_words = preprocess(caesar_text)


In [10]:
# Frequency Distribution for both texts
kjv_freq = FreqDist(kjv_words)
caesar_freq = FreqDist(caesar_words)

# Top 50 words by frequency for both texts
print("Top 50 words in King James Bible by frequency:")
print(kjv_freq.most_common(50))

print("\nTop 50 words in Julius Caesar by frequency:")
print(caesar_freq.most_common(50))


Top 50 words in King James Bible by frequency:
[('shall', 9838), ('unto', 8995), ('lord', 7231), ('thou', 5466), ('thy', 4600), ('said', 3976), ('ye', 3974), ('god', 3905), ('thee', 3331), ('upon', 2745), ('man', 2635), ('king', 2403), ('son', 2305), ('hath', 2258), ('israel', 2232), ('came', 2087), ('people', 1947), ('one', 1939), ('come', 1897), ('house', 1863), ('children', 1780), ('also', 1715), ('shalt', 1616), ('land', 1578), ('men', 1539), ('day', 1537), ('let', 1511), ('go', 1446), ('saying', 1441), ('went', 1387), ('made', 1386), ('even', 1365), ('behold', 1323), ('hand', 1318), ('us', 1306), ('saith', 1256), ('therefore', 1237), ('every', 1236), ('things', 1084), ('hast', 1069), ('sons', 1064), ('make', 1054), ('say', 1049), ('father', 1036), ('may', 1027), ('david', 1002), ('great', 945), ('thine', 930), ('jesus', 924), ('among', 916)]

Top 50 words in Julius Caesar by frequency:
[('caesar', 177), ('brutus', 150), ('haue', 147), ('shall', 125), ('thou', 115), ('cassius', 79)

In [11]:


# Bigrams Frequency for both texts
kjv_bigrams = FreqDist(bigrams(kjv_words))
caesar_bigrams = FreqDist(bigrams(caesar_words))

# Top 50 bigrams by frequency for both texts
print("\nTop 50 bigrams in King James Bible by frequency:")
print(kjv_bigrams.most_common(50))

print("\nTop 50 bigrams in Julius Caesar by frequency:")
print(caesar_bigrams.most_common(50))

# Bigram Mutual Information for both texts
bigram_measures = BigramAssocMeasures()
kjv_finder = BigramCollocationFinder.from_words(kjv_words)
caesar_finder = BigramCollocationFinder.from_words(caesar_words)





Top 50 bigrams in King James Bible by frequency:
[(('said', 'unto'), 1697), (('thou', 'shalt'), 1250), (('lord', 'god'), 823), (('ye', 'shall'), 773), (('thou', 'hast'), 772), (('saith', 'lord'), 739), (('children', 'israel'), 581), (('unto', 'lord'), 573), (('came', 'pass'), 455), (('thus', 'saith'), 445), (('shall', 'come'), 434), (('unto', 'thee'), 424), (('say', 'unto'), 407), (('lord', 'thy'), 351), (('thy', 'god'), 332), (('lord', 'hath'), 331), (('thou', 'art'), 326), (('lord', 'shall'), 316), (('every', 'one'), 314), (('thee', 'thou'), 294), (('every', 'man'), 291), (('lord', 'said'), 284), (('spake', 'unto'), 279), (('shalt', 'thou'), 269), (('word', 'lord'), 259), (('came', 'unto'), 222), (('unto', 'moses'), 222), (('let', 'us'), 215), (('god', 'hath'), 214), (('answered', 'said'), 209), (('lord', 'hosts'), 208), (('unto', 'ye'), 203), (('son', 'man'), 193), (('shall', 'ye'), 192), (('house', 'lord'), 191), (('shall', 'go'), 188), (('god', 'israel'), 188), (('saith', 'unto')

In [12]:
# Apply frequency filter
kjv_finder.apply_freq_filter(5)
caesar_finder.apply_freq_filter(5)

# Top 50 bigrams by Mutual Information score for both texts
print("\nTop 50 bigrams in King James Bible by Mutual Information:")
print(kjv_finder.nbest(bigram_measures.pmi, 50))

print("\nTop 50 bigrams in Julius Caesar by Mutual Information:")
print(caesar_finder.nbest(bigram_measures.pmi, 50))


Top 50 bigrams in King James Bible by Mutual Information:
[('bildad', 'shuhite'), ('blasting', 'mildew'), ('abishag', 'shunammite'), ('ahinoam', 'jezreelitess'), ('grain', 'mustard'), ('fins', 'scales'), ('swarms', 'flies'), ('warp', 'woof'), ('cherethites', 'pelethites'), ('zorah', 'eshtaol'), ('untempered', 'morter'), ('engravings', 'signet'), ('cheweth', 'cud'), ('dathan', 'abiram'), ('flanks', 'caul'), ('zebah', 'zalmunna'), ('er', 'onan'), ('hushai', 'archite'), ('ruth', 'moabitess'), ('caul', 'liver'), ('eliphaz', 'temanite'), ('meshach', 'abednego'), ('principalities', 'powers'), ('shadrach', 'meshach'), ('cock', 'crow'), ('ahijah', 'shilonite'), ('menservants', 'maidservants'), ('filthy', 'lucre'), ('nadab', 'abihu'), ('openeth', 'matrix'), ('naboth', 'jezreelite'), ('barzillai', 'gileadite'), ('divideth', 'hoof'), ('reubenites', 'gadites'), ('hophni', 'phinehas'), ('badgers', 'skins'), ('tooth', 'tooth'), ('ill', 'favoured'), ('astonishment', 'hissing'), ('skins', 'dyed'), ('