In [5]:
import nltk
import re
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk import word_tokenize

## Subcorpus 1: The complete works of William Shakespeare

The text is downloaded in .txt format from archive.org: 
https://archive.org/details/completeworksofw00shakrich

The file is truncated to include only the first 9,848 lines because we have slow laptops. :-(

In [6]:
# Grab the text
# Read the whole (truncated) corpus into memory. The `with` block guarantees
# the file handle is closed as soon as the read finishes, even if it raises.
with open("data/literature-shakespeare-trunc.txt", "r") as shakespeareFile:
    shakespeareText = shakespeareFile.read()

# Split the raw text into tokens; punctuation marks become tokens of their own.
shakespeareTokenized = word_tokenize(shakespeareText)

1. The length (in words).

In [7]:
# Corpus length, measured as the number of tokens produced by word_tokenize.
# Punctuation marks are separate tokens, so they are included in this count.
shakespeareTokens = len(shakespeareTokenized)
print(shakespeareTokens)

49495


2. The lexical diversity.

In [8]:
# Lexical diversity = number of distinct tokens (types) / total number of tokens.
# Tokens are compared case-sensitively, so "And" and "and" are different types.
shakespeareVocabulary = set(shakespeareTokenized)
shakespeareTypes = len(shakespeareVocabulary)

# shakespeareTokens is the total token count computed in the previous cell.
shakespeareLexDiversity = shakespeareTypes / shakespeareTokens
print(shakespeareLexDiversity)

0.13181129407010808




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [9]:
def word_count(sentence):
    """Return the number of tokens word_tokenize finds in `sentence`.

    Punctuation marks count as tokens, so the figure is a token count
    rather than a strict word count.
    """
    return len(word_tokenize(sentence))


# Segment the raw text into sentences.
shakespeareSentences = nltk.sent_tokenize(shakespeareText)

print("The longest 'sentence' is:\n\n")

# The longest sentence is the one with the highest token count.
shakespeareLongestSentence = max(shakespeareSentences, key=word_count)
print(shakespeareLongestSentence)
print("\n\n")

shakespeareLongestSentenceLength = word_count(shakespeareLongestSentence)
print(f"The sentence has {shakespeareLongestSentenceLength} words.")


The longest 'sentence' is:


Pro, Ye elves of hills, brooks, standing lakes, 
and groves; 

And ye that on, the sands with printless foot 
Do chase the ebbing Neptune, and do fly him 
When he comes back; you demi-puppets that 
By moonshine do the green sour ringlets make, 
Whereoftkeewenotbites;andyouwhosepastime 
Is to make midnight mushrooms, that rejoice 
To hear the solemn curfew; by whose aid, — 
Weak masters thou^ ye be,— I haVe bedimm’d 
Thenoontidesun,call’dforththemutinous winds, 
And ’twixt the green sea and the azured vault 
Set roaring war: to the dread rattling thunder 
Have I given fire, and rifted Jove’s stout oak 
With his own bolt: the strong-based promontory 
Have I made shake: and by the spurs pluck’d up 
The pine and cedar: graves, at my command, 
Have waked their sleepers, oped, and let them 
forth 

By my so potent art.



The sentence has 173 words.



4. The top collocations.

In [10]:
# Wrap the token list in an nltk.Text object to get access to its
# corpus-exploration helpers.
shakespeareNLTKText = nltk.Text(shakespeareTokenized)
# Ask for 20 collocations; the method writes them to the cell output itself,
# so no print() call is needed here.
shakespeareNLTKText.collocations(num=20)

TWO GENTLEMEN; thou art; Sir Proteus; thou hast; Sir Thurio; thou
canst; Scene I.—; Sir Valentine; Sir John; Master Page; thou beest;
widow Dido; Thou liest; Wilt thou; pray thee; PERSONS REPRESENTED;
Pro- teus; John Falstaff; Thou hast; Dost thou


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [11]:
# Frequency distribution over every token in the subcorpus.
shakespeareFreqDist = nltk.probability.FreqDist(shakespeareNLTKText)


def startsWithLetter(word, letter):
    """Return True if `word` begins with `letter` in either upper or lower case."""
    return word.startswith((letter.lower(), letter.upper()))


def topWordsStartingWith(freqDist, letter, n=10):
    """Return the `n` most frequent tokens in `freqDist` that start with `letter`.

    Tokens are matched case-insensitively on their first character but are
    counted case-sensitively (e.g. "and" and "And" are separate entries).
    most_common() is used so the frequency ordering is explicit instead of
    depending on how the distribution happens to iterate.
    """
    matches = [
        word
        for word, _count in freqDist.most_common()
        if startsWithLetter(word, letter)
    ]
    return matches[:n]


# Per-vowel predicates, kept under their original names for any later cells
# that call them directly.
def startsWithA(word):
    """Return True if `word` starts with 'A' or 'a'."""
    return startsWithLetter(word, "a")


def startsWithE(word):
    """Return True if `word` starts with 'E' or 'e'."""
    return startsWithLetter(word, "e")


def startsWithI(word):
    """Return True if `word` starts with 'I' or 'i'."""
    return startsWithLetter(word, "i")


def startsWithO(word):
    """Return True if `word` starts with 'O' or 'o'."""
    return startsWithLetter(word, "o")


def startsWithU(word):
    """Return True if `word` starts with 'U' or 'u'."""
    return startsWithLetter(word, "u")


shakespeareTopTenWordsWithA = topWordsStartingWith(shakespeareFreqDist, "a")
shakespeareTopTenWordsWithE = topWordsStartingWith(shakespeareFreqDist, "e")
shakespeareTopTenWordsWithI = topWordsStartingWith(shakespeareFreqDist, "i")
shakespeareTopTenWordsWithO = topWordsStartingWith(shakespeareFreqDist, "o")
shakespeareTopTenWordsWithU = topWordsStartingWith(shakespeareFreqDist, "u")

print(f"Top ten words with 'a': {shakespeareTopTenWordsWithA}\n")
print(f"Top ten words with 'e': {shakespeareTopTenWordsWithE}\n")
print(f"Top ten words with 'i': {shakespeareTopTenWordsWithI}\n")
print(f"Top ten words with 'o': {shakespeareTopTenWordsWithO}\n")
print(f"Top ten words with 'u': {shakespeareTopTenWordsWithU}\n")

Top ten words with 'a': ['and', 'a', 'And', 'as', 'are', 'all', 'at', 'am', 'A', 'Ant']

Top ten words with 'e': ['Enter', 'else', 'Exit', 'er', 'Exeunt', 'ever', 'eyes', 'earth', 'even', 'end']

Top ten words with 'i': ['I', 'is', 'in', 'it', 'if', 'If', 'It', 'Is', 'In', 'indeed']

Top ten words with 'o': ['of', 'on', 'one', 'our', 'or', 'out', 'Out', 'Of', 'O', 'own']

Top ten words with 'u': ['upon', 'us', 'up', 'use', 'U', 'unto', 'Upon', 'under', 'Unless', 'Under']





6. A stemmed version of the longest sentence (extracted above in 3).


In [12]:
# Stem each token of the longest sentence with the Porter stemmer.
ps = PorterStemmer()
shakespeareLongestSentenceList = [
    ps.stem(token) for token in word_tokenize(shakespeareLongestSentence)
]

# Re-join with single spaces; punctuation tokens therefore end up
# surrounded by spaces in the printed result.
shakespeareStemmedSentence = " ".join(shakespeareLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")

print(shakespeareStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

pro , ye elv of hill , brook , stand lake , and grove ; and ye that on , the sand with printless foot do chase the eb neptun , and do fli him when he come back ; you demi-puppet that by moonshin do the green sour ringlet make , whereoftkeewenotbit ; andyouwhosepastim is to make midnight mushroom , that rejoic to hear the solemn curfew ; by whose aid , — weak master thou^ ye be , — i have bedimm ’ d thenoontidesun , call ’ dforththemutin wind , and ’ twixt the green sea and the azur vault set roar war : to the dread rattl thunder have i given fire , and rift jove ’ s stout oak with hi own bolt : the strong-bas promontori have i made shake : and by the spur pluck ’ d up the pine and cedar : grave , at my command , have wake their sleeper , ope , and let them forth by my so potent art .
