In [1]:
import nltk
import re
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk import word_tokenize

|   Name (Last, First)   |   Student ID   |   Section contributed                                          |   Section edited                                   |   Other contributions                                                      |
|------------------------|----------------|----------------------------------------------------------------|----------------------------------------------------|----------------------------------------------------------------------------|
|    Jiang Long          |    200099436   |   Contributed to all sections.    (Typed on Jupyter notebook)  |   All sections                                     |   Submission of the text files and assignment files, Truncated text files  |
|    Antanila H.         |    301332035   |   Researched texts; contributed more to sections 2 and 3       |   Overall input in all sections, more on 2 and 3   |   Found code examples to adapt from the internet and lab assignments       |
|    Sava Savkovic       |    301397121   |   Contributed to all sections.                                 |   All sections                                     |   Found code examples to adapt from the internet and lab assignments       |

## Subcorpus 1: The complete works of William Shakespeare

The text is downloaded in .txt format from archive.org: 
https://archive.org/details/completeworksofw00shakrich

The file is truncated to include only the first 9,848 lines because we have slow laptops. :-(

In [2]:
# Grab the text
# The context manager closes the file handle as soon as the text has been read.
# NOTE(review): no encoding is passed, so decoding uses the platform default — confirm it matches the file.
with open("data/literature-shakespeare-trunc.txt", "r") as shakespeareFile:
    shakespeareText = shakespeareFile.read()

# Split the raw text into tokens (words and punctuation marks) with NLTK.
shakespeareTokenized = word_tokenize(shakespeareText)

1. The length (in words).

In [3]:
# Corpus length: the number of tokens word_tokenize produced for the Shakespeare text.
shakespeareTokens = len(shakespeareTokenized)
print(shakespeareTokens)

49497


2. The lexical diversity.

In [4]:
# Lexical diversity = number of distinct tokens (types) / total number of tokens.
shakespeareVocabulary = set(shakespeareTokenized)
shakespeareTypes = len(shakespeareVocabulary)
shakespeareLexDiversity = shakespeareTypes / shakespeareTokens
print(shakespeareLexDiversity)

0.1317655615491848




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [5]:
shakespeareSentences = nltk.sent_tokenize(shakespeareText)
print("The longest 'sentence' is:\n\n")


# Adapted from https://stackoverflow.com/questions/27652187/python-finding-the-longest-shortest-sentence-in-a-random-paragraph
def word_count(sentence):
    """Return the number of tokens word_tokenize finds in ``sentence``."""
    return len(word_tokenize(sentence))


# "Longest" is measured in tokens; default="" avoids a ValueError from max()
# when the sentence tokenizer returned nothing.
shakespeareLongestSentence = max(shakespeareSentences, key=word_count, default="")

print(shakespeareLongestSentence)
print("\n\n")
print(f"The sentence has {word_count(shakespeareLongestSentence)} words.")


The longest 'sentence' is:


Pro, Ye elves of hills, brooks, standing lakes, 
and groves; 

And ye that on, the sands with printless foot 
Do chase the ebbing Neptune, and do fly him 
When he comes back; you demi-puppets that 
By moonshine do the green sour ringlets make, 
Whereoftkeewenotbites;andyouwhosepastime 
Is to make midnight mushrooms, that rejoice 
To hear the solemn curfew; by whose aid, — 
Weak masters thou^ ye be,— I haVe bedimm’d 
Thenoontidesun,call’dforththemutinous winds, 
And ’twixt the green sea and the azured vault 
Set roaring war: to the dread rattling thunder 
Have I given fire, and rifted Jove’s stout oak 
With his own bolt: the strong-based promontory 
Have I made shake: and by the spurs pluck’d up 
The pine and cedar: graves, at my command, 
Have waked their sleepers, oped, and let them 
forth 

By my so potent art.



The sentence has 173 words.



4. The top collocations.

In [6]:
# Wrap the token list in an nltk.Text so its collocations() helper can be used.
shakespeareNLTKText = nltk.Text(shakespeareTokenized)
# collocations() prints its result itself; num=20 asks for 20 collocations.
shakespeareNLTKText.collocations(num=20)

TWO GENTLEMEN; thou art; Sir Proteus; thou hast; Sir Thurio; thou
canst; Scene I.—; Sir Valentine; Sir John; Master Page; thou beest;
widow Dido; Thou liest; Wilt thou; pray thee; PERSONS REPRESENTED;
Pro- teus; John Falstaff; Thou hast; Dost thou


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [7]:
shakespeareFreqDist = nltk.probability.FreqDist(shakespeareNLTKText)


def _startsWithLetter(word, letter):
    """Return True when ``word`` begins with ``letter`` in upper or lower case."""
    return word.startswith((letter.lower(), letter.upper()))


def _topWordsStartingWith(freqDist, letter, limit=10):
    """Return the first ``limit`` samples of ``freqDist`` that start with ``letter``.

    Samples are taken in the order ``freqDist`` yields them when iterated.
    NOTE(review): the recorded output suggests that order is most-frequent-first —
    confirm against the NLTK FreqDist documentation.
    """
    return [word for word in freqDist if _startsWithLetter(word, letter)][:limit]


def startsWithA(word):
    """Return True when ``word`` starts with 'A' or 'a'."""
    return _startsWithLetter(word, "a")


def startsWithE(word):
    """Return True when ``word`` starts with 'E' or 'e'."""
    return _startsWithLetter(word, "e")


def startsWithI(word):
    """Return True when ``word`` starts with 'I' or 'i'."""
    return _startsWithLetter(word, "i")


def startsWithO(word):
    """Return True when ``word`` starts with 'O' or 'o'."""
    return _startsWithLetter(word, "o")


def startsWithU(word):
    """Return True when ``word`` starts with 'U' or 'u'."""
    return _startsWithLetter(word, "u")


# Upper- and lower-case spellings are separate samples, so e.g. 'and' and 'And'
# can both appear in the same list.
shakespeareTopTenWordsWithA = _topWordsStartingWith(shakespeareFreqDist, "a")
shakespeareTopTenWordsWithE = _topWordsStartingWith(shakespeareFreqDist, "e")
shakespeareTopTenWordsWithI = _topWordsStartingWith(shakespeareFreqDist, "i")
shakespeareTopTenWordsWithO = _topWordsStartingWith(shakespeareFreqDist, "o")
shakespeareTopTenWordsWithU = _topWordsStartingWith(shakespeareFreqDist, "u")

for vowelLabel, topWords in (
    ("a", shakespeareTopTenWordsWithA),
    ("e", shakespeareTopTenWordsWithE),
    ("i", shakespeareTopTenWordsWithI),
    ("o", shakespeareTopTenWordsWithO),
    ("u", shakespeareTopTenWordsWithU),
):
    print(f"Top ten words with '{vowelLabel}': {topWords}\n")

Top ten words with 'a': ['and', 'a', 'And', 'as', 'are', 'all', 'at', 'am', 'A', 'Ant']

Top ten words with 'e': ['Enter', 'else', 'Exit', 'er', 'Exeunt', 'ever', 'eyes', 'earth', 'even', 'end']

Top ten words with 'i': ['I', 'is', 'in', 'it', 'if', 'If', 'It', 'Is', 'In', 'indeed']

Top ten words with 'o': ['of', 'on', 'one', 'our', 'or', 'out', 'Out', 'Of', 'O', 'own']

Top ten words with 'u': ['upon', 'us', 'up', 'use', 'U', 'unto', 'Upon', 'under', 'Unless', 'Under']





6. A stemmed version of the longest sentence (extracted above in 3).


In [8]:
# Stem every token of the longest sentence with the Porter stemmer.
ps = PorterStemmer()
shakespeareLongestSentenceList = [
    ps.stem(token) for token in nltk.word_tokenize(shakespeareLongestSentence)
]

# Tokens are re-joined with single spaces, which is why punctuation ends up
# surrounded by spaces in the printed result.
shakespeareStemmedSentence = " ".join(shakespeareLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")

print(shakespeareStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

pro , ye elv of hill , brook , stand lake , and grove ; and ye that on , the sand with printless foot do chase the eb neptun , and do fli him when he come back ; you demi-puppet that by moonshin do the green sour ringlet make , whereoftkeewenotbit ; andyouwhosepastim is to make midnight mushroom , that rejoic to hear the solemn curfew ; by whose aid , — weak master thou^ ye be , — i have bedimm ’ d thenoontidesun , call ’ dforththemutin wind , and ’ twixt the green sea and the azur vault set roar war : to the dread rattl thunder have i given fire , and rift jove ’ s stout oak with hi own bolt : the strong-bas promontori have i made shake : and by the spur pluck ’ d up the pine and cedar : grave , at my command , have wake their sleeper , ope , and let them forth by my so potent art .


## Subcorpus 2: PBS News Transcripts

The text is extracted from 1,000 news videos from PBS News Hour and Washington Week, available at https://www.zerotohero.ca/zh/en/show/talk/293

In [8]:
# Grab the text
# The context manager closes the file handle as soon as the text has been read.
# NOTE(review): no encoding is passed, so decoding uses the platform default — confirm it matches the file.
with open("data/news-1000-trunc.txt", "r") as newsFile:
    newsText = newsFile.read()

# Split the raw text into tokens (words and punctuation marks) with NLTK.
newsTokenized = word_tokenize(newsText)

1. The length (in words).

In [9]:
# Corpus length: the number of tokens word_tokenize produced for the news transcripts.
newsTokens = len(newsTokenized)
print(newsTokens)

137356


2. The lexical diversity.

In [10]:
# Lexical diversity = number of distinct tokens (types) / total number of tokens.
newsVocabulary = set(newsTokenized)
newsTypes = len(newsVocabulary)
newsLexDiversity = newsTypes / newsTokens
print(newsLexDiversity)

0.0805425318151373




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [11]:
newsSentences = nltk.sent_tokenize(newsText)


def word_count(sentence):
    """Return the number of tokens nltk.word_tokenize finds in ``sentence``."""
    return len(nltk.word_tokenize(sentence))


print("The longest 'sentence' is:\n\n")
# "Longest" is measured in tokens; default="" avoids a ValueError from max()
# when the sentence tokenizer returned nothing.
newsLongestSentence = max(newsSentences, key=word_count, default="")
print(newsLongestSentence)
print("\n\n")
print(f"The sentence has {word_count(newsLongestSentence)} words.")


The longest 'sentence' is:


NICK SCHIFRIN: Last month, the Pentagon made
public a 2020 report that admitted: “White supremacy and white nationalism pose a threat
to the good order and discipline within the military and individuals with extremist affiliations
and military experience are a concern to U.S. national security.” But it also concluded: “We believe we have
been effective at screening for individuals who possess or advocate extremist ideologies.” LECIA BROOKS, Chief of Staff, Southern Poverty
Law Center: We’re happy to see that the Pentagon agrees that there’s a problem, but we completely
disagree that they’re doing anything about it.



The sentence has 115 words.



4. The top collocations.

In [12]:
# Wrap the token list in an nltk.Text so its collocations() helper can be used.
newsNLTKText = nltk.Text(newsTokenized)
# collocations() prints its result itself; num=20 asks for 20 collocations.
newsNLTKText.collocations(num=20)

JUDY WOODRUFF; NICK SCHIFRIN; JOHN YANG; AMNA NAWAZ; United States;
YAMICHE ALCINDOR; White House; WILLIAM BRANGHAM; PBS NewsHour;
President Biden; PAUL SOLMAN; New York; SAM LAZARO; George Floyd; Hong
Kong; Biden administration; JEFFREY BROWN; Derek Chauvin; Supreme
Court; Judy Woodruff


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [13]:
newsFreqDist = nltk.probability.FreqDist(newsNLTKText)


def _startsWithLetter(word, letter):
    """Return True when ``word`` begins with ``letter`` in upper or lower case."""
    return word.startswith((letter.lower(), letter.upper()))


def _topWordsStartingWith(freqDist, letter, limit=10):
    """Return the first ``limit`` samples of ``freqDist`` that start with ``letter``.

    Samples are taken in the order ``freqDist`` yields them when iterated.
    NOTE(review): the recorded output suggests that order is most-frequent-first —
    confirm against the NLTK FreqDist documentation.
    """
    return [word for word in freqDist if _startsWithLetter(word, letter)][:limit]


def startsWithA(word):
    """Return True when ``word`` starts with 'A' or 'a'."""
    return _startsWithLetter(word, "a")


def startsWithE(word):
    """Return True when ``word`` starts with 'E' or 'e'."""
    return _startsWithLetter(word, "e")


def startsWithI(word):
    """Return True when ``word`` starts with 'I' or 'i'."""
    return _startsWithLetter(word, "i")


def startsWithO(word):
    """Return True when ``word`` starts with 'O' or 'o'."""
    return _startsWithLetter(word, "o")


def startsWithU(word):
    """Return True when ``word`` starts with 'U' or 'u'."""
    return _startsWithLetter(word, "u")


# Upper- and lower-case spellings are separate samples, so e.g. 'and' and 'And'
# can both appear in the same list.
newsTopTenWordsWithA = _topWordsStartingWith(newsFreqDist, "a")
newsTopTenWordsWithE = _topWordsStartingWith(newsFreqDist, "e")
newsTopTenWordsWithI = _topWordsStartingWith(newsFreqDist, "i")
newsTopTenWordsWithO = _topWordsStartingWith(newsFreqDist, "o")
newsTopTenWordsWithU = _topWordsStartingWith(newsFreqDist, "u")

for vowelLabel, topWords in (
    ("a", newsTopTenWordsWithA),
    ("e", newsTopTenWordsWithE),
    ("i", newsTopTenWordsWithI),
    ("o", newsTopTenWordsWithO),
    ("u", newsTopTenWordsWithU),
):
    print(f"Top ten words with '{vowelLabel}': {topWords}\n")

Top ten words with 'a': ['and', 'a', 'And', 'are', 'as', 'at', 'about', 'an', 'all', 'also']

Top ten words with 'e': ['even', 'every', 'end', 'early', 'election', 'everything', 'economy', 'especially', 'example', 'enough']

Top ten words with 'i': ['in', 'is', 'I', 'it', 'It', 'if', 'In', 'into', 'its', 'important']

Top ten words with 'o': ['of', 'on', 'out', 'or', 'one', 'our', 'other', 'over', 'only', 'own']

Top ten words with 'u': ['up', 'us', 'U.S.', 'United', 'use', 'under', 'until', 'University', 'understand', 'used']





6. A stemmed version of the longest sentence (extracted above in 3).


In [14]:
# Stem every token of the longest sentence with the Porter stemmer.
ps = PorterStemmer()
newsLongestSentenceList = [
    ps.stem(token) for token in nltk.word_tokenize(newsLongestSentence)
]

# Tokens are re-joined with single spaces, which is why punctuation ends up
# surrounded by spaces in the printed result.
newsStemmedSentence = " ".join(newsLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")

print(newsStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

nick schifrin : last month , the pentagon made public a 2020 report that admit : “ white supremaci and white nation pose a threat to the good order and disciplin within the militari and individu with extremist affili and militari experi are a concern to u.s. nation security. ” but it also conclud : “ we believ we have been effect at screen for individu who possess or advoc extremist ideologies. ” lecia brook , chief of staff , southern poverti law center : we ’ re happi to see that the pentagon agre that there ’ s a problem , but we complet disagre that they ’ re do anyth about it .


## Subcorpus 3: Straight Outta Compton

This is the *Straight Outta Compton* screenplay by Jonathan Herman and Andrea Berloff, downloaded from https://archive.org/details/StraightOuttaComptonScreenplayByJonathanHermanAndAndreaBerloff

In [15]:
# Grab the text
# The context manager closes the file handle as soon as the text has been read.
# NOTE(review): no encoding is passed, so decoding uses the platform default — confirm it matches the file.
with open("data/straight-outta-compton.txt", "r") as comptonFile:
    comptonText = comptonFile.read()

# Split the raw text into tokens (words and punctuation marks) with NLTK.
comptonTokenized = word_tokenize(comptonText)

1. The length (in words).

In [16]:
# Corpus length: the number of tokens word_tokenize produced for the screenplay.
comptonTokens = len(comptonTokenized)
print(comptonTokens)

40887


2. The lexical diversity.

In [17]:
# Lexical diversity = number of distinct tokens (types) / total number of tokens.
comptonVocabulary = set(comptonTokenized)
comptonTypes = len(comptonVocabulary)
comptonLexDiversity = comptonTypes / comptonTokens
print(comptonLexDiversity)

0.1398243940616822




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [18]:
comptonSentences = nltk.sent_tokenize(comptonText)


def word_count(sentence):
    """Return the number of tokens nltk.word_tokenize finds in ``sentence``."""
    return len(nltk.word_tokenize(sentence))


print("The longest 'sentence' is:\n\n")
# "Longest" is measured in tokens; default="" avoids a ValueError from max()
# when the sentence tokenizer returned nothing.
comptonLongestSentence = max(comptonSentences, key=word_count, default="")
print(comptonLongestSentence)
print("\n\n")
print(f"The sentence has {word_count(comptonLongestSentence)} words.")


The longest 'sentence' is:


PROD #02443 


Director: F. Gary Gray 
Producer: Ice Cube 
Producer: Tomica Woods-Wright 
Producer: Matt Alvarez 
Producer: F. Gary Gray 
Producer: Scott Bernstein 
Producer: Dr. Dre 
Executive Producer: Adam Merims 


STRAIGHT OUTTA COMPTON 


Screenplay by 

Jonathan Herman and Andrea Berloff 
Story by 

S. Leigh Savidge & Alan Wenkus and Andrea Berloff 


Notice : 

This material is the property of Straight Outta LLC (A wholly 
owned subsidiary of Universal City Studios, Inc.) and is intended 
and restricted solely for studio use by studio personnel.



The sentence has 95 words.



4. The top collocations.

In [19]:
# Wrap the token list in an nltk.Text so its collocations() helper can be used.
comptonNLTKText = nltk.Text(comptonTokenized)
# collocations() prints its result itself; num=20 asks for 20 collocations.
comptonNLTKText.collocations(num=20)

BRYAN TURNER; LOS ANGELES; Death Row; AUDIO ACHIEVEMENTS; TORRANCE
COP; STRAIGHT OUTTA; OUTTA COMPTON; TOUR BUS; ACHIEVEMENTS STUDIO;
MOMENTS LATER; LENCH MOB; Jerry Heller; JIMMY IOVINE; n't even; JERRY
HELLER; n't believe; n't know; DEATH ROW; Jheri curl; NEW YORK


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [20]:
comptonFreqDist = nltk.probability.FreqDist(comptonNLTKText)


def _startsWithLetter(word, letter):
    """Return True when ``word`` begins with ``letter`` in upper or lower case."""
    return word.startswith((letter.lower(), letter.upper()))


def _topWordsStartingWith(freqDist, letter, limit=10):
    """Return the first ``limit`` samples of ``freqDist`` that start with ``letter``.

    Samples are taken in the order ``freqDist`` yields them when iterated.
    NOTE(review): the recorded output suggests that order is most-frequent-first —
    confirm against the NLTK FreqDist documentation.
    """
    return [word for word in freqDist if _startsWithLetter(word, letter)][:limit]


def startsWithA(word):
    """Return True when ``word`` starts with 'A' or 'a'."""
    return _startsWithLetter(word, "a")


def startsWithE(word):
    """Return True when ``word`` starts with 'E' or 'e'."""
    return _startsWithLetter(word, "e")


def startsWithI(word):
    """Return True when ``word`` starts with 'I' or 'i'."""
    return _startsWithLetter(word, "i")


def startsWithO(word):
    """Return True when ``word`` starts with 'O' or 'o'."""
    return _startsWithLetter(word, "o")


def startsWithU(word):
    """Return True when ``word`` starts with 'U' or 'u'."""
    return _startsWithLetter(word, "u")


# Upper- and lower-case spellings are separate samples, so e.g. 'and' and 'And'
# can both appear in the same list.
comptonTopTenWordsWithA = _topWordsStartingWith(comptonFreqDist, "a")
comptonTopTenWordsWithE = _topWordsStartingWith(comptonFreqDist, "e")
comptonTopTenWordsWithI = _topWordsStartingWith(comptonFreqDist, "i")
comptonTopTenWordsWithO = _topWordsStartingWith(comptonFreqDist, "o")
comptonTopTenWordsWithU = _topWordsStartingWith(comptonFreqDist, "u")

for vowelLabel, topWords in (
    ("a", comptonTopTenWordsWithA),
    ("e", comptonTopTenWordsWithE),
    ("i", comptonTopTenWordsWithI),
    ("o", comptonTopTenWordsWithO),
    ("u", comptonTopTenWordsWithU),
):
    print(f"Top ten words with '{vowelLabel}': {topWords}\n")

Top ten words with 'a': ['a', 'and', 'at', 'as', 'all', 'about', 'A', 'are', 'And', 'around']

Top ten words with 'e': ['Eazy', 'EAZY', 'eyes', 'EXT', 'Eric', 'even', 'each', 'exits', 'ever', 'everything']

Top ten words with 'i': ['I', 'in', 'it', 'is', 'INT', 'It', 'into', 'if', 'IN', 'INTO']

Top ten words with 'o': ['of', 'on', 'out', 'over', 'off', 'one', 'other', 'ON', 'or', 'OF']

Top ten words with 'u': ['up', 'us', 'UP', 'uckin', 'under', 'until', 'Until', 'upon', 'UNIFORM', 'ucka']





6. A stemmed version of the longest sentence (extracted above in 3).


In [21]:
# Stem every token of the longest sentence with the Porter stemmer.
ps = PorterStemmer()
comptonLongestSentenceList = [
    ps.stem(token) for token in nltk.word_tokenize(comptonLongestSentence)
]

# Tokens are re-joined with single spaces, which is why punctuation ends up
# surrounded by spaces in the printed result.
comptonStemmedSentence = " ".join(comptonLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")

print(comptonStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

prod # 02443 director : f. gari gray produc : ice cube produc : tomica woods-wright produc : matt alvarez produc : f. gari gray produc : scott bernstein produc : dr. dre execut produc : adam merim straight outta compton screenplay by jonathan herman and andrea berloff stori by s. leigh savidg & alan wenku and andrea berloff notic : thi materi is the properti of straight outta llc ( a wholli own subsidiari of univers citi studio , inc. ) and is intend and restrict sole for studio use by studio personnel .
