In [7]:
import nltk
import re
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk import word_tokenize

## Subcorpus 3: Straight Outta Compton

This subcorpus is the screenplay of *Straight Outta Compton* by Jonathan Herman and Andrea Berloff, downloaded from https://archive.org/details/StraightOuttaComptonScreenplayByJonathanHermanAndAndreaBerloff

In [8]:
# Grab the text and split it into tokens.
# The context manager closes the file handle as soon as the text has been
# read, instead of leaving it open for the lifetime of the kernel.
with open("data/straight-outta-compton.txt", "r") as comptonFile:
    comptonText = comptonFile.read()
comptonTokenized = word_tokenize(comptonText)

1. The length (in words).

In [9]:
# Corpus length: the number of tokens in the tokenized screenplay.
# NOTE(review): the tokenizer appears to emit punctuation marks as tokens too
# (see the stemmed output further down), so this is a token count rather than
# a strict word count.
comptonTokens = len(comptonTokenized)
print(comptonTokens)

40887


2. The lexical diversity.

In [10]:
# Lexical diversity = number of distinct token types / total number of tokens.
# Tokens are compared exactly as tokenized, so differently-cased spellings
# count as separate types.
comptonVocabulary = set(comptonTokenized)
comptonTypes = len(comptonVocabulary)
comptonLexDiversity = comptonTypes / comptonTokens
print(comptonLexDiversity)

0.1398243940616822




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [11]:
def word_count(sentence):
    """Return the number of tokens nltk.word_tokenize produces for `sentence`."""
    return len(nltk.word_tokenize(sentence))


# Split the raw text into sentences and keep the one with the most tokens.
# NOTE(review): on this screenplay the winner is the title-page block rather
# than a real sentence (see the output) - presumably because the sentence
# splitter finds no sentence boundary inside it.
comptonSentences = nltk.sent_tokenize(comptonText)
print("The longest 'sentence' is:\n\n")
comptonLongestSentence = max(comptonSentences, key=word_count)
print(comptonLongestSentence)
print("\n\n")
print(f"The sentence has {word_count(comptonLongestSentence)} words.")


The longest 'sentence' is:


PROD #02443 


Director: F. Gary Gray 
Producer: Ice Cube 
Producer: Tomica Woods-Wright 
Producer: Matt Alvarez 
Producer: F. Gary Gray 
Producer: Scott Bernstein 
Producer: Dr. Dre 
Executive Producer: Adam Merims 


STRAIGHT OUTTA COMPTON 


Screenplay by 

Jonathan Herman and Andrea Berloff 
Story by 

S. Leigh Savidge & Alan Wenkus and Andrea Berloff 


Notice : 

This material is the property of Straight Outta LLC (A wholly 
owned subsidiary of Universal City Studios, Inc.) and is intended 
and restricted solely for studio use by studio personnel.



The sentence has 95 words.



4. The top collocations.

In [12]:
# Wrap the token list in an nltk.Text and ask it for up to 20 collocations.
# collocations() writes its result to the cell output itself, so no print()
# is needed here.
comptonNLTKText = nltk.Text(comptonTokenized)
comptonNLTKText.collocations(num=20)

BRYAN TURNER; LOS ANGELES; Death Row; AUDIO ACHIEVEMENTS; TORRANCE
COP; STRAIGHT OUTTA; OUTTA COMPTON; TOUR BUS; ACHIEVEMENTS STUDIO;
MOMENTS LATER; LENCH MOB; Jerry Heller; JIMMY IOVINE; n't even; JERRY
HELLER; n't believe; n't know; DEATH ROW; Jheri curl; NEW YORK


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [13]:
comptonFreqDist = nltk.probability.FreqDist(comptonNLTKText)


def _startsWithLetter(word, letter):
    """Return True when `word` begins with `letter` in lower or upper case."""
    return word.startswith((letter.lower(), letter.upper()))


def _topWordsStartingWith(rankedWords, letter, limit=10):
    """Return the first `limit` entries of `rankedWords` that begin with `letter`.

    The order of `rankedWords` is preserved, so passing words ranked by
    frequency yields the most frequent matches.
    """
    return [word for word in rankedWords if _startsWithLetter(word, letter)][:limit]


# The per-vowel predicates are kept as thin wrappers so that any later cell
# that calls them by name keeps working.
def startsWithA(word):
    """Return True when `word` starts with 'a' or 'A'."""
    return _startsWithLetter(word, "a")


def startsWithE(word):
    """Return True when `word` starts with 'e' or 'E'."""
    return _startsWithLetter(word, "e")


def startsWithI(word):
    """Return True when `word` starts with 'i' or 'I'."""
    return _startsWithLetter(word, "i")


def startsWithO(word):
    """Return True when `word` starts with 'o' or 'O'."""
    return _startsWithLetter(word, "o")


def startsWithU(word):
    """Return True when `word` starts with 'u' or 'U'."""
    return _startsWithLetter(word, "u")


# Walk the frequency distribution once and reuse that ranking for every vowel.
# This relies, exactly as the original per-vowel filters did, on FreqDist
# iterating its tokens from most to least frequent.
# Tokens are not case-folded, so e.g. 'a' and 'A' are ranked as separate words.
comptonRankedWords = list(comptonFreqDist)
comptonTopWordsByVowel = {
    vowel: _topWordsStartingWith(comptonRankedWords, vowel) for vowel in "aeiou"
}

comptonTopTenWordsWithA = comptonTopWordsByVowel["a"]
comptonTopTenWordsWithE = comptonTopWordsByVowel["e"]
comptonTopTenWordsWithI = comptonTopWordsByVowel["i"]
comptonTopTenWordsWithO = comptonTopWordsByVowel["o"]
comptonTopTenWordsWithU = comptonTopWordsByVowel["u"]

for vowel in "aeiou":
    print(f"Top ten words with '{vowel}': {comptonTopWordsByVowel[vowel]}\n")

Top ten words with 'a': ['a', 'and', 'at', 'as', 'all', 'about', 'A', 'are', 'And', 'around']

Top ten words with 'e': ['Eazy', 'EAZY', 'eyes', 'EXT', 'Eric', 'even', 'each', 'exits', 'ever', 'everything']

Top ten words with 'i': ['I', 'in', 'it', 'is', 'INT', 'It', 'into', 'if', 'IN', 'INTO']

Top ten words with 'o': ['of', 'on', 'out', 'over', 'off', 'one', 'other', 'ON', 'or', 'OF']

Top ten words with 'u': ['up', 'us', 'UP', 'uckin', 'under', 'until', 'Until', 'upon', 'UNIFORM', 'ucka']





6. A stemmed version of the longest sentence (extracted above in 3).


In [14]:
# Stem every token of the longest sentence with the Porter stemmer, then glue
# the stems back together with single spaces (which is why punctuation ends up
# surrounded by spaces in the printed result).
ps = PorterStemmer()
comptonLongestSentenceList = [
    ps.stem(token) for token in nltk.word_tokenize(comptonLongestSentence)
]
comptonStemmedSentence = " ".join(comptonLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")
print(comptonStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

prod # 02443 director : f. gari gray produc : ice cube produc : tomica woods-wright produc : matt alvarez produc : f. gari gray produc : scott bernstein produc : dr. dre execut produc : adam merim straight outta compton screenplay by jonathan herman and andrea berloff stori by s. leigh savidg & alan wenku and andrea berloff notic : thi materi is the properti of straight outta llc ( a wholli own subsidiari of univers citi studio , inc. ) and is intend and restrict sole for studio use by studio personnel .
