In [28]:
import nltk
import re
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk import word_tokenize

## Subcorpus 2: PBS News Transcripts

The text is extracted from the transcripts of 1,000 news videos from PBS NewsHour and Washington Week, available at https://www.zerotohero.ca/zh/en/show/talk/293

In [29]:
# Grab the text
# Read the whole transcript file inside a `with` block so the file handle is
# closed as soon as the read finishes (the handle was previously left open).
with open("data/news-1000-trunc.txt", "r") as newsFile:
    newsText = newsFile.read()

# Split the raw text into tokens for the statistics computed below.
newsTokenized = word_tokenize(newsText)

1. The length (in words).

In [30]:
# Corpus length in tokens. The tokenizer emits punctuation marks as separate
# tokens, so they are counted here alongside the words.
newsTokens = len(newsTokenized)
print(newsTokens)

137356


2. The lexical diversity.

In [31]:
# Lexical diversity: number of distinct tokens (types) divided by the total
# number of tokens (newsTokens, computed in the length cell). Tokens are
# compared as-is, so differently-cased forms count as separate types.
newsTypes = len(set(newsTokenized))
newsLexDiversity = newsTypes / newsTokens
print(newsLexDiversity)

0.0805425318151373




3. The longest sentence (type the sentence and also give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.


In [32]:
newsSentences = nltk.sent_tokenize(newsText)


def word_count(sentence):
    """Return the number of tokens nltk.word_tokenize produces for `sentence`."""
    return len(nltk.word_tokenize(sentence))


print("The longest 'sentence' is:\n\n")

# The longest sentence is the one with the highest token count.
newsLongestSentence = max(newsSentences, key=word_count)
print(newsLongestSentence)

print("\n\n")
print(f"The sentence has {word_count(newsLongestSentence)} words.")


The longest 'sentence' is:


NICK SCHIFRIN: Last month, the Pentagon made
public a 2020 report that admitted: “White supremacy and white nationalism pose a threat
to the good order and discipline within the military and individuals with extremist affiliations
and military experience are a concern to U.S. national security.” But it also concluded: “We believe we have
been effective at screening for individuals who possess or advocate extremist ideologies.” LECIA BROOKS, Chief of Staff, Southern Poverty
Law Center: We’re happy to see that the Pentagon agrees that there’s a problem, but we completely
disagree that they’re doing anything about it.



The sentence has 115 words.



4. The top collocations.

In [33]:
# Wrap the token list in an nltk.Text and print its top 20 collocations.
# newsNLTKText is reused by the frequency-distribution cell further down.
newsNLTKText = nltk.Text(newsTokenized)
newsNLTKText.collocations(num=20)

JUDY WOODRUFF; NICK SCHIFRIN; JOHN YANG; AMNA NAWAZ; United States;
YAMICHE ALCINDOR; White House; WILLIAM BRANGHAM; PBS NewsHour;
President Biden; PAUL SOLMAN; New York; SAM LAZARO; George Floyd; Hong
Kong; Biden administration; JEFFREY BROWN; Derek Chauvin; Supreme
Court; Judy Woodruff


5. The top ten words that start with each of the vowels (involves using FreqDist).

In [34]:
newsFreqDist = nltk.probability.FreqDist(newsNLTKText)

VOWELS = "aeiou"


def makeStartsWith(letter):
    """Build a predicate that is True for words beginning with `letter`.

    Only the first character is tested, and both the lower- and upper-case
    form of `letter` are accepted. An empty word never matches.
    """
    prefixes = (letter.lower(), letter.upper())

    def startsWith(word):
        return word.startswith(prefixes)

    return startsWith


def topWordsStartingWith(freqDist, letter, count=10):
    """Return up to `count` entries of `freqDist` that begin with `letter`.

    Entries are taken in the order `freqDist` iterates them. The result is the
    "top" words only because an NLTK FreqDist yields its samples from most to
    least frequent.
    """
    return list(filter(makeStartsWith(letter), freqDist))[:count]


# One predicate per vowel under the original names, in case other cells use them.
startsWithA, startsWithE, startsWithI, startsWithO, startsWithU = map(makeStartsWith, VOWELS)

newsTopTenByVowel = {
    vowel: topWordsStartingWith(newsFreqDist, vowel) for vowel in VOWELS
}

# Individual result lists under the original names.
newsTopTenWordsWithA = newsTopTenByVowel["a"]
newsTopTenWordsWithE = newsTopTenByVowel["e"]
newsTopTenWordsWithI = newsTopTenByVowel["i"]
newsTopTenWordsWithO = newsTopTenByVowel["o"]
newsTopTenWordsWithU = newsTopTenByVowel["u"]

for vowel in VOWELS:
    print(f"Top ten words with '{vowel}': {newsTopTenByVowel[vowel]}\n")

Top ten words with 'a': ['and', 'a', 'And', 'are', 'as', 'at', 'about', 'an', 'all', 'also']

Top ten words with 'e': ['even', 'every', 'end', 'early', 'election', 'everything', 'economy', 'especially', 'example', 'enough']

Top ten words with 'i': ['in', 'is', 'I', 'it', 'It', 'if', 'In', 'into', 'its', 'important']

Top ten words with 'o': ['of', 'on', 'out', 'or', 'one', 'our', 'other', 'over', 'only', 'own']

Top ten words with 'u': ['up', 'us', 'U.S.', 'United', 'use', 'under', 'until', 'University', 'understand', 'used']





6. A stemmed version of the longest sentence (extracted above in 3).


In [35]:
ps = PorterStemmer()

# Stem every token of the longest sentence. Tokenising splits punctuation off
# into separate tokens, hence the spaces around punctuation once re-joined.
newsLongestSentenceList = [
    ps.stem(token) for token in nltk.word_tokenize(newsLongestSentence)
]
newsStemmedSentence = " ".join(newsLongestSentenceList)

print("The stemmed version of the longest sentence (but with a space around each punctuation):\n")
print(newsStemmedSentence)


The stemmed version of the longest sentence (but with a space around each punctuation):

nick schifrin : last month , the pentagon made public a 2020 report that admit : “ white supremaci and white nation pose a threat to the good order and disciplin within the militari and individu with extremist affili and militari experi are a concern to u.s. nation security. ” but it also conclud : “ we believ we have been effect at screen for individu who possess or advoc extremist ideologies. ” lecia brook , chief of staff , southern poverti law center : we ’ re happi to see that the pentagon agre that there ’ s a problem , but we complet disagre that they ’ re do anyth about it .
