In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader

In [2]:
# Location of the plain-text articles. The reader is built with the '.*'
# file pattern and reads the files as UTF-8.
corpus_root = './corpus'
articles = PlaintextCorpusReader(corpus_root, '.*', encoding="utf8")

# Token sequences for each source, read in a fixed order: NYT, Guardian, chatGPT.
nyt_words, guardian_words, gpt_words = (
    articles.words(fileid)
    for fileid in ('nyt.txt', 'guardian.txt', 'gpt.txt')
)

In [3]:
# Report how many tokens were read from each source.
for heading, tokens in (
    ("NYT word count: ", nyt_words),
    ("Guardian word count: ", guardian_words),
    ("chatGPT word count: ", gpt_words),
):
    print(heading, len(tokens))

NYT word count:  9163
Guardian word count:  17808
chatGPT word count:  7889


In [4]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Parameters
    ----------
    text : sized iterable of hashable tokens
        Must support ``len()`` and iteration (for example a list of words).

    Returns
    -------
    float
        ``len(set(text)) / len(text)``. An empty ``text`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.

    Notes
    -----
    Tokens are compared exactly as given, so differently-cased spellings
    of a word are counted as distinct tokens.
    """
    total = len(text)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return len(set(text)) / total

# Show the distinct-to-total token ratio for each source.
for heading, tokens in (
    ("NYT lexical diversity: ", nyt_words),
    ("Guardian lexical diversity: ", guardian_words),
    ("chatGPT lexical diversity: ", gpt_words),
):
    print(heading, lexical_diversity(tokens))

NYT lexical diversity:  0.2541743970315399
Guardian lexical diversity:  0.22310197663971248
chatGPT lexical diversity:  0.22740524781341107


In [5]:
# Sentence-level views of each source, in the order NYT, Guardian, chatGPT.
nyt_sents, guardian_sents, gpt_sents = (
    articles.sents(fileid)
    for fileid in ('nyt.txt', 'guardian.txt', 'gpt.txt')
)

def longestSentence(text):
    """Return the longest sentence in ``text``, measured by ``len()``.

    Parameters
    ----------
    text : iterable of sized items
        Each item is a sentence (for example a list of tokens). The input
        is traversed only once, so one-shot iterators and generators are
        accepted as well as lists.

    Returns
    -------
    The sentence with the greatest length. When several sentences share
    the maximum length, the last one encountered is returned.

    Raises
    ------
    ValueError
        If ``text`` contains no sentences.
    """
    found = False
    longest = None
    for sentence in text:
        # ``>=`` (not ``>``) so that, on ties, the later sentence wins.
        if not found or len(sentence) >= len(longest):
            longest = sentence
            found = True
    if not found:
        raise ValueError("longestSentence() requires at least one sentence")
    return longest

# Longest sentence of each source, computed in the order NYT, Guardian, chatGPT.
nytLongestSents, guardianLongestSents, gptLongestSents = map(
    longestSentence, (nyt_sents, guardian_sents, gpt_sents)
)

def printSentence(sentence):
    """Build a display string from the tokens of ``sentence``.

    Every token is followed by one space, so a non-empty result ends with
    a trailing space and an empty ``sentence`` gives ``''``. Despite the
    name, nothing is printed: the string is returned to the caller.
    """
    return ''.join(token + ' ' for token in sentence)

# For each source, show its longest sentence followed by its token count.
# The leading "\n" on later headings separates the reports with a blank line.
for heading, sentence in (
    ("NYT longest sentence: ", nytLongestSents),
    ("\nGuardian longest sentence: ", guardianLongestSents),
    ("\nchatGPT longest sentence: ", gptLongestSents),
):
    print(heading, printSentence(sentence))
    print("Sentence length: ", len(sentence))

NYT longest sentence:  When Mr . Trump ’ s efforts to overturn the election results were rejected at every turn by state officials and judges , Mr . Trump was told , incorrectly , that the vice president could stop the final validation of the election of President - elect Joseph R . Biden Jr . in his role as president of the Senate presiding over the Electoral College count . 
Sentence length:  68

Guardian longest sentence:  Roland Griffiths , one of the world ’ s leading researchers of mood - altering drugs , and the man most responsible for getting the diagnosis of " caffeine withdrawal " included in the Diagnostic and Statistical Manual of Mental Disorders ( DSM - 5 ), the bible of psychiatric diagnoses , told me he hadn ’ t begun to understand his own relationship with caffeine until he stopped using it and conducted a series of self - experiments . 
Sentence length:  80

chatGPT longest sentence:  The Rich vs The Very , Very Rich : The Wentworth Golf Club Rebellion Wentworth golf

In [6]:
# Wrap each source's tokens in an nltk.Text (used below for collocations
# and frequency counts).
nyt, guardian, gpt = (
    nltk.Text(articles.words(fileid))
    for fileid in ('nyt.txt', 'guardian.txt', 'gpt.txt')
)

def printCollacations(text):
    """Turn a sequence of two-part collocations into space-joined strings.

    ``text`` is an iterable whose items are indexable; the first two
    elements of each item are concatenated with a single space between
    them. Returns the resulting strings as a list, in input order.
    Nothing is printed.
    """
    return [pair[0] + " " + pair[1] for pair in text]

# Show the collocations NLTK finds for each source as readable strings.
for heading, source in (
    ("NYT collocations: ", nyt),
    ("\nGuardian collocations: ", guardian),
    ("\nchatGPT collocations: ", gpt),
):
    print(heading, printCollacations(source.collocation_list()))

NYT collocations:  ['Proud Boys', 'vice president', 'mental health', 'Oval Office', 'social media', 'New York', 'White House', 'head coach', 'Mike Pence', 'last year', 'York Times', 'Vice President', 'Bay Buccaneers', 'Super Bowl', 'Telegram channel', 'Tampa Bay', 'Halyna Hutchins', 'even though', 'Telegram channels', 'Bonanza Creek']

Guardian collocations:  ['Yan Bin', 'Red Bull', 'wheel well', 'Wet Feet', 'Jomo Kenyatta', 'landing gear', 'TCP Group', 'Wentworth Estate', 'Paul Manyasi', 'Kenyatta International', 'Offerton Road', 'joint venture', 'energy drinks', 'wheel wells', 'drinks industry', 'Airways flight', 'Kenya Airways', 'South Africa', 'Kenyatta airport', 'brain damage']

chatGPT collocations:  ['Proud Boys', 'January 6th', 'mental health', 'New York', 'head coach', 'Vice President', 'Golf Club', 'Hill riots', 'election results', 'Capitol attack', 'former president', 'many people', 'invisible addiction', 'Total Failure', 'hair stylist', 'potential harm', 'unlike anything', 

In [7]:
def topVowelWords(text, vowel, n=10):
    """Return the most frequent tokens in ``text`` that start with ``vowel``.

    Parameters
    ----------
    text : iterable of str
        The tokens to scan.
    vowel : str or tuple of str
        Prefix passed to ``str.startswith``. Matching is case-sensitive,
        so ``'a'`` does not match tokens beginning with ``'A'``.
    n : int, optional
        How many entries to return. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    list of (token, count) tuples
        At most ``n`` pairs, ordered from most to least frequent.
    """
    matching = [w for w in text if w.startswith(vowel)]
    return nltk.FreqDist(matching).most_common(n)

# For each source, print the top words starting with each vowel.
# The 'a' list shares a line with the heading; 'e', 'i', 'o', 'u' follow on
# their own lines, in that order.
for heading, source in (
    ("NYT top ten vowel words: ", nyt),
    ("\nGuardian top ten vowel words: ", guardian),
    ("\nchatGPT top ten vowel words: ", gpt),
):
    print(heading, topVowelWords(source, 'a'))
    for vowel in ('e', 'i', 'o', 'u'):
        print(topVowelWords(source, vowel))

NYT top ten vowel words:  [('and', 197), ('a', 191), ('an', 38), ('as', 36), ('at', 29), ('are', 21), ('about', 20), ('also', 14), ('all', 11), ('after', 10)]
[('emails', 13), ('election', 11), ('even', 7), ('email', 7), ('early', 6), ('every', 5), ('executive', 5), ('earlier', 4), ('efforts', 3), ('entered', 3)]
[('in', 136), ('it', 46), ('is', 37), ('into', 14), ('its', 9), ('investigation', 8), ('including', 7), ('if', 6), ('interview', 3), ('includes', 3)]
[('of', 194), ('on', 74), ('or', 27), ('office', 19), ('out', 16), ('one', 16), ('our', 14), ('other', 9), ('over', 8), ('officials', 6)]
[('up', 15), ('us', 12), ('used', 8), ('urged', 5), ('use', 5), ('until', 4), ('ultimately', 3), ('using', 3), ('understand', 2), ('under', 2)]

Guardian top ten vowel words:  [('a', 376), ('and', 361), ('as', 91), ('at', 66), ('an', 62), ('after', 34), ('all', 34), ('are', 33), ('about', 32), ('airport', 20)]
[('energy', 12), ('even', 9), ('enough', 7), ('each', 7), ('eventually', 7), ('estate

In [8]:
# English Snowball stemmer shared by all three sentences.
snowball = nltk.SnowballStemmer(language='english')

# Stem every token of each source's longest sentence.
nytStemmed = list(map(snowball.stem, nytLongestSents))
guardianStemmed = list(map(snowball.stem, guardianLongestSents))
gptStemmed = list(map(snowball.stem, gptLongestSents))

# Print the stemmed sentences, separated by blank lines.
for heading, stemmed in (
    ("NYT stemmed sentence: ", nytStemmed),
    ("\nGuardian stemmed sentence: ", guardianStemmed),
    ("\nchatGPT stemmed sentence: ", gptStemmed),
):
    print(heading, printSentence(stemmed))


NYT stemmed sentence:  when mr . trump ’ s effort to overturn the elect result were reject at everi turn by state offici and judg , mr . trump was told , incorrect , that the vice presid could stop the final valid of the elect of presid - elect joseph r . biden jr . in his role as presid of the senat presid over the elector colleg count . 

Guardian stemmed sentence:  roland griffith , one of the world ’ s lead research of mood - alter drug , and the man most respons for get the diagnosi of " caffein withdraw " includ in the diagnost and statist manual of mental disord ( dsm - 5 ), the bibl of psychiatr diagnos , told me he hadn ’ t begun to understand his own relationship with caffein until he stop use it and conduct a seri of self - experi . 

chatGPT stemmed sentence:  the rich vs the veri , veri rich : the wentworth golf club rebellion wentworth golf club , locat in the affluent suburb of virginia water , surrey , has long been a symbol of the british establish , with it well - man