In [1]:
# HW2 

# Getting started to process a text example
import nltk
from nltk import FreqDist
import re

In [2]:
#Hemingway Analysis
hemingway = open('C:\\Users\\maria\\inourtime.txt')
text = hemingway.read()
len(text) 

165556

In [3]:
#%%

## CODE IDEAS FOR HMW 2, Exploratory exercise for sentiment analysis
# finding adverb and adjective phrases, and computing basic statistics

# importing required nltk libraries
import nltk
from nltk import sent_tokenize

# loading our corpus (for this example: "Crime and Punishment," by F. Dostoevsky)
#f = open('CrimeAndPunishment.txt')
#text = f.read()
#print(text[:150])

In [4]:
# Preprocessing, as explained in the Labs
# Separate the text into sentences first
textsplit = nltk.sent_tokenize(text)
print(textsplit[1:2])

['The whole battery was drunk going along\nthe\nroad in the dark.']


In [5]:
#%%

# Apply the word tokenizer to each sentence
tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
print(tokentext[:2])
#the output is a list of strings that contains the sentences
type(tokentext)
len(tokentext)

[['IN', 'OUR', 'TIME', 'Everybody', 'was', 'drunk', '.'], ['The', 'whole', 'battery', 'was', 'drunk', 'going', 'along', 'the', 'road', 'in', 'the', 'dark', '.']]


2750

In [6]:
#%%

## POS Tagging, to retrieve adjective (JJs) and adverb (RBs) tags

# use the Stanford POS tagger to POS tag tokens of each sentence
# this is the default tagger in nltk

taggedtext = [nltk.pos_tag(tokens) for tokens in tokentext]
print(taggedtext[:2])

[[('IN', 'NNP'), ('OUR', 'NNP'), ('TIME', 'NNP'), ('Everybody', 'NNP'), ('was', 'VBD'), ('drunk', 'VBN'), ('.', '.')], [('The', 'DT'), ('whole', 'JJ'), ('battery', 'NN'), ('was', 'VBD'), ('drunk', 'JJ'), ('going', 'VBG'), ('along', 'IN'), ('the', 'DT'), ('road', 'NN'), ('in', 'IN'), ('the', 'DT'), ('dark', 'NN'), ('.', '.')]]


In [7]:
#%%

# Following our NLTK textbook, chapter on Information Extraction--Chunking (https://www.nltk.org/book/ch07.html)

# Using CHUNKING to parse sentences 
# to look for "adjective phrases", i.e. phrases (or chunks) that have adverbs and adjectives ('RB'+'JJ')
# First step: writing a grammar that defines the POS in the chunk
# we name this grammar "ADJPH" ("ADJective PHrase") using regexes 

import re
grammar_adjph = "ADJPH: {<RB.?>+<JJ.?>}"
# This regex reads as: "find groups ("< >") of RBs (adverbs) together with groups of JJs (adjectives), with groups defineds as
# RBs with any ending (the "." is a placeholder or wildcard for the "R" and the "S" at the end of RBR and RBS, 
# while "?" indicates "optional character" so RB can be found alone as well). Same regex operators apply to JJs.

# Second step: import the nltk parser to process each sentence
chunk_parser_adj = nltk.RegexpParser(grammar_adjph)

adjph_tags = []
for sent in taggedtext:
    if len(sent) > 0:
        tree = chunk_parser_adj.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADJPH':
                adjph_tags.append(subtree)
                

In [8]:
# Visualizing the actual adjective phrase
adjective_phrases = []
for sent in adjph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    adjective_phrases.append(temp)
    
print('First 10 adjective phrases: ', adjective_phrases[:222])

First 10 adjective phrases:  ['so soused ', 'very sick ', 'very big ', 'very bad ', 'not important ', 'not important ', 'very pale ', 'away wet ', 'terribly sorry ', 'very exceptional ', "'Do many ", 'very many ', "'Do many ", 'quite sure ', 'not worth ', 'very lazy ', 'very uncomfortable ', 'very serious ', 'so much ', 'awfully surprised ', "n't striking ", 'about right ', 'not quite dark ', 'afrightfully hot ', 'absolutely perfect ', 'simply priceless ', 'Too heavy ', 'absolutely perfect ', 'very jine ', 'sometimes slept ', 'pretty good ', "'How much ", "n't practical ", 'consciously practical ', 'quite proud ', 'thoroughly practical ', 'awfully big ', 'very fine ', 'very wise ', 'probably bad ', "n't drunk ", "n't engaged/ ", "n't engaged/ ", 'really drunk ', 'so absolute ', 'still quite drunk ', 'no longer so tragic ', 'not even very important ', 'quite close ', 'not quite right ', 'too many ', "n't brother ", 'very hot ', 'not enOtigh ', 'very fiat ', 'too big ', 'not beautiful ',

In [9]:
# Following our NLTK textbook, chapter 1 on Language Processing (https://www.nltk.org/book/ch01.html)

## FREQUENCY DISTRIBUTIONS
# Top 50 adjective phrases
freq_adjph = nltk.FreqDist(adjective_phrases)

print('Top adjective phrases by frequency: ')
for word, freq in freq_adjph.most_common(50):
    print(word, freq)

            

Top adjective phrases by frequency: 
not worth  3
too much  3
not important  2
'Do many  2
very serious  2
so much  2
absolutely perfect  2
pretty good  2
very fine  2
n't engaged/  2
too many  2
very hot  2
too big  2
n't worth  2
quite dark  2
so big  2
then smaller  2
very hungry  2
too hot  2
so soused  1
very sick  1
very big  1
very bad  1
very pale  1
away wet  1
terribly sorry  1
very exceptional  1
very many  1
quite sure  1
very lazy  1
very uncomfortable  1
awfully surprised  1
n't striking  1
about right  1
not quite dark  1
afrightfully hot  1
simply priceless  1
Too heavy  1
very jine  1
sometimes slept  1
'How much  1
n't practical  1
consciously practical  1
quite proud  1
thoroughly practical  1
awfully big  1
very wise  1
probably bad  1
n't drunk  1
really drunk  1


In [10]:
#print the list of our sentences:
print('Length of adjective phrase sentences: ', len(adjph_tags))
print(adjph_tags)


Length of adjective phrase sentences:  160
[Tree('ADJPH', [('so', 'RB'), ('soused', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('sick', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('big', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('bad', 'JJ')]), Tree('ADJPH', [('not', 'RB'), ('important', 'JJ')]), Tree('ADJPH', [('not', 'RB'), ('important', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('pale', 'JJ')]), Tree('ADJPH', [('away', 'RB'), ('wet', 'JJ')]), Tree('ADJPH', [('terribly', 'RB'), ('sorry', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('exceptional', 'JJ')]), Tree('ADJPH', [("'Do", 'RB'), ('many', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('many', 'JJ')]), Tree('ADJPH', [("'Do", 'RB'), ('many', 'JJ')]), Tree('ADJPH', [('quite', 'RB'), ('sure', 'JJ')]), Tree('ADJPH', [('not', 'RB'), ('worth', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('lazy', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('uncomfortable', 'JJ')]), Tree('ADJPH', [('very', 'RB'), ('serious', 'JJ')]), Tree('ADJPH', [('so', 'RB'), ('much', 'JJ')]),

In [11]:
#%%

# Now we look for "adverb phrases" or chunks that have 2 consecutive adverbs ('RB')
# First step: writing a grammar that defines POS rules of the adverb phrase the chunk
# we name this grammar "ADVPH" ("ADVerb PHrase")
grammar_advph = "ADVPH: {<RB>+<RB>}"

# Second step: import the nltk parser to process each sentence
chunk_parser_adv = nltk.RegexpParser(grammar_advph)

advph_tags = []
for sent in taggedtext:
    if len(sent) > 0:
        tree = chunk_parser_adv.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADVPH':
                advph_tags.append(subtree)

In [12]:
# Visualizing the actual adjective phrase
adverb_phrases = []
for sent in advph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    adverb_phrases.append(temp)
    
print('First 10 adverb phrases: ', adverb_phrases[:10])


First 10 adverb phrases:  ['very hard ', 'farther ahead ', 'very badly ', 'Just then ', 'very carefully ', 'rather not ', 'away so ', 'pretty quietly ', "'Hardly ever ", 'yellow almost ']


In [13]:
# top 50 adjective phrases
freq_advph = nltk.FreqDist(adverb_phrases)

print('Top adverb phrases by frequency: ')
for word, freq in freq_advph.most_common(50):
    print(word, freq)


Top adverb phrases by frequency: 
'All right  10
n't ever  6
n't really  3
down beside  3
as well  2
not quite  2
as far  2
here now  2
n't much  2
n't so  2
up again  2
As soon  2
back there  2
far down  2
very hard  1
farther ahead  1
very badly  1
Just then  1
very carefully  1
rather not  1
away so  1
pretty quietly  1
'Hardly ever  1
yellow almost  1
once again  1
far behind  1
too late  1
'As long  1
so far away  1
n't drunk  1
n't there  1
n't even  1
'So long  1
sore as  1
'Were n't  1
still quite  1
Outside now  1
no longer so  1
not even very  1
very quietly  1
back again  1
ahead brilliantly  1
carefully away  1
probably not  1
absolutely unexpectedly  1
back much too  1
back so  1
so long back  1
away only  1
not very  1


In [14]:
#print the list of our sentences:
print('Length of adverb phrase sentences: ', len(advph_tags))

Length of adverb phrase sentences:  141


In [15]:
#%%

# Top 50 adjective tokens

adjective_tokens = []
for sentence in taggedtext:
    for word, pos in sentence:
        if pos in ['JJ', 'JJR', 'JJS']: # adjective, comparative, superlative
            if len(word)>1:
                adjective_tokens.append(word)
freq_adjective = nltk.FreqDist(adjective_tokens)

print('Top adjective tokens by frequency: ')
for word, freq in freq_adjective.most_common(50):
    print(word,freq)
    
print(len(adjective_tokens))

Top adjective tokens by frequency: 
old 90
big 75
good 59
other 44
little 42
current 37
long 28
hot 27
black 22
more 21
right 21
heavy 20
first 18
deep 17
white 16
high 16
open 15
much 14
great 14
young 12
Indian 12
hard 12
many 12
full 11
same 11
left 10
solid 10
happy 10
dark 10
dead 10
next 10
sick 9
bad 9
German 9
better 9
clear 9
funny 9
last 9
smaller 9
quiet 8
easy 8
fat 8
yellow 8
sweet 8
crazy 8
fine 8
smooth 8
fast 8
net 8
whole 7
1745


In [16]:
#%%

# Top 50 adverb tokens

adverb_tokens = []
for sentence in taggedtext:
    for word, pos in sentence:
        if pos in ['RB', 'RBR', 'RBS']: # adverb, comparative, superlative
            if len(word)>1:
                adverb_tokens.append(word)
freq_adverb = nltk.FreqDist(adverb_tokens)

print('Top adverb tokens by frequency: ')
for word, freq in freq_adverb.most_common(50):
    print(word,freq)

print(len(adverb_tokens))
#%%

## TO DO / YOUR TURN NOW!
## NOUN EXTRACTION
## VERB EXTRACTION
## REMEMBER TO CHECK THE PENN POS TAGS LIST: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
## TO FIND ALL TAGS

Top adverb tokens by frequency: 
n't 209
not 138
back 90
then 83
down 60
up 54
just 50
too 43
very 41
Then 41
away 40
so 38
again 36
always 34
never 34
right 33
there 33
now 32
ever 31
only 25
really 23
here 17
once 17
ahead 16
far 16
along 15
around 15
Now 15
still 14
together 14
quite 11
more 11
'All 10
out 10
forward 10
hard 9
carefully 9
first 9
as 9
well 9
about 9
over 9
maybe 9
slowly 8
sometimes 7
almost 7
even 7
long 7
much 7
later 6
1895


In [17]:
#%%

# Now we have two lists of POS tags combinations we can compare
# We need to get the sentences back from the tagging exercise and run some stats

# Create a list of original sentences from the ADJECTIVE phrase subset:
adjph_whole_sentences = []

# loop over the sentences in the adjective phrase sentences we created:
for sents in adjph_tags:
    temp=''
    for (word,tag) in sents:
        temp += word+' '
        adjph_whole_sentences.append(temp)
        
print(len(adjph_whole_sentences))
print(adjph_whole_sentences)

343
['so ', 'so soused ', 'very ', 'very sick ', 'very ', 'very big ', 'very ', 'very bad ', 'not ', 'not important ', 'not ', 'not important ', 'very ', 'very pale ', 'away ', 'away wet ', 'terribly ', 'terribly sorry ', 'very ', 'very exceptional ', "'Do ", "'Do many ", 'very ', 'very many ', "'Do ", "'Do many ", 'quite ', 'quite sure ', 'not ', 'not worth ', 'very ', 'very lazy ', 'very ', 'very uncomfortable ', 'very ', 'very serious ', 'so ', 'so much ', 'awfully ', 'awfully surprised ', "n't ", "n't striking ", 'about ', 'about right ', 'not ', 'not quite ', 'not quite dark ', 'afrightfully ', 'afrightfully hot ', 'absolutely ', 'absolutely perfect ', 'simply ', 'simply priceless ', 'Too ', 'Too heavy ', 'absolutely ', 'absolutely perfect ', 'very ', 'very jine ', 'sometimes ', 'sometimes slept ', 'pretty ', 'pretty good ', "'How ", "'How much ", "n't ", "n't practical ", 'consciously ', 'consciously practical ', 'quite ', 'quite proud ', 'thoroughly ', 'thoroughly practical ', '

In [18]:
#%%

# Create a list of original sentences from the ADVERB phrase subset:
advph_whole_sentences = []

# loop over the sentences in the adjective phrase sentences we created:
for sents in advph_tags:
    temp=''
    for (word,tag) in sents:
        temp += word+' '
        advph_whole_sentences.append(temp)
        
print(len(advph_whole_sentences))

291


In [19]:
#%%

# OPTIONAL STEP: Combine lists together to have a single list of adjective/adverb phrases:
# Useful to know which sentences are heavy in qualifiers

# create a new variable to store all adjective phrase sentences
adv_adj_phrase_sentences = adjph_whole_sentences

# iterate over adverb phrase sentences
for sent in advph_whole_sentences:
    # if a sentence is not in the adjective phrases list imported
    if sent not in adv_adj_phrase_sentences:
        # attach that sentence
        adv_adj_phrase_sentences.append(sent)

# print the lenght of the list (i.e. number of sentences with both adjective and adverb phrases)
print(len(adv_adj_phrase_sentences))

471


In [20]:
#%%

# Following our NLTK textbook, Writing Structural Programs chapter
# section on Procedural vs Declarative style (http://www.nltk.org/book_1ed/ch04.html) 

## CORPUS STATISTICS--SENTENCES LENGTH

# Calculating the average length of sentences in the entire corpus
# from http://www.nltk.org/book_1ed/ch04.html
total_corpus = sum(len(sent) for sent in textsplit) # remember: 'textsplit' is our text split into sentences
print(total_corpus / len(textsplit))
print(total_corpus)
print(len(textsplit))

59.20254545454546
162807
2750


In [21]:
#%%

# Calculate the average length of an adjective phrase sentence
# We can then compare the average length of the adjective phrases to 
# the average sentences we calculated for all sentences in the corpus
total_adjph_sentences = sum(len(sent) for sent in adjph_whole_sentences) # adjph_whole_sentences stores our adjective phrases
print(total_adjph_sentences / len(adjph_whole_sentences))

print(total_adjph_sentences)
print(len(adjph_whole_sentences))

9.159235668789808
4314
471


In [22]:
# Calculate the average length of an adjective phrase sentence
# We can then compare the average length of the adjective phrases to 
# the average sentences we calculated for all sentences in the corpus
total_advph_sentences = sum(len(sent) for sent in advph_whole_sentences) # adjph_whole_sentences stores our adjective phrases
print(total_advph_sentences / len(advph_whole_sentences))

#%%
print(total_advph_sentences)
print(len(advph_whole_sentences))

7.920962199312715
2305
291
