# Exercises

## Exercise 6.1

Create a list containing the unique adjectives that occur in *Pride and Prejudice*.

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
from os.path import join

# Exercise 6.1: print every distinct adjective in Pride and Prejudice.
path = join('Corpus','PrideAndPrejudice.txt')
with open(path, encoding = 'utf-8') as fh:
    full_text = fh.read()

# Collect every token whose Penn Treebank tag starts with 'JJ'
# (JJ, JJR and JJS). The tokenisers are reached through the nltk
# namespace because this cell imports only the nltk package itself.
adjectives = []

for sentence in nltk.sent_tokenize(full_text):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    adjectives.extend(word for word, tag in tagged if tag.startswith('JJ'))

# NOTE(review): remove_punctuation is not imported in this cell; presumably it
# is provided by the course helper module -- confirm it is in scope here.
adjectives = remove_punctuation(adjectives)

# A set removes the duplicates; sorting gives a stable, readable listing.
for adj in sorted( set(adjectives) ):
    print(adj)

## Exercise 6.2

Stephen King is [reputed to have said](https://www.goodreads.com/quotes/430289-i-believe-the-road-to-hell-is-paved-with-adverbs) that “the road to hell is paved with adverbs”, and many style guides similarly give writers the advice to avoid adverbs, especially those ending in '-ly'.

Can you calculate, for each text in the corpus, the number of adverbs ending in '-ly', measured as a percentage of the total number of words?

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
from os.path import join
from tdmh import *

# Exercise 6.2: for every text in the corpus, compute the share of tokens
# that are adverbs ending in '-ly'.

# os is imported explicitly so that os.listdir does not depend on the
# star-import above happening to expose the module.
import os

corpus_dir = 'Corpus'
scores = dict()

for text in os.listdir(corpus_dir):

    path = join(corpus_dir, text)
    # Skip sub-directories: only regular files can be opened as texts.
    if not os.path.isfile(path):
        continue

    with open(path, encoding = 'utf-8') as fh:
        full_text = fh.read()

    nr_words = 0
    nr_ly_adverbs = 0

    for s in nltk.sent_tokenize(full_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(s))
        # pos_tag yields one (token, tag) pair per token.
        nr_words += len(tagged)
        # Adverb tags are RB, RBR and RBS, hence the prefix test.
        nr_ly_adverbs += sum(
            1 for word, tag in tagged
            if tag.startswith('RB') and word.endswith('ly')
        )

    # A file without any tokens gets a score of 0 instead of raising
    # ZeroDivisionError.
    scores[text] = nr_ly_adverbs / nr_words if nr_words else 0.0

for text in sortedByValue(scores , ascending = False):
    print( f'{text}: {round( scores[text] *100 , 3) }% of the words are adverbs ending in -ly' )
    



## Exercise 6.3

Which text in the corpus has the highest number of modal verbs? The Penn Treebank code for 'modal auxiliaries' is MD.

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
from os.path import join
from tdmh import *

# Exercise 6.3: for every text in the corpus, compute the share of tokens
# that are tagged as modal verbs (Penn Treebank tag MD).

# os is imported explicitly so that os.listdir does not depend on the
# star-import above happening to expose the module.
import os

corpus_dir = 'Corpus'
scores = dict()

for text in os.listdir(corpus_dir):

    path = join(corpus_dir, text)
    # Skip sub-directories: only regular files can be opened as texts.
    if not os.path.isfile(path):
        continue

    with open(path, encoding = 'utf-8') as fh:
        full_text = fh.read()

    nr_words = 0
    nr_modals = 0

    for s in nltk.sent_tokenize(full_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(s))
        # pos_tag yields one (token, tag) pair per token.
        nr_words += len(tagged)
        nr_modals += sum(1 for _, tag in tagged if tag.startswith('MD'))

    # A file without any tokens gets a score of 0 instead of raising
    # ZeroDivisionError.
    scores[text] = nr_modals / nr_words if nr_words else 0.0

for text in sortedByValue(scores , ascending = False):
    print( f'{text}: {round( scores[text] *100 , 3) }% of the words are modal verbs.' )


## Exercise 6.4

Extract all the sentences from *HeartOfDarkness.txt* that contain an adjective in the superlative form. Write these sentences into a file named 'sentences.txt'. The code for the words in this category is 'JJS'.

In [None]:
from os.path import join 

# Exercise 6.4: write every sentence of Heart of Darkness that contains a
# superlative adjective (tag JJS) to sentences.txt.

# This cell's own import line only brings in `join`, so nltk is imported
# here to make the cell runnable on its own.
import nltk

path = join( 'Corpus' , 'HeartOfDarkness.txt' )
with open( path , encoding = 'utf-8') as fh:
    full_text = fh.read()

sentences = nltk.sent_tokenize(full_text)

# The context manager closes sentences.txt even if tokenising or tagging
# raises part-way through the text.
with open('sentences.txt' , 'w' , encoding = 'utf-8') as out:
    for s in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(s))
        # any() stops at the first JJS token, so each sentence is
        # written at most once.
        if any(tag.startswith('JJS') for _, tag in tagged):
            out.write(f'{s}\n')
    

## Exercise 6.5

Extract all the sentences from *ARoomWithaView.txt* that contain a form of the verb 'to see', in any tense or conjugation, except the infinitive form. In other words, extract sentences containing forms such as 'seen', 'saw' or 'seeing', but not 'see'.


In [None]:
import nltk
from nltk import word_tokenize , sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
from tdm import *


from os.path import join 

# Exercise 6.5: print every sentence of A Room with a View that contains an
# inflected form of 'to see' ('saw', 'seen', 'seeing', ...) but skip
# sentences whose only form is the bare 'see'.
path = join( 'Corpus' , 'ARoomWithaView.txt' )
with open( path , encoding = 'utf-8') as fh:
    full_text = fh.read()


lemmatiser = WordNetLemmatizer()
sentences = sent_tokenize(full_text)


for sent in sentences:
    tagged = nltk.pos_tag(word_tokenize(sent))

    for token, ptb_tag in tagged:

        posTag = ptb_to_wordnet( ptb_tag )

        # Only lemmatise when the helper returned a tag containing at least
        # one word character.
        # NOTE(review): presumably ptb_to_wordnet returns an empty string for
        # tags without a WordNet equivalent -- confirm against the helper.
        if re.search( r'\w+' , posTag ):
            # The WordNet lemmatiser is case-sensitive, so a capitalised
            # form such as a sentence-initial 'Seeing' or 'Saw' would not be
            # reduced to 'see'. Lower-case the token first.
            token = token.lower()
            lemma = lemmatiser.lemmatize( token , posTag )
            if lemma == 'see' and token != 'see':
                print( f'{sent}\n' )
                # One match is enough; avoid printing the sentence twice.
                break
            

## Exercise 6.6

From *HeartOfDarkness.txt*, extract all sentences containing the following combination of categories:

* Article - adverb - adjective - noun 

These categories can be assigned the following codes:

* Article: DT
* Adverb: RB, RBR or RBS
* Adjective: JJ, JJR or JJS
* Noun: NN, NNP, NNPS or NNS



In [None]:
import nltk
from nltk import word_tokenize , sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
from tdm import *


from os.path import join 

# Exercise 6.6: print every sentence of Heart of Darkness that contains the
# tag sequence article - adverb - adjective - noun.
# The exercise specifies HeartOfDarkness.txt as the text to read.
path = join( 'Corpus' , 'HeartOfDarkness.txt' )
with open( path , encoding = 'utf-8') as fh:
    full_text = fh.read()

# DT, then RB/RBR/RBS, then JJ/JJR/JJS, then NN/NNS/NNP/NNPS.
# The word boundaries stop 'DT' from matching inside the tags PDT or WDT
# and stop the noun tag from matching a prefix of some longer tag.
pattern = re.compile( r'\bDT RB[RS]? JJ[RS]? NN(?:S|PS?)?\b' )

for sent in sent_tokenize(full_text):
    words = remove_punctuation( word_tokenize(sent) )

    # Represent the sentence as its space-separated tag sequence so that the
    # category combination can be found with a single regular expression.
    # pos_tag is called through the nltk namespace because only the nltk
    # package itself is imported by name in this cell.
    tagged_sentence = ' '.join( tag for _, tag in nltk.pos_tag(words) )

    if pattern.search(tagged_sentence):
        print(sent)
        