# 11. Part of Speech Tagging and Lemmatisation

### Exercise 11.1

Create a list containing the unique adjectives that occur in *Pride and Prejudice*.

In [None]:
import nltk
from nltk import word_tokenize,pos_tag
from text_mining import *
from collections import Counter

# Read the complete text of the novel from the corpus folder.
path = os.path.join('..', 'Corpus','PrideAndPrejudice.txt')
with open(path, encoding='utf-8') as file:
    full_text = file.read()

# Tokenise the text, pass the tokens through remove_punctuation (a helper
# that is not defined in this cell) and attach a part-of-speech tag to
# every remaining token.
words = remove_punctuation(word_tokenize(full_text))
pos = pos_tag(words)

# Tags treated as adjectives: JJ, JJR and JJS.
adj_codes = ['JJ','JJR','JJS']

# Keep the token of every (token, tag) pair whose tag is an adjective tag.
adjectives = [token for token, tag in pos if tag in adj_codes]

# Count how often each adjective occurs and show the 20 most frequent ones.
freq = Counter(adjectives)
for word, count in freq.most_common(20):
    print(f'{word} => {count}')
 

### Exercise 11.2

Stephen King is [reputed to have said](https://www.goodreads.com/quotes/430289-i-believe-the-road-to-hell-is-paved-with-adverbs) that "the road to hell is paved with adverbs", and many style guides similarly advise writers to avoid adverbs, especially those ending in '-ly'.

Can you calculate, for each text in the corpus, the number of adverbs ending in '-ly', measured as a percentage of the total number of words?

In [None]:
import nltk
from nltk import word_tokenize,pos_tag
from text_mining import *
from collections import Counter

directory = os.path.join('..','Corpus')
files = os.listdir(directory)

# Penn Treebank tags for adverbs: RB, RBR and RBS.
adverb_codes = ('RB', 'RBR', 'RBS')
# Number of most frequent '-ly' adverbs to list per text.
number = 15

for file in files:
    path = os.path.join(directory, file)

    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(path):
        continue

    print(f"\n{file}")

    # A separate handle name keeps the loop variable `file` bound to the
    # file name instead of rebinding it to a closed file object.
    with open(path, encoding='utf-8') as fh:
        full_text = fh.read()

    words = word_tokenize(full_text.lower())
    words = remove_punctuation(words)
    nr_words = len(words)
    pos = pos_tag(words)

    # All tokens that are tagged as an adverb and end in 'ly'.
    adverbs = [
        token for token, tag in pos
        if tag in adverb_codes and token.endswith('ly')
    ]
    ly_adverbs = len(adverbs)
    freq = Counter(adverbs)

    # Multiply by 100 so the figure really is a percentage, and guard
    # against texts that yield no tokens at all.
    percentage = 100 * ly_adverbs / nr_words if nr_words else 0.0

    print(f"{ly_adverbs} adverbs ending in '-ly' in total.")
    print(f"This is {round(percentage, 2)}% of all the words ")

    if ly_adverbs > 0:
        print(f"{number} most frequent adverbs:")
        for word, count in freq.most_common(number):
            print(f'{word} => {count}')

### Exercise 11.3

Which text in the corpus has the highest number of modal verbs? The Penn Treebank code for 'modal auxiliaries' is MD.

In [None]:
import nltk
from nltk import word_tokenize,pos_tag
from text_mining import *
from collections import Counter

directory = os.path.join('..','Corpus')
files = os.listdir(directory)

# Number of most frequent modal verbs to list per text.
number = 10
# Maps each file name to its number of modal verbs, so that the texts can
# be compared once all of them have been processed.
modal_counts = {}

for file in files:
    path = os.path.join(directory, file)

    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(path):
        continue

    print(f"\n{file}")

    # A separate handle name keeps the loop variable `file` bound to the
    # file name, which is needed as a dictionary key below.
    with open(path, encoding='utf-8') as fh:
        full_text = fh.read()

    words = word_tokenize(full_text.lower())
    words = remove_punctuation(words)
    pos = pos_tag(words)

    # Tokens tagged MD. Tokens of one or two characters are skipped;
    # presumably these are tokeniser fragments of contractions such as
    # "ca" or "'d" -- TODO confirm.
    modal_verbs = [
        token for token, tag in pos
        if tag == 'MD' and len(token) > 2
    ]
    modal_counts[file] = len(modal_verbs)
    freq = Counter(modal_verbs)

    print(f"{len(modal_verbs)} modal verbs.")

    if len(modal_verbs) > 0:
        print(f"{number} most frequent modal verbs:")
        for word, count in freq.most_common(number):
            print(f'{word} => {count}')

# Answer the actual question: which text has the highest number of modals?
if modal_counts:
    top_text = max(modal_counts, key=modal_counts.get)
    print(f"\nHighest number of modal verbs: {top_text} ({modal_counts[top_text]})")

### Exercise 11.4

Extract all the sentences from *BraveNewWorld.txt* that contain an adjective in the superlative form. Write these sentences into a file named 'sentences.txt'. The code for the words in this category is 'JJS'.

In [None]:
import nltk
from nltk import word_tokenize,pos_tag
from text_mining import *
from collections import Counter

path = os.path.join('..', 'Corpus','BraveNewWorld.txt')

with open(path, encoding='utf-8') as file:
    full_text = file.read()

# NOTE(review): sent_tokenize is not imported in this cell; presumably it is
# provided by the star import of text_mining or by an earlier cell -- confirm.
sentences = sent_tokenize(full_text)

# The exercise asks for the matching sentences to be stored in
# 'sentences.txt'; they are also still printed, with the superlatives that
# were found shown in brackets.
with open('sentences.txt', 'w', encoding='utf-8') as out:
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = remove_punctuation(words)
        pos = pos_tag(words)

        # Tokens tagged JJS (superlative adjective).
        adj = [token for token, tag in pos if tag == 'JJS']

        if adj:
            print(f"{sentence} [{'|'.join(adj)}]")
            # Collapse internal line breaks so that the output file holds
            # exactly one sentence per line.
            out.write(' '.join(sentence.split()) + '\n')


### Exercise 11.5

Extract all the sentences from *Ullyses.txt* containing a form of the verb 'to see', in all tenses and conjugations except the infinitive form. In other words, extract sentences containing forms such as 'seen', 'saw' or 'seeing', but not 'see'.


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
from text_mining import *
lemmatiser = WordNetLemmatizer()


path = os.path.join('..', 'Corpus','Ullyses.txt')

with open(path, encoding='utf-8') as file:
    full_text = file.read()

sentences = sent_tokenize(full_text)

for sentence in sentences:

    # Tokens are lower-cased here, so they can be compared directly with
    # the lower-case string 'see' below.
    words = word_tokenize(sentence.lower())
    words = remove_punctuation(words)

    pos = nltk.pos_tag(words)

    # Inflected forms of 'to see' found in this sentence.
    hits = []

    for word, (_, tag) in zip(words, pos):
        # ptb_to_wordnet is defined outside this cell; presumably it maps a
        # Penn Treebank tag to a WordNet POS code and returns an empty
        # string when there is no counterpart -- TODO confirm.
        posTag = ptb_to_wordnet(tag)

        # Without a usable POS code the word cannot be lemmatised. The only
        # thing that could be tested in that case is the literal form
        # 'see', which the exercise excludes, so the token is skipped.
        if not re.search(r'\w+', posTag, re.IGNORECASE):
            continue

        # Keep forms whose lemma is 'see' (e.g. 'saw', 'seen', 'seeing'),
        # but not the bare form 'see' itself, as the exercise requires.
        if word != 'see' and lemmatiser.lemmatize(word, posTag) == 'see':
            hits.append(word)

    if hits:
        print(f"{sentence}\n---")
        

### Exercise 11.6

From *Ullyses.txt*, extract all sentences containing the following combinations of categories: 

* Article - adverb - adjective - noun 

These categories can be assigned the following codes:

* Article: DT
* Adverb: RB, RBR or RBS
* Adjective: JJ, JJR or JJS
* Noun: NN, NNP, NNPS or NNS


In [None]:
import nltk
from nltk import word_tokenize , sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
from tdm import *


from os.path import join 

path = join('..', 'Corpus','Ullyses.txt' )
with open(path, encoding='utf-8') as fh:
    full_text = fh.read()

sentences = sent_tokenize(full_text)

# Article - adverb - adjective - noun, expressed over a space-separated
# string of tags:
#   DT          article
#   RB[RS]?     RB, RBR or RBS
#   JJ[RS]?     JJ, JJR or JJS
#   NNP?S?      NN, NNP, NNS or NNPS
# The word boundaries stop 'DT' from matching inside 'PDT' or 'WDT' and make
# sure that each tag is matched as a whole.
tag_pattern = re.compile(r'\bDT RB[RS]? JJ[RS]? NNP?S?\b')

for sentence in sentences:
    # Put the sentence on a single line before tokenising and printing it.
    sentence = re.sub(r'\n',' ',sentence)
    words = word_tokenize(sentence)
    words = remove_punctuation(words)
    # pos_tag is not imported by name in this cell, so it is called through
    # the nltk module, which is imported here.
    pos = nltk.pos_tag(words)

    # One string holding only the tags, in sentence order.
    tagged_sentence = ' '.join(tag for _, tag in pos)

    if tag_pattern.search(tagged_sentence):
        print(f"{sentence}\n---")
        