# TD2 - Machine Learning for NLP
## Michele MOGAVERO

In [28]:
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

We use the first 3000 tagged sentences of the treebank corpus as the training set to initialize the UnigramTagger class.

In [29]:
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
review = treebank.sents()[0]
print(review, '\n')
print(tagger.tag(treebank.sents()[0])) # After tagging

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


After showing how the treebank and UnigramTagger works, we use an other approch to differentiate the solutions about tagging.

In [30]:
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Michele\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Michele\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
review="Today I feel so lucky and happy!" # test

In [32]:
token = nltk.word_tokenize(review)
after_tagging = nltk.pos_tag(token)
print (token)
print (after_tagging)

['Today', 'I', 'feel', 'so', 'lucky', 'and', 'happy', '!']
[('Today', 'NN'), ('I', 'PRP'), ('feel', 'VBP'), ('so', 'RB'), ('lucky', 'JJ'), ('and', 'CC'), ('happy', 'JJ'), ('!', '.')]


We only keep 'RB' and 'JJ', which are respectively adverbs and adjectives.

In [33]:
lemmatizer = WordNetLemmatizer()

A function that converts the treebank tags to wordnet, which is more verbose. We also want to keep the NOUN and VERB because in a future development they may also be considered.

In [34]:
def treebank_to_wn(tag):
    """
    Converts from TreeBank to WordNet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

This script takes tagged list, keeps only the ADJ and ADV, lemmatizes the word, and skip it if not found.
After that it finds an average among the words sentiment.

In [35]:
sentiment = 0.0
tokens_count = 0
for word, tag in after_tagging:
    wn_tag = treebank_to_wn(tag) # convert to wn tag
    if wn_tag not in (wn.ADJ, wn.ADV):
        continue

    lemma = lemmatizer.lemmatize(word, pos=wn_tag) # lemmatization
    if not lemma:
        continue

    synsets = wn.synsets(lemma, pos=wn_tag) # if not present in synsets
    if not synsets:
        continue

    synset = synsets[0] # we take the most common sense
    swn_synset = swn.senti_synset(synset.name())
    print(swn_synset)
    sentiment += swn_synset.pos_score() - swn_synset.neg_score()
    tokens_count += 1
score = sentiment/tokens_count
print('sentiment: ', score)

<so.r.01: PosScore=0.0 NegScore=0.0>
<lucky.s.01: PosScore=0.75 NegScore=0.0>
<happy.a.01: PosScore=0.875 NegScore=0.0>
sentiment:  0.5416666666666666


Copy of the whole function:

In [36]:
def eval_sentiment(path):
    sentiment = 0.0
    tokens_count = 0
    lemmatizer = WordNetLemmatizer()
    with open(path, 'r') as f:
        review = f.read()
        token = nltk.word_tokenize(review)
        after_tagging = nltk.pos_tag(token)
        for word, tag in after_tagging:
            wn_tag = treebank_to_wn(tag) # convert to wn tag
            if wn_tag not in (wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag) # lemmatization
            if not lemma:
                continue
            synsets = wn.synsets(lemma, pos=wn_tag) # if not present in synsets
            if not synsets:
                continue
        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1

        print(sentiment)

In [37]:
pos_review_path = './txt_sentoken/pos/cv000_29590.txt'
neg_review_path = './txt_sentoken/neg/cv000_29416.txt'

eval_sentiment(pos_review_path)

0.875
