# Lexicones

Se realizarán entrenamientos agregando características extraídas de diferentes lexicones disponibles en la web, junto con un lexicón creado a partir del set de entrenamiento.

In [1]:
import datasets
import re
import pandas as pd
import numpy as np
from time import time
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## SentiWordNet

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag, download

In [3]:
# Fetch the NLTK data packages used below: the WordNet and SentiWordNet
# corpora, the Open Multilingual WordNet data (omw-1.4) and the
# averaged-perceptron POS tagger model (presumably the model behind pos_tag).
download('wordnet')
download('sentiwordnet')
download('omw-1.4')
download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def penn_to_wn(tag):
    """
    Convert a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: 'J' -> wn.ADJ,
    'N' -> wn.NOUN, 'R' -> wn.ADV and 'V' -> wn.VERB.

    Args:
        tag: Penn Treebank tag such as 'NN', 'VBZ' or 'JJ'.

    Returns:
        The WordNet POS constant, or None when the tag is empty/None or its
        first letter has no WordNet counterpart (e.g. 'DT', 'IN', '.').
    """
    # Empty or missing tags cannot be indexed; treat them as "no WordNet POS"
    # instead of raising IndexError/TypeError.
    if not tag:
        return None

    answers = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
    return answers.get(tag[0])

In [5]:
# Module-level WordNet lemmatizer instance; get_sentiment reads this global.
lemmatizer = WordNetLemmatizer()

def get_sentiment(word, tag):
    """
    Look up the SentiWordNet scores of a POS-tagged word.

    Args:
        word: The token to score.
        tag: Penn Treebank POS tag of the token (as produced by pos_tag).

    Returns:
        A (positive, negative, objective) tuple of scores taken from the
        first WordNet sense of the word's lemma. The neutral tuple
        (0.0, 0.0, 1.0) is returned when the tag has no WordNet part of
        speech, when the word has no lemma or no synsets, or when the synset
        is missing from SentiWordNet.
    """
    neutral = (0.0, 0.0, 1.0)

    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        return neutral

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return neutral

    # Query WordNet with the lemma computed above rather than the raw token,
    # so the lemmatization step actually drives the lookup.
    synsets = wn.synsets(lemma, pos=wn_tag)
    if not synsets:
        return neutral

    # Take the first sense, the most common
    swn_synset = swn.senti_synset(synsets[0].name())
    # senti_synset yields None for synsets absent from the SentiWordNet
    # database; fall back to neutral instead of failing on attribute access.
    if swn_synset is None:
        return neutral

    return (swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score())

In [7]:
# Demo: split a sample text into sentences, POS-tag the tokens of each
# sentence and print the (pos, neg, obj) tuple get_sentiment returns per token.
text = 'this movie is wonderful to see. yesterday i was bored as fuck'
sentences = sent_tokenize(text)
for sentence in sentences:
    words_data = word_tokenize(sentence)

    # pos_val holds (token, tag) pairs; each pair is scored independently.
    pos_val = pos_tag(words_data)
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]

    print(f"pos_val is {pos_val}")
    print(f"senti_val is {senti_val}")

pos_val is [('this', 'DT'), ('movie', 'NN'), ('is', 'VBZ'), ('wonderful', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('.', '.')]
senti_val is [(0.0, 0.0, 1.0), (0.0, 0.0, 1.0), (0.25, 0.125, 0.625), (0.75, 0.0, 0.25), (0.0, 0.0, 1.0), (0.0, 0.0, 1.0), (0.0, 0.0, 1.0)]
pos_val is [('yesterday', 'NN'), ('i', 'NN'), ('was', 'VBD'), ('bored', 'VBN'), ('as', 'IN'), ('fuck', 'NN')]
senti_val is [(0.0, 0.0, 1.0), (0.0, 0.0, 1.0), (0.25, 0.125, 0.625), (0.125, 0.0, 0.875), (0.0, 0.0, 1.0), (0.0, 0.0, 1.0)]
