# IIC-3670 NLP UC

Versiones de librerías (Python 3.8.10):

- numpy 1.20.3
- nltk 3.7
- lime 0.2.0.1
- spacy 3.5.1
- gcsfs 2023.3.0
- protobuf 3.20.3


## Vamos a usar los reviews de la Canon G3

In [1]:
from nltk.corpus import product_reviews_1
# Load every review of the Canon G3 from NLTK's product_reviews_1 corpus.
camera_reviews = product_reviews_1.reviews('Canon_G3.txt')

# sents() yields each sentence as a sequence of string tokens: join the tokens
# of a sentence with spaces, then join the sentences of a review the same way,
# so that every review becomes one plain string.
reviews = [
    " ".join(" ".join(tokens) for tokens in entry.sents())
    for entry in camera_reviews
]


## Limpiamos el texto antes de usarlo

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    """Clean *document* and return it as a single space-separated string.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is tokenized with the module-level ``tokenizer``. A token is kept only
    when it is longer than two characters and its lowercase form is not in
    ``stop_words``; kept tokens are lowercased and passed through
    ``lemmatizer.lemmatize``.
    """
    kept = []

    for sentence in sent_tokenize(document):
        for raw in tokenizer.tokenize(sentence):
            lowered = raw.lower()
            # Drop very short tokens and stop words.
            if len(raw) <= 2 or lowered in stop_words:
                continue
            kept.append(lemmatizer.lemmatize(lowered))

    return ' '.join(kept)

In [3]:
# Run every raw review through tokenize() to build the cleaned corpus.
corpus = [tokenize(raw_review) for raw_review in reviews]


## Vamos a trabajar con el lexicón VADER (Valence Aware Dictionary and sEntiment Reasoner)

In [4]:
import nltk

# Fetch the 'vader_lexicon' NLTK data package; the log below shows that it is
# left untouched when the local copy is already up to date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/marcelo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Y vamos a usar el analyzer que mostré en la lámina (compound score)

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create the SentimentIntensityAnalyzer instance used to score the documents;
# the labeling cell reads the 'compound' entry of its polarity_scores() result.
analyzer = SentimentIntensityAnalyzer()  # compound in [-1,1]

Ver la documentación de VADER en: <https://www.nltk.org/api/nltk.sentiment.vader.html>

## Usamos unos umbrales de compound para anotar los reviews como positivos, negativos o neutrales

In [6]:
# Label each document from its VADER compound score: above 0.2 is Positive,
# below -0.2 is Negative, and everything in between (bounds included) is
# Neutral.
label = []
for document in corpus:
    # Score each document once and reuse the value for both threshold checks.
    compound = analyzer.polarity_scores(document)['compound']
    if compound > 0.2:
        label.append('Positive')  # positive sentiment
    elif compound < -0.2:
        label.append('Negative')  # negative sentiment
    else:
        label.append('Neutral')  # neutral sentiment

In [7]:
import pandas as pd

# Build the table in one step: one row per document, holding its cleaned text
# and the sentiment label assigned to it. The trailing expression displays it.
df = pd.DataFrame({'review': corpus, 'polarities': label})
df

Unnamed: 0,review,polarities
0,recently purchased canon powershot extremely s...,Positive
1,yep first digital camera toy software engineer...,Positive
2,extensive research comparing different megapix...,Positive
3,bought canon month ago say satisfied taken hun...,Positive
4,camera one full day say wonderful photo qualit...,Positive
5,positive slr like programming exposure control...,Positive
6,camera wonderful set feature lcd screen pull r...,Positive
7,recent price drop made best bargain digital ca...,Positive
8,recommend unreservedly powershot potential buy...,Positive
9,else say camera work make photograph work want...,Positive


## Usemos algunas características lingüísticas para analizar el texto

### Usaremos la librería NELA https://pypi.org/project/nela-features/

In [8]:
!pip3 install nela_features



## Vemos primero las características estilísticas

In [23]:
from nela_features.nela_features import NELAFeatureExtractor

# Analyze a single document: the first cleaned review in the corpus.
# NOTE(review): corpus[0] has already been lowercased and stripped of stop
# words and punctuation by tokenize(), so punctuation, all-caps and stop-word
# style features presumably cannot fire on it. Consider passing reviews[0]
# (the raw text) instead. TODO confirm.
review = corpus[0] 

nela = NELAFeatureExtractor()

# extract_style() returns a pair: the feature values and their names.
feature_vector, feature_names = nela.extract_style(review) 


In [24]:
feature_names

['quotes',
 'exclaim',
 'allpunc',
 'allcaps',
 'stops',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNS',
 'NNP',
 'NNPS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'WP$',
 'WRB',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 '$',
 "''",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 '``']

In [25]:
len(feature_vector)

50

## Ahora las de complejidad del texto

In [12]:
# Extract the complexity feature group for the same review; this overwrites
# the previous feature_vector / feature_names.
# NOTE(review): `review` is the cleaned text without sentence punctuation,
# which may distort sentence-based readability scores. TODO confirm.
feature_vector, feature_names = nela.extract_complexity(review) 

In [13]:
feature_names

['ttr',
 'avg_wordlen',
 'word_count',
 'flesch_kincaid_grade_level',
 'smog_index',
 'coleman_liau_index',
 'lix']

## ... sesgo

In [14]:
# Extract the bias feature group for the same review; this overwrites the
# previous feature_vector / feature_names.
feature_vector, feature_names = nela.extract_bias(review)

In [15]:
feature_names

['bias_words',
 'assertatives',
 'factives',
 'hedges',
 'implicatives',
 'report_verbs',
 'positive_opinion_words',
 'negative_opinion_words']

## ... para afecto, NELA usa VADER además de otros lexicones

In [16]:
# Extract the affect feature group for the same review; this overwrites the
# previous feature_vector / feature_names.
feature_vector, feature_names = nela.extract_affect(review) 

In [17]:
feature_names

['vadneg', 'vadneu', 'vadpos', 'wneg', 'wpos', 'wneu', 'sneg', 'spos', 'sneu']

## ... y algunas features en dimensiones morales

In [18]:
# Extract the moral feature group for the same review; this overwrites the
# previous feature_vector / feature_names.
feature_vector, feature_names = nela.extract_moral(review) 

In [19]:
feature_names

['HarmVirtue',
 'HarmVice',
 'FairnessVirtue',
 'FairnessVice',
 'IngroupVirtue',
 'IngroupVice',
 'AuthorityVirtue',
 'AuthorityVice',
 'PurityVirtue',
 'PurityVice',
 'MoralityGeneral']

In [20]:
feature_vector

[0.0,
 0.0,
 0.0,
 0.0,
 0.008264462809917356,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.024793388429752067]

## Podemos extraer todas las características de una vez

In [21]:
# Extract all feature groups at once.
# NOTE(review): judging by the output below, the groups appear to be
# concatenated in the order style, complexity, bias, affect, moral. Confirm
# against the nela_features documentation.
feature_vector, feature_names = nela.extract_all(review)

In [22]:
feature_vector

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.01652892561983471,
 0.17355371900826447,
 0.024793388429752067,
 0.008264462809917356,
 0.0,
 0.0,
 0.48760330578512395,
 0.008264462809917356,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08264462809917356,
 0.0,
 0.0,
 0.008264462809917356,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.01652892561983471,
 0.05785123966942149,
 0.03305785123966942,
 0.024793388429752067,
 0.05785123966942149,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.7603305785123967,
 5.8429752066115705,
 121,
 40.4997,
 33.49590136395381,
 18.3672,
 156.53719008264463,
 0.19008264462809918,
 0.0,
 0.0,
 0.01652892561983471,
 0.01652892561983471,
 0.01652892561983471,
 0.15702479338842976,
 0.01652892561983471,
 0.036,
 0.597,
 0.366,
 0.05785123966942149,
 0.10743801652892562,
 0.024793388429752067,
 0.008264462809917356,
 0.08264462809917356,
 0.01652892561983471,
 0.0,
 0.0,
 0.0,
 0.0,
 0.008264462809917356,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,