# Aula 2

## Tokenização e N-grama

In [1]:
import pandas as pd

df = pd.DataFrame({
    'text': [
      'Eu gosto de assistir jogos de futebol',
      'Já eu, prefiro assistir jogos de basquete'
    ]
    })

df.head()

Unnamed: 0,text
0,Eu gosto de assistir jogos de futebol
1,"Já eu, prefiro assistir jogos de basquete"


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(1,1))
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

          0  1
assistir  1  1
basquete  0  1
de        2  1
eu        1  1
futebol   1  0
gosto     1  0
jogos     1  1
já        0  1
prefiro   0  1


In [3]:
text_vect.A

array([[1, 0, 2, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 0, 0, 1, 1, 1]], dtype=int64)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(2,2))
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

                  0  1
assistir jogos    1  1
de assistir       1  0
de basquete       0  1
de futebol        1  0
eu gosto          1  0
eu prefiro        0  1
gosto de          1  0
jogos de          1  1
já eu             0  1
prefiro assistir  0  1


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(3,3))
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

                        0  1
assistir jogos de       1  1
de assistir jogos       1  0
eu gosto de             1  0
eu prefiro assistir     0  1
gosto de assistir       1  0
jogos de basquete       0  1
jogos de futebol        1  0
já eu prefiro           0  1
prefiro assistir jogos  0  1


## Tokenização

In [6]:
!pip install -U nltk 

Requirement already up-to-date: nltk in c:\programdata\anaconda3\lib\site-packages (3.4.1)


In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.tokenize import word_tokenize
exemplo = 'O futebol brasileiro é o melhor do mundo. Você concorda?'
words = word_tokenize(exemplo)
words

['O',
 'futebol',
 'brasileiro',
 'é',
 'o',
 'melhor',
 'do',
 'mundo',
 '.',
 'Você',
 'concorda',
 '?']

## Regex

In [9]:
import re
regex = r"(?<=@)[^.]+(?=\.)"
re.findall(regex, "dhenyt@gmail.com")

['gmail']

In [10]:
s = "dhenyt@gmail.com"
s.split("@")[1].split(".")[0]

'gmail'

## Stop-words

In [11]:
import nltk
nltk.download('stopwords')
nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['de',
 'a',
 'o',
 'que',
 'e',
 'do',
 'da',
 'em',
 'um',
 'para',
 'com',
 'não',
 'uma',
 'os',
 'no',
 'se',
 'na',
 'por',
 'mais',
 'as',
 'dos',
 'como',
 'mas',
 'ao',
 'ele',
 'das',
 'à',
 'seu',
 'sua',
 'ou',
 'quando',
 'muito',
 'nos',
 'já',
 'eu',
 'também',
 'só',
 'pelo',
 'pela',
 'até',
 'isso',
 'ela',
 'entre',
 'depois',
 'sem',
 'mesmo',
 'aos',
 'seus',
 'quem',
 'nas',
 'me',
 'esse',
 'eles',
 'você',
 'essa',
 'num',
 'nem',
 'suas',
 'meu',
 'às',
 'minha',
 'numa',
 'pelos',
 'elas',
 'qual',
 'nós',
 'lhe',
 'deles',
 'essas',
 'esses',
 'pelas',
 'este',
 'dele',
 'tu',
 'te',
 'vocês',
 'vos',
 'lhes',
 'meus',
 'minhas',
 'teu',
 'tua',
 'teus',
 'tuas',
 'nosso',
 'nossa',
 'nossos',
 'nossas',
 'dela',
 'delas',
 'esta',
 'estes',
 'estas',
 'aquele',
 'aquela',
 'aqueles',
 'aquelas',
 'isto',
 'aquilo',
 'estou',
 'está',
 'estamos',
 'estão',
 'estive',
 'esteve',
 'estivemos',
 'estiveram',
 'estava',
 'estávamos',
 'estavam',
 'estivera',
 'es

## Normalização de Texto

In [12]:
ex1 = 'O carro que estava quebrado voltou a funcionar'
ex2 = 'Meu carro quebrou e não está funcionando'

print(word_tokenize(ex1))
print(word_tokenize(ex2))

['O', 'carro', 'que', 'estava', 'quebrado', 'voltou', 'a', 'funcionar']
['Meu', 'carro', 'quebrou', 'e', 'não', 'está', 'funcionando']


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

df = pd.DataFrame({'text':[ex1,ex2]})

vect = CountVectorizer(ngram_range=(1,1))
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

             0  1
carro        1  1
estava       1  0
está         0  1
funcionando  0  1
funcionar    1  0
meu          0  1
não          0  1
que          1  0
quebrado     1  0
quebrou      0  1
voltou       1  0


## Stop-words

In [14]:
stops = nltk.corpus.stopwords.words('portuguese')

    vect = CountVectorizer(ngram_range=(1,1), stop_words=stops)
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

             0  1
carro        1  1
funcionando  0  1
funcionar    1  0
quebrado     1  0
quebrou      0  1
voltou       1  0


In [15]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
examples = [
   "go","going",
   "goes","gone","went"
]

wnl = WordNetLemmatizer()

for word in examples:
  print(wnl.lemmatize(word, 'v'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


go
go
go
go
go


In [16]:
!pip install spacy



In [17]:
!python -m spacy download pt_core_news_sm

[+] Download and installation successful
You can now load the model via spacy.load('pt_core_news_sm')


In [18]:
import spacy

nlp = spacy.load('pt_core_news_sm')
doc = nlp(u'quebrou,quebraram,quebrado,quebrariam')
[token.lemma_ for token in doc if token.pos_ == 'VERB']

['quebrar', 'quebrar', 'quebrar', 'quebrar']

In [19]:
import nltk
from nltk.stem import PorterStemmer

examples = [
    "connection","connections",
    "connective","connecting","connected"
]

ps = PorterStemmer()

for word in examples:
  print(ps.stem(word))

connect
connect
connect
connect
connect


In [20]:
from nltk.stem import PorterStemmer

examples = ["conecta","conectado","conectamos","desconectados","conectividade"]

ps = PorterStemmer()

for word in examples:
  print(ps.stem(word))

conecta
conectado
conectamo
desconectado
conectividad


In [23]:
import nltk
nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


True

In [24]:
from nltk.stem.rslp import RSLPStemmer

examples = ["conecta","conectado","conectamos","desconectados","conectividade"]

rslp = RSLPStemmer()

for word in examples:
  print(rslp.stem(word))

conect
conect
conect
desconect
conect


In [25]:
stem1 = " ".join([rslp.stem(x) for x in word_tokenize(ex1)])
stem2 = " ".join([rslp.stem(x) for x in word_tokenize(ex2)])

In [26]:
df = pd.DataFrame({'text':[stem1,stem2]})
stops = nltk.corpus.stopwords.words('portuguese')

vect = CountVectorizer(ngram_range=(1,1), stop_words=stops)
vect.fit(df.text)
text_vect = vect.transform(df.text)

print(pd.DataFrame(text_vect.A, columns=vect.get_feature_names()).T.to_string())

         0  1
carr     1  1
est      1  1
funcion  1  1
quebr    1  1
volt     1  0


## POS-Tagger

In [27]:
import nltk
import pandas as pd
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [28]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [29]:
text1 = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text1)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [30]:
nltk.download('brown')
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


man time day year car moment world house family child country boy
state job place way war girl work word


In [31]:
text.similar('bought')

made said done put had seen found given left heard was been brought
set got that took in told felt


In [32]:
text.similar('over')

in on to of and for with from at by that into as up out down through
is all about


In [33]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


In [34]:
nltk.download('floresta')
from nltk.corpus import floresta
floresta.tagged_words()

[nltk_data] Downloading package floresta to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\floresta.zip.


[('Um', '>N+art'), ('revivalismo', 'H+n'), ...]

In [35]:
def simplify_tag(t):
  if "+" in t:
    return t.split("+")[1]
  return t 

twords = nltk.corpus.floresta.tagged_words()
twords = [(w.lower(),simplify_tag(t)) for (w,t) in twords]
twords[:10]

[('um', 'art'),
 ('revivalismo', 'n'),
 ('refrescante', 'adj'),
 ('o', 'art'),
 ('7_e_meio', 'prop'),
 ('é', 'v-fin'),
 ('um', 'art'),
 ('ex-libris', 'n'),
 ('de', 'prp'),
 ('a', 'art')]

In [36]:
print(nltk.corpus.floresta.readme())

Portuguese Treebank

Projecto Floresta Sinta(c)tica -- http://www.linguateca.pt/Floresta/
Version 7.4  Distributed with permission.

Penn Treebank format, available from http://linguateca.di.uminho.pt/FS/fs.html

Key to tags (http://visl.sdu.dk/visl/pt/portsymbol.html)

<ACC          direct object
<ACC-PASS     passive use of pronoun 'se'
<ADVS, <ADVO  adverbial argument
<ADVL         adjunct adverbial
<DAT          dative (indirect) object
<FOC          focus marker (or right focus bracket)
<OC           object complement
<PASS         agent of passive
<PIV          prepositional object
<PRED         free (subject) predicative, right of main verb
<SC           subject complement
<SUBJ         subject
>A            adverbial pre-adject (intensifier before adjective, adverb, pronoun or participle)
>N            prenominal modifier
>P            modifier of prepositional phrase (intensifier, operator or focus adverb)
>S            modifier of clause (intensifier, operator or focus adverb

### Default Tagger

In [37]:
tags = [tag for (word, tag) in twords]
nltk.FreqDist(tags).max()

'n'

In [38]:
raw = 'Esse é um exemplo utilizando o marcador padrão'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('n')
default_tagger.tag(tokens)

[('Esse', 'n'),
 ('é', 'n'),
 ('um', 'n'),
 ('exemplo', 'n'),
 ('utilizando', 'n'),
 ('o', 'n'),
 ('marcador', 'n'),
 ('padrão', 'n')]

In [39]:
tsents = floresta.tagged_sents()
tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
train = tsents[1000:]
test = tsents[:1000]
tsents[1:3]

[[('o', 'art'),
  ('7_e_meio', 'prop'),
  ('é', 'v-fin'),
  ('um', 'art'),
  ('ex-libris', 'n'),
  ('de', 'prp'),
  ('a', 'art'),
  ('noite', 'n'),
  ('algarvia', 'adj'),
  ('.', '.')],
 [('é', 'v-fin'),
  ('uma', 'num'),
  ('de', 'prp'),
  ('as', 'art'),
  ('mais', 'adv'),
  ('antigas', 'adj'),
  ('discotecas', 'n'),
  ('de', 'prp'),
  ('o', 'art'),
  ('algarve', 'prop'),
  (',', ','),
  ('situada', 'v-pcp'),
  ('em', 'prp'),
  ('albufeira', 'prop'),
  (',', ','),
  ('que', 'pron-indp'),
  ('continua', 'v-fin'),
  ('a', 'prp'),
  ('manter', 'v-inf'),
  ('os', 'art'),
  ('traços', 'n'),
  ('decorativos', 'adj'),
  ('e', 'conj-c'),
  ('as', 'art'),
  ('clientelas', 'n'),
  ('de', 'prp'),
  ('sempre', 'adv'),
  ('.', '.')]]

In [40]:
tsents[1:3]

[[('o', 'art'),
  ('7_e_meio', 'prop'),
  ('é', 'v-fin'),
  ('um', 'art'),
  ('ex-libris', 'n'),
  ('de', 'prp'),
  ('a', 'art'),
  ('noite', 'n'),
  ('algarvia', 'adj'),
  ('.', '.')],
 [('é', 'v-fin'),
  ('uma', 'num'),
  ('de', 'prp'),
  ('as', 'art'),
  ('mais', 'adv'),
  ('antigas', 'adj'),
  ('discotecas', 'n'),
  ('de', 'prp'),
  ('o', 'art'),
  ('algarve', 'prop'),
  (',', ','),
  ('situada', 'v-pcp'),
  ('em', 'prp'),
  ('albufeira', 'prop'),
  (',', ','),
  ('que', 'pron-indp'),
  ('continua', 'v-fin'),
  ('a', 'prp'),
  ('manter', 'v-inf'),
  ('os', 'art'),
  ('traços', 'n'),
  ('decorativos', 'adj'),
  ('e', 'conj-c'),
  ('as', 'art'),
  ('clientelas', 'n'),
  ('de', 'prp'),
  ('sempre', 'adv'),
  ('.', '.')]]

In [41]:
tagger0 = nltk.DefaultTagger('n')
print(tagger0.evaluate(test))

0.17800040072129833


### Unigram Tagger

In [42]:
tagger1 = nltk.UnigramTagger(train)
print(tagger1.evaluate(test))

0.8522139851733119


### Bigram Tagger

In [43]:
tagger2 = nltk.BigramTagger(train)
print(tagger2.evaluate(test))

0.14626327389300742


### Combinação de Tagger

In [44]:
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
print('tagger1: ',tagger1.evaluate(test))
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
print('tagger2: ',tagger2.evaluate(test))

tagger1:  0.8740532959326788
tagger2:  0.8900420757363254


### Salvando Tagger

In [45]:
!pip install pickle
from pickle import dump
output = open('tagger.pkl', 'wb')
dump(tagger2, output, -1)
output.close()

Collecting pickle


  Could not find a version that satisfies the requirement pickle (from versions: )
No matching distribution found for pickle


In [48]:
from pickle import load
input = open('tagger.pkl', 'rb')
tagger = load(input)
input.close()

In [49]:
text1 = "Isso é para você."
text2 = "para com isso"
tokens1 = text1.split()
tokens2 = text2.split()
print('text1: ',tagger.tag(tokens1))
print('text2: ',tagger.tag(tokens2))

text1:  [('Isso', 'n'), ('é', 'v-fin'), ('para', 'prp'), ('você.', 'n')]
text2:  [('para', 'prp'), ('com', 'prp'), ('isso', 'pron-indp')]


## TextBlob

In [50]:
!pip install textblob

Collecting textblob
  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [63]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

nltk.download('movie_reviews')
opinion = TextBlob("This movie was horrible!", analyzer=NaiveBayesAnalyzer())
opinion.sentiment

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\logonrmlocal\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Sentiment(classification='neg', p_pos=0.3205526272161921, p_neg=0.6794473727838077)

In [59]:
text = 'O estudo de processamento de linguagem natural é um dos campos mais promissores em inteligência artificial'
print("Frase original: ", text)

tblob = TextBlob(text)

print('Idioma: ',tblob.detect_language())

en_tblob = tblob.translate(to='en')

print("Traduzido: ", en_tblob)

print("Tags: ", en_tblob.tags)

Frase original:  O estudo de processamento de linguagem natural é um dos campos mais promissores em inteligência artificial
Idioma:  pt
Traduzido:  The study of natural language processing is one of the most promising fields in artificial intelligence
Tags:  [('The', 'DT'), ('study', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('most', 'RBS'), ('promising', 'JJ'), ('fields', 'NNS'), ('in', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN')]


## SpaCy

In [53]:
import spacy
nlp = spacy.load('pt_core_news_sm')
doc = nlp(u'Ayrton Senna foi o melhor piloto de Fórmula 1 que já existiu')
print([token.orth_ for token in doc])

['Ayrton', 'Senna', 'foi', 'o', 'melhor', 'piloto', 'de', 'Fórmula', '1', 'que', 'já', 'existiu']


In [54]:
[(token.orth_, token.pos_) for token in doc]

[('Ayrton', 'PROPN'),
 ('Senna', 'PROPN'),
 ('foi', 'VERB'),
 ('o', 'DET'),
 ('melhor', 'ADJ'),
 ('piloto', 'NOUN'),
 ('de', 'ADP'),
 ('Fórmula', 'PROPN'),
 ('1', 'PROPN'),
 ('que', 'PRON'),
 ('já', 'ADV'),
 ('existiu', 'VERB')]

In [68]:
print('filtrando apenas verbos: ')
print([token.lemma_ for token in doc if token.pos_ == 'VERB'])
print('identificação de entidades: ')
doc1 = nlp(u'Machado de Assis um dos melhores escritores do Brasil, \
foi o primeiro presidente da Academia Brasileira de Letras')
print(doc1.ents)

filtrando apenas verbos: 
['ser', 'existir']
identificação de entidades: 
(Machado de Assis, Brasil, Academia Brasileira de Letras)
