# Aula 2

## Tokenização e N-grama

In [None]:
import pandas as pd

# Two-sentence toy corpus used by the n-gram examples; a single 'text' column.
df = pd.DataFrame(
    [
        'Eu gosto de assistir jogos de futebol',
        'Já eu, prefiro assistir jogos de basquete',
    ],
    columns=['text'],
)

# Last expression of the cell: rich display of the (tiny) frame.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over single tokens (unigrams): learn the vocabulary and count
# occurrences per document in one pass.
vect = CountVectorizer(ngram_range=(1,1))
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 (get_feature_names_out()
# is its replacement) and the sparse `.A` shortcut is deprecated in recent
# SciPy; toarray() is the supported way to densify the count matrix.
# Transposed so each row is a vocabulary term and each column a document.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

In [None]:
# Dense view of the sparse count matrix. toarray() replaces the `.A` shortcut,
# which is deprecated in recent SciPy releases.
text_vect.toarray()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over pairs of consecutive tokens (bigrams only).
vect = CountVectorizer(ngram_range=(2,2))
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated in recent SciPy; use the supported equivalents.
# Transposed so each row is a bigram and each column a document.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over triples of consecutive tokens (trigrams only).
vect = CountVectorizer(ngram_range=(3,3))
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated in recent SciPy; use the supported equivalents.
# Transposed so each row is a trigram and each column a document.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

## Tokenização

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data, which is not bundled
# with NLTK and was never downloaded in this notebook, so a fresh kernel raised
# LookupError here. Newer NLTK releases look for 'punkt_tab', older ones for
# 'punkt'; request both (download() reports a failure instead of raising).
nltk.download('punkt')
nltk.download('punkt_tab')

exemplo = 'O futebol brasileiro é o melhor do mundo. Você concorda?'
words = word_tokenize(exemplo)
words

## Regex

In [None]:
import re
# Pattern: one or more non-dot characters preceded by '@' (lookbehind) and
# followed by a literal '.' (lookahead) — the lookarounds are not captured.
regex = r"(?<=@)[^.]+(?=\.)"
re.compile(regex).findall("dhenyt@gmail.com")

In [None]:
# Same extraction without a regex: take what follows '@', then what precedes '.'.
s = "dhenyt@gmail.com"
s.partition("@")[2].partition(".")[0]

## Stop-words

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource, then display the Portuguese stop-word list.
nltk.download('stopwords')
nltk.corpus.stopwords.words('portuguese')

## Normalização de Texto

In [None]:
# Two sentences built from different inflections of the same words
# (quebrado/quebrou, funcionar/funcionando).
ex1 = 'O carro que estava quebrado voltou a funcionar'
ex2 = 'Meu carro quebrou e não está funcionando'

# Show the raw token list of each sentence, one per line.
for sentence in (ex1, ex2):
  print(word_tokenize(sentence))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Rebuild the working frame from the two example sentences.
df = pd.DataFrame({'text':[ex1,ex2]})

# Unigram counts on the raw (non-normalised) text.
vect = CountVectorizer(ngram_range=(1,1))
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated in recent SciPy; use the supported equivalents.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

## Stop-words

In [None]:
# Portuguese stop-word list from NLTK, handed to the vectorizer so those
# tokens are dropped from the vocabulary.
stops = nltk.corpus.stopwords.words('portuguese')

vect = CountVectorizer(ngram_range=(1,1), stop_words=stops)
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated in recent SciPy; use the supported equivalents.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

In [None]:
import nltk
# The lemmatizer reads the 'wordnet' resource, so fetch it first.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Inflected forms of one English verb.
examples = ["go", "going", "goes", "gone", "went"]

wnl = WordNetLemmatizer()

# Lemmatize each form as a verb ('v') and print one lemma per line.
print(*(wnl.lemmatize(form, 'v') for form in examples), sep="\n")

In [None]:
!pip install spacy
!python -m spacy download pt_core_news_sm

In [None]:
import spacy

# Load the small Portuguese pipeline and run it over four inflections of one verb.
nlp = spacy.load('pt_core_news_sm')
doc = nlp('quebrou,quebraram,quebrado,quebrariam')

# Keep only the lemmas of tokens that the pipeline tagged as verbs.
[tok.lemma_ for tok in doc if tok.pos_ == 'VERB']

In [None]:
import nltk
from nltk.stem import PorterStemmer

# English words sharing the root "connect".
examples = ["connection", "connections", "connective", "connecting", "connected"]

ps = PorterStemmer()

# Print the Porter stem of every example, one per line.
print(*map(ps.stem, examples), sep="\n")

In [None]:
from nltk.stem import PorterStemmer

# Portuguese words fed to the (English-oriented) Porter stemmer for comparison.
examples = [
    "conecta",
    "conectado",
    "conectamos",
    "desconectados",
    "conectividade",
]

ps = PorterStemmer()

# One stem per output line.
print("\n".join(ps.stem(term) for term in examples))

In [None]:
import nltk
from nltk.stem.rslp import RSLPStemmer

# RSLPStemmer loads its rule files from the 'rslp' NLTK resource, which was
# never downloaded in this notebook, so the constructor raised LookupError on
# a fresh kernel.
nltk.download('rslp')

examples = ["conecta","conectado","conectamos","desconectados","conectividade"]

# Stemmer designed for Portuguese (RSLP).
rslp = RSLPStemmer()

# Print one stem per line.
for word in examples:
  print(rslp.stem(word))

In [None]:
# Tokenize each example sentence, stem every token with RSLP and glue the
# stems back together with single spaces.
stem1 = " ".join(map(rslp.stem, word_tokenize(ex1)))
stem2 = " ".join(map(rslp.stem, word_tokenize(ex2)))

In [None]:
# Working frame now holds the stemmed versions of the two example sentences.
df = pd.DataFrame({'text':[stem1,stem2]})
stops = nltk.corpus.stopwords.words('portuguese')

# Unigram counts over the stemmed text, dropping Portuguese stop words.
vect = CountVectorizer(ngram_range=(1,1), stop_words=stops)
text_vect = vect.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated in recent SciPy; use the supported equivalents.
print(pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out()).T.to_string())

## POS-Tagger

In [None]:
import nltk
import pandas as pd
# nltk.pos_tag needs the averaged-perceptron model. Recent NLTK releases look
# it up under the name 'averaged_perceptron_tagger_eng', older ones under the
# unsuffixed name; fetch both so the tagging cells work on either version
# (download() reports a failure instead of raising).
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Tokenize an English sentence and display the tag assigned to each token.
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

In [None]:
# The sentence uses "refuse" and "permit" twice each, so the output shows
# which tag the tagger assigns to each occurrence.
text1 = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text1)

In [None]:
# Fetch the Brown corpus and wrap its lower-cased words in an nltk.Text.
nltk.download('brown')
text = nltk.Text(map(str.lower, nltk.corpus.brown.words()))

# Query the text for words similar to 'woman'.
text.similar('woman')

In [None]:
# Same similarity query on `text`, this time for the word 'bought'.
text.similar('bought')

In [None]:
# Same similarity query on `text`, this time for the word 'over'.
text.similar('over')

In [None]:
# Same similarity query on `text`, this time for the word 'the'.
text.similar('the')

In [None]:
# Download the 'floresta' corpus and display its tagged words
# (unpacked as (word, tag) pairs by the cells that follow).
nltk.download('floresta')
from nltk.corpus import floresta
floresta.tagged_words()

In [None]:
def simplify_tag(t):
  """Reduce a '+'-joined tag to its second field.

  The tag is split on '+'; when there is more than one field the second one
  is returned, otherwise the tag is returned unchanged.
  """
  fields = t.split("+")
  return fields[1] if len(fields) > 1 else t

# Lower-case every word of the corpus and simplify its tag in a single pass.
twords = [
    (word.lower(), simplify_tag(tag))
    for word, tag in nltk.corpus.floresta.tagged_words()
]

# Peek at the first ten (word, tag) pairs.
twords[:10]

In [None]:
# Print the README text that ships with the floresta corpus.
print(nltk.corpus.floresta.readme())

### Default Tagger

In [None]:
# Collect just the tag of every (word, tag) pair ...
tags = [tag for _, tag in twords]

# ... and display the most frequent one.
nltk.FreqDist(tags).max()

In [None]:
# Tagger that assigns the same tag, 'n', to every token it is given.
default_tagger = nltk.DefaultTagger('n')

raw = 'Esse é um exemplo utilizando o marcador padrão'
tokens = nltk.word_tokenize(raw)

# Display the tagged tokens.
default_tagger.tag(tokens)

In [None]:
# Normalise every non-empty tagged sentence: lower-case the words and
# simplify the tags.
tsents = [
    [(word.lower(), simplify_tag(tag)) for word, tag in sent]
    for sent in floresta.tagged_sents()
    if sent
]

# First 1000 sentences are held out for evaluation; the rest is training data.
test, train = tsents[:1000], tsents[1000:]

tsents[1:3]

In [None]:
# Display the second and third tagged sentences (same slice the previous cell ends with).
tsents[1:3]

In [None]:
# Baseline: tag every token as 'n' and print the score on the held-out sentences.
# NOTE(review): evaluate() appears to be deprecated in favour of accuracy() in
# newer NLTK releases — confirm against the installed version.
tagger0 = nltk.DefaultTagger('n')
print(tagger0.evaluate(test))

### Unigram Tagger

In [None]:
# Unigram tagger trained on `train` (no backoff), scored on the held-out sentences.
tagger1 = nltk.UnigramTagger(train)
print(tagger1.evaluate(test))

### Bigram Tagger

In [None]:
# Bigram tagger trained on `train` (no backoff), scored on the held-out sentences.
tagger2 = nltk.BigramTagger(train)
print(tagger2.evaluate(test))

### Combinação de Tagger

In [None]:
# Build the backoff chain first: bigram -> unigram -> default ('n') tagger.
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)

# Then report the score of each stage on the held-out sentences.
print('tagger1: ', tagger1.evaluate(test))
print('tagger2: ', tagger2.evaluate(test))

### Salvando Tagger

In [None]:
# pickle is part of the Python standard library, so there is nothing to
# install: the former `!pip install pickle` line could only fail.
from pickle import dump

# Serialize the combined tagger; -1 selects the highest pickle protocol.
# The context manager closes the file even if dump() raises.
with open('tagger.pkl', 'wb') as output:
  dump(tagger2, output, -1)

In [None]:
from pickle import load

# Restore the tagger saved to 'tagger.pkl'. The handle is no longer called
# `input`, so the builtin input() is not shadowed, and the context manager
# closes the file even if load() raises.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# you created yourself or otherwise trust.
with open('tagger.pkl', 'rb') as tagger_file:
  tagger = load(tagger_file)

In [None]:
# Two sentences that both contain the word "para".
text1, text2 = "Isso é para você.", "para com isso"

# Plain whitespace split (punctuation stays attached to the word).
tokens1, tokens2 = text1.split(), text2.split()

# Tag each token list with the reloaded tagger and print it under its label.
for label, toks in (('text1: ', tokens1), ('text2: ', tokens2)):
  print(label, tagger.tag(toks))

## TextBlob

In [None]:
!pip install textblob

In [None]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

# Fetch the 'movie_reviews' corpus before building the analyzer —
# presumably NaiveBayesAnalyzer is trained on it; confirm against the TextBlob docs.
nltk.download('movie_reviews')
opinion = TextBlob("This movie was horrible!", analyzer=NaiveBayesAnalyzer())
# Display the sentiment computed by the configured analyzer.
opinion.sentiment

In [None]:
# Portuguese sentence used for the language-detection / translation demo.
text = 'O estudo de processamento de linguagem natural é um dos campos mais promissores em inteligência artificial'
print("Frase original: ", text)

tblob = TextBlob(text)

# NOTE(review): detect_language() and translate() appear to have been
# deprecated and later removed from TextBlob, and they depend on a remote
# translation service — confirm that the installed version still provides them.
print('Idioma: ',tblob.detect_language())

# Translate the blob to English.
en_tblob = tblob.translate(to='en')

print("Traduzido: ", en_tblob)

# Tags of the translated (English) blob.
print("Tags: ", en_tblob.tags)

## SpaCy

In [None]:
import spacy
# Load the small Portuguese pipeline and process one sentence.
nlp = spacy.load('pt_core_news_sm')
doc = nlp('Ayrton Senna foi o melhor piloto de Fórmula 1 que já existiu')

# Print the verbatim text of every token.
print([tok.orth_ for tok in doc])

In [None]:
# Pair each token's verbatim text with the part-of-speech label the pipeline gave it.
[(token.orth_, token.pos_) for token in doc]

In [None]:
# Lemmas of the tokens in `doc` that were tagged as verbs.
print('filtrando apenas verbos: ')
print([tok.lemma_ for tok in doc if tok.pos_ == 'VERB'])

# Named entities found in a second sentence (adjacent literals are joined
# into the same single-line string as before).
print('identificação de entidades: ')
doc1 = nlp(
    'Machado de Assis um dos melhores escritores do Brasil, '
    'foi o primeiro presidente da Academia Brasileira de Letras'
)
print(doc1.ents)