<a href="https://colab.research.google.com/github/merrecalde/taller-cuenca/blob/master/pre_procesamiento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook: pre-procesamiento.ipynb
#### Se ejemplifican algunas de las tareas de pre-procesamiento descriptas en la clase 2: Pre-procesamiento de textos del curso "Enfoques Clásicos y Neuronales a la Minería de Texto"

## 1) Partición de texto

In [0]:
import re

raw = """'When I'M a Duchess,' she said to herself, (not in a very 
hopeful tone though), 'I won't have any pepper in my kitchen AT 
ALL. Soup does very well without--Maybe it's always pepper that 
makes people hot-tempered,'"""

# Tokenize with the built-in str.split(): called without arguments it
# splits on any run of whitespace, newlines included.
print(raw.split(), end="\n\n")
# Same task with a regular expression that matches one literal space.
# Watch out: newlines are not separators here, so they stay glued to the
# following token (e.g. '\nhopeful').
print(re.compile(r' ').split(raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'"]

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', '\nhopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', '\nALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', '\nmakes', 'people', "hot-tempered,'"]


In [0]:
# Split on runs of spaces, tabs or newlines, so line breaks are treated as
# separators instead of remaining attached to the tokens.
print(re.compile(r'[ \t\n]+').split(raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'"]


In [0]:
# Same split written with the \s shorthand (any whitespace character).
print(re.compile(r'\s+').split(raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'"]


In [0]:
# Split on runs of non-word characters: punctuation disappears, contractions
# are broken apart ("won't" -> 'won', 't') and, because the text starts and
# ends with a quote, empty strings appear at both ends of the result.
print(re.compile(r'\W+').split(raw))

['', 'When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered', '']


In [0]:
# Collect the runs of word characters instead of splitting on their
# complement: same tokens as the \W+ split, without the empty strings.
print(re.compile(r'\w+').findall(raw))

['When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered']


In [0]:
# Match either a run of word characters or a single non-whitespace character
# followed by optional word characters, so punctuation is kept as tokens
# (sometimes attached to the word that follows it, e.g. "'M", '-tempered').
print(re.compile(r'\w+|\S\w*').findall(raw))

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'"]


## 2) Filtrado de palabras
Se ejemplifica en otras notebooks junto con la normalización o la vectorización de documentos




## 3) Normalizaciones de palabras




### 3.1) Lematización y truncado




In [0]:
import spacy
import nltk

# Load spaCy's English pipeline by its full package name. The short 'en'
# alias used before was a shortcut link, which spaCy v3 removed, so
# spacy.load('en') fails on current installations.
en_nlp = spacy.load('en_core_web_sm')
doc = "I saw there some saws to cut the tree"
# Instantiate NLTK's Porter stemmer
stemmer = nltk.stem.PorterStemmer()
# Tokenize (and analyze) the document with spaCy
doc_spacy = en_nlp(doc)
# Print the lemmas found by spaCy
print("Lematización:")
print([token.lemma_ for token in doc_spacy])
# Print the stems produced by the Porter stemmer for the same tokens
print("Truncado:")
print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

Lematización:
['-PRON-', 'see', 'there', 'some', 'saw', 'to', 'cut', 'the', 'tree']
Truncado:
['i', 'saw', 'there', 'some', 'saw', 'to', 'cut', 'the', 'tree']


## 4) Etiquetado




### 4.1) Etiquetado de las categorías gramaticales (POS tagging) 

In [0]:
# Fetch the tagger data, then POS-tag a text tokenized with
# wordpunct_tokenize.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, wordpunct_tokenize
# "building" occurs twice in the text; the tagger output labels the first
# occurrence NN and the second VBG.
text = "The old building was demolished. Tomorrow, they will begin building a new one"
# Last expression of the cell: the notebook displays the (token, tag) pairs.
pos_tag(wordpunct_tokenize(text))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('The', 'DT'),
 ('old', 'JJ'),
 ('building', 'NN'),
 ('was', 'VBD'),
 ('demolished', 'VBN'),
 ('.', '.'),
 ('Tomorrow', 'NNP'),
 (',', ','),
 ('they', 'PRP'),
 ('will', 'MD'),
 ('begin', 'VB'),
 ('building', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('one', 'CD')]

In [0]:
# Repeat the tagging experiment with word_tokenize as the tokenizer; the
# 'punkt' data is downloaded first (presumably required by word_tokenize).
nltk.download('punkt')
from nltk import word_tokenize
text = "The old building was demolished. Tomorrow, they will begin building a new one"
# Last expression of the cell: the notebook displays the (token, tag) pairs.
pos_tag(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('The', 'DT'),
 ('old', 'JJ'),
 ('building', 'NN'),
 ('was', 'VBD'),
 ('demolished', 'VBN'),
 ('.', '.'),
 ('Tomorrow', 'NNP'),
 (',', ','),
 ('they', 'PRP'),
 ('will', 'MD'),
 ('begin', 'VB'),
 ('building', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('one', 'CD')]

In [0]:
# Tag a second example sentence, reusing pos_tag and wordpunct_tokenize
# imported in an earlier cell.
text = "The grand jury commented on a number of other topics."
# Last expression of the cell: the notebook displays the (token, tag) pairs.
pos_tag(wordpunct_tokenize(text))

[('The', 'DT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'DT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'JJ'),
 ('topics', 'NNS'),
 ('.', '.')]

### 4.2) Desambiguación del sentido de las palabras (WSD)




In [0]:
# Download the WordNet corpus and list every synset registered for the
# ambiguous word 'bass'.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
# Last expression of the cell: the notebook displays the list of synsets.
wn.synsets('bass')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


[Synset('bass.n.01'),
 Synset('bass.n.02'),
 Synset('bass.n.03'),
 Synset('sea_bass.n.01'),
 Synset('freshwater_bass.n.01'),
 Synset('bass.n.06'),
 Synset('bass.n.07'),
 Synset('bass.n.08'),
 Synset('bass.s.01')]

In [0]:
# Natural Language Toolkit: Word Sense Disambiguation Algorithms
#
# Authors: Liling Tan <alvations@gmail.com>,
#          Dmitrijs Milajevs <dimazest@gmail.com>
#
# Copyright (C) 2001-2018 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import wordnet


def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """Pick the sense of ``ambiguous_word`` that best fits its context.

    Implementation of the original Lesk algorithm (1986) [1]: each candidate
    synset is scored by the number of distinct context words that also occur
    in its whitespace-split definition, and the maximum is returned.  The
    maximum is taken over ``(score, synset)`` pairs, so candidates with equal
    scores are ranked by comparing the synsets themselves.

    :param iter context_sentence: The words of the sentence in which the
         ambiguous word occurs.
    :param str ambiguous_word: The word to disambiguate.
    :param str pos: Optional Part-of-Speech (POS); when given, only synsets
         whose ``pos()`` equals it are kept.
    :param iter synsets: Optional candidate synsets; when ``None`` they are
         looked up with ``wordnet.synsets(ambiguous_word)``.
    :return: The best-scoring Synset() object, or ``None`` when no candidate
         is left.

    Usage example::

        >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')
        Synset('savings_bank.n.02')

    [1] Lesk, Michael. "Automatic sense disambiguation using machine
    readable dictionaries: how to tell a pine cone from an ice cream
    cone." Proceedings of the 5th Annual International Conference on
    Systems Documentation. ACM, 1986.
    http://dl.acm.org/citation.cfm?id=318728
    """
    context_words = set(context_sentence)

    # Fall back to WordNet only when the caller supplied no candidates.
    candidates = wordnet.synsets(ambiguous_word) if synsets is None else synsets

    if pos:
        candidates = [cand for cand in candidates if str(cand.pos()) == pos]

    if not candidates:
        return None

    # Pair every candidate with its overlap count; the pair ordering is what
    # decides ties, so the synset must stay in second position.
    scored = []
    for cand in candidates:
        gloss_tokens = cand.definition().split()
        overlap = len(context_words.intersection(gloss_tokens))
        scored.append((overlap, cand))

    return max(scored)[1]




In [0]:
# Disambiguate 'bank' in a money-related sentence, restricting the candidates
# to synsets whose POS is 'n'; the notebook displays the returned synset.
lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n')

Synset('savings_bank.n.02')

### 4.3) Reconocimiento de entidades nombradas (NER)

In [0]:
# Download the 'treebank' corpus and pick the POS-tagged sentence at
# index 22 as input for the NER example.
nltk.download('treebank')
sent = nltk.corpus.treebank.tagged_sents()[22]
# Last expression of the cell: the notebook displays the (token, tag) pairs.
sent

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


[('The', 'DT'),
 ('U.S.', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('few', 'JJ'),
 ('industrialized', 'VBN'),
 ('nations', 'NNS'),
 ('that', 'WDT'),
 ('*T*-7', '-NONE-'),
 ('does', 'VBZ'),
 ("n't", 'RB'),
 ('have', 'VB'),
 ('a', 'DT'),
 ('higher', 'JJR'),
 ('standard', 'NN'),
 ('of', 'IN'),
 ('regulation', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('smooth', 'JJ'),
 (',', ','),
 ('needle-like', 'JJ'),
 ('fibers', 'NNS'),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('crocidolite', 'NN'),
 ('that', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('are', 'VBP'),
 ('classified', 'VBN'),
 ('*-5', '-NONE-'),
 ('as', 'IN'),
 ('amphobiles', 'NNS'),
 (',', ','),
 ('according', 'VBG'),
 ('to', 'TO'),
 ('Brooke', 'NNP'),
 ('T.', 'NNP'),
 ('Mossman', 'NNP'),
 (',', ','),
 ('a', 'DT'),
 ('professor', 'NN'),
 ('of', 'IN'),
 ('pathlogy', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Vermont', 'NNP'),
 ('College', 'NNP'),
 ('of', 'IN'),
 ('Medicine', 'NNP'),
 ('.', '.')]

In [0]:
# Download the named-entity chunker and the 'words' corpus, then chunk the
# tagged sentence `sent` built in the previous cell.
nltk.download('maxent_ne_chunker')
nltk.download('words')
# Prints the resulting tree. Note that the chunker is not perfect: in the
# output "Vermont College" is labelled PERSON and "Medicine" GPE.
print(nltk.ne_chunk(sent))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
(S
  The/DT
  (GPE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (PERSON Brooke/NNP T./NNP Mossman/NNP)
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (ORGANIZATION University/NNP)
  of/IN
  (PERSON Vermont/NNP College/NNP)
  of/IN
  (GPE Medicine/NNP)
  ./.)
