## Quickstart

In [3]:
import nltk
import re

In [2]:
# Fetch the `cess_esp` corpus package so that nltk.corpus.cess_esp can be read below.
nltk.download('cess_esp')

[nltk_data] Downloading package cess_esp to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Unzipping corpora/cess_esp.zip.


True

In [8]:
# Load the sentences of the cess_esp corpus and report how many there are.
corpus = nltk.corpus.cess_esp.sents()
print(len(corpus))

6030


In [11]:
# Flatten the corpus: concatenate the words of every sentence into a single list.
flatten = [word for sentence in corpus for word in sentence]
print(flatten[:20])

['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana']


### Structure of the `re.search()` function

In [14]:
# Keep the tokens in which `es` occurs at any position.
arr = list(filter(lambda token: re.search("es", token), flatten))
print(arr[10:20])

['millones', 'dólares', 'es', 'alcaldesa', 'Congreso', 'esta', 'militantes', 'esta', 'eso', 'es']


In [15]:
# Keep the tokens that end in `es` (`$` anchors the match at the end).
arr = list(filter(re.compile("es$").search, flatten))
print(arr[10:20])

['es', 'militantes', 'populares', 'militantes', 'elecciones', 'generales', 'elecciones', 'generales', 'factores', 'alcaldes']


In [16]:
# Keep the tokens that begin with `es` (`^` anchors the match at the start).
arr = [token for token in flatten if re.search("^es", token) is not None]
print(arr[10:20])

['es', 'es', 'este', 'españolas', 'esta', 'este', 'es', 'español', 'es', 'español']


In [17]:
# Character sets: `[ghi]` matches exactly one of g, h or i (a range such as
# `[a-z]` works the same way). Here: tokens whose first character is g, h or i.
arr = list(filter(re.compile("^[ghi]").search, flatten))
print(arr[10:20])

['hotel', 'gracias', 'hoy', 'generales', 'hombre', 'ha', 'hasta', 'gracias', 'gente', 'ha']


In [19]:
# Quantifiers:
#   *  zero or more repetitions
#   +  one or more repetitions
#   ?  zero or one repetition
# Here: tokens that begin with one or more copies of the group `no`.
arr = [token for token in flatten if re.search("^(no)+", token) is not None]
print(arr[10:20])

['no_obstante', 'no', 'no', 'no', 'norte', 'no', 'no', 'no', 'no', 'norteamericano']


### Text Normalization

In [21]:
# `\n` inside the literal is a newline character, so the string prints on two lines.
text = "This is \n a test"
print(text)

This is 
 a test


#### Tokenization: the process of splitting a text string into its minimal linguistic units (tokens, e.g. words)

In [22]:
# Sample Spanish paragraph for the tokenization cases below. The literal spans
# several lines and keeps its indentation, so it contains newlines and runs of spaces.
text_test = """ Cuando sea el rey del mundo (imaginaba él en su cabeza) no tendré que preocuparme por estas bobadas. 
                Era solo un niño de 7 años, pero pensaba que podría ser cualquier cosa que su imaginación le permitiera
                visualizar en su cabeza ..."""

In [23]:
# Case 1: split on every single space character. Newlines are not separators
# here, and consecutive spaces produce empty strings.
tokenized_text_1 = re.compile(r' ').split(text_test)
print(tokenized_text_1[:20])

['', 'Cuando', 'sea', 'el', 'rey', 'del', 'mundo', '(imaginaba', 'él', 'en', 'su', 'cabeza)', 'no', 'tendré', 'que', 'preocuparme', 'por', 'estas', 'bobadas.', '\n']


In [25]:
# Case 2: split on any run of spaces, tabs or newlines. Punctuation stays
# attached to the neighbouring word.
tokenized_text_2 = re.split(pattern=r'[ \t\n]+', string=text_test)
print(tokenized_text_2[:20])

['', 'Cuando', 'sea', 'el', 'rey', 'del', 'mundo', '(imaginaba', 'él', 'en', 'su', 'cabeza)', 'no', 'tendré', 'que', 'preocuparme', 'por', 'estas', 'bobadas.', 'Era']


In [26]:
# Case 3: split on any run of non-word characters. `\W` already matches
# spaces, tabs and newlines as well as punctuation, so no extra characters are
# needed in the class. The result is stored under its own name so that the
# Case 2 tokens are not overwritten.
tokenized_text_3 = re.split(r'\W+', text_test)
print(tokenized_text_3[:20])

['', 'Cuando', 'sea', 'el', 'rey', 'del', 'mundo', 'imaginaba', 'él', 'en', 'su', 'cabeza', 'no', 'tendré', 'que', 'preocuparme', 'por', 'estas', 'bobadas', 'Era']


### NLTK Tokenizer

In [27]:
# Splitting on non-word characters breaks abbreviations and amounts apart
# (`E.U.` and `$15.50` are each cut into pieces) and leaves a trailing empty string.
text_sample_2 = "En los E.U. esa postal vale $15.50 ..."
tokenized_text = re.compile(r'[ \W\t\n]+').split(text_sample_2)
print(tokenized_text[:20])

['En', 'los', 'E', 'U', 'esa', 'postal', 'vale', '15', '50', '']


In [28]:
# Verbose-mode token pattern passed to nltk.regexp_tokenize. Alternatives are
# tried left to right at each position.
pattern = r"""(?x)                  # set flag to allow verbose regexps
              (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*        # words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?  # currency and percentages
              | \.\.\.              # ellipsis
              | [\]\[.,;:"'?()_`-]  # single punctuation tokens, including ] and [
                                    # the hyphen is placed last so it is a literal
                                    # and not a range such as :-_
"""


In [29]:
# Tokenize the sample with the verbose pattern; `E.U.`, `$15.50` and `...` each stay whole.
nltk.regexp_tokenize(text_sample_2, pattern)

['En', 'los', 'E.U.', 'esa', 'postal', 'vale', '$15.50', '...']