In [1]:
import nltk

# Tokenización

In [2]:
# English sample text reused by the tokenization and stemming cells.
# NOTE(review): looks like the opening of an NLTK book chapter (it links to
# nltk.org) — confirm the source. Line breaks and trailing spaces are part of
# the string, so they count towards its length.
texto = """It is easy to get our hands on millions of words of text. What can we do with it, assuming we can write some simple programs? In this chapter we'll address the following questions:
What can we achieve by combining simple programming techniques with large quantities of text?
How can we automatically extract key words and phrases that sum up the style and content of a text?
What tools and techniques does the Python programming language provide for such work?
What are some of the interesting challenges of natural language processing?
This chapter is divided into sections that skip between two quite different styles. 
In the "computing with language" sections we will take on some linguistically motivated programming tasks without necessarily explaining how they work. 
In the "closer look at Python" sections we will systematically review key programming concepts. 
We'll flag the two styles in the section titles, but later chapters will mix both styles without being so up-front about it. 
We hope this style of introduction gives you an authentic taste of what will come later, while covering a range of elementary concepts in linguistics and computer science. 
If you have basic familiarity with both areas, you can skip to 5; we will repeat any important points in later chapters, and if you miss anything you can easily consult the online reference material at http://nltk.org/. If the material is completely new to you, this chapter will raise more questions than it answers, questions that are addressed in the rest of this book."""

In [3]:
# Length of the sample in characters (len of a str), not in words.
len(texto)

1543

In [4]:
# Number of tokens NLTK's word tokenizer produces for the sample text.
len(nltk.word_tokenize(texto))

283

In [5]:
# Split the sample into sentences; the result is a list of strings.
# A line break alone does not end a sentence: the text after "questions:"
# stays in the same item as the line that follows it.
nltk.sent_tokenize(texto)

['It is easy to get our hands on millions of words of text.',
 'What can we do with it, assuming we can write some simple programs?',
 "In this chapter we'll address the following questions:\nWhat can we achieve by combining simple programming techniques with large quantities of text?",
 'How can we automatically extract key words and phrases that sum up the style and content of a text?',
 'What tools and techniques does the Python programming language provide for such work?',
 'What are some of the interesting challenges of natural language processing?',
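 'This chapter is divided into sections that skip between two quite different styles.',
 'In the "computing with language" sections we will take on some linguistically motivated programming tasks without necessarily explaining how they work.',
 'In the "closer look at Python" sections we will systematically review key programming concepts.',
 "We'll flag the two styles in the section titles, but later chapters will mix both styles without being so up-front about it.",
 'We hope this style of introduction gives you an authentic taste of what will come later, while covering a range of elementary concepts in linguistics and computer science.',
 'If you have basic familiarity with both areas, you can skip to 5; we will repeat any important points in later chapters, and if you miss anything you can easily consult the online reference material at http://nltk.org/.',
 'If the material is completely new to you, this chapter will raise more questions than it answers, questions that are addressed in the rest of this book.']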

# Stemming

In [6]:
# Porter stemmer instance shared by the English stemming cells.
stemmer = nltk.stem.PorterStemmer()

In [7]:
# Show only the public attributes of the stemmer. A bare dir() also lists
# every dunder and private helper, which buries the names a reader would
# actually use (notably `stem`).
[name for name in dir(stemmer) if not name.startswith("_")]

['MARTIN_EXTENSIONS',
 'NLTK_EXTENSIONS',
 'ORIGINAL_ALGORITHM',
 'mode',
 'pool',
 'stem',
 'vowels']

In [8]:
# The "-ing" form is reduced to the stem "work".
stemmer.stem("working")

'work'

In [9]:
# The "-s" form yields the same stem, "work".
stemmer.stem("works")

'work'

In [10]:
# The "-ed" form also collapses to "work".
stemmer.stem("worked")

'work'

In [11]:
# Keep the token list in a variable so the stemming cells can reuse it.
words = nltk.word_tokenize(texto)

In [13]:
# Token count of the English sample.
len(words)

283

In [22]:
# Run every token through the Porter stemmer, keeping the original order.
words_stem = list(map(stemmer.stem, words))

In [23]:
# One stem per token, so the length matches the token list.
len(words_stem)

283

In [24]:
# Number of distinct stems (duplicates removed through a set).
len(set(words_stem))

142

In [26]:
# Spanish sample text for the Spanish tokenization and stemming cells.
# NOTE(review): presumably a Spanish rendering of Shakespeare's Sonnet 116 —
# confirm the source. The text ends with a closing ” that has no opening
# quote, and "Sí estoy", "mída" and "navio" look like typos for "Si estoy",
# "mida" and "navío". The counts in later cells depend on this exact string,
# so re-run them if the text is corrected.
texto_spa = """La unión de dos almas sinceras no admite impedimentos. 
No es amor el amor que se transforma con el cambio, o se aleja con la distancia. 
¡Oh, no! Es un faro siempre firme, que desafía a las tempestades sin estremecerse. 
Es la estrella para el navio a la deriva, de valor incalculable, aunque se mída su altura. 
No es amor bufón del tiempo, aunque los rosados labios y mejillas caigan bajo el golpe de su guadaña. 
El amor no se altera con sus breves horas y semanas, sino que se afianza incluso hasta en el borde del abismo. 
Sí estoy equivocado y se demuestra, yo nunca nada escribí, y nadie jamás amó.”
"""

In [32]:
# word_tokenize defaults to language="english". Name the language explicitly
# so sentence splitting uses the Punkt model that matches this Spanish text.
# NOTE(review): needs the Spanish Punkt model to be installed alongside the
# English one — confirm in the target environment.
words_spa = nltk.word_tokenize(texto_spa, language="spanish")

In [34]:
# Token count of the Spanish sample.
len(words_spa)

130

In [29]:
# Snowball stemmer for Spanish, used by the Spanish stemming cells.
stemmer2 = nltk.stem.snowball.SpanishStemmer()

In [None]:
# Show only the public attributes of the Spanish stemmer; a bare dir() would
# also list every dunder and private helper.
[name for name in dir(stemmer2) if not name.startswith("_")]

In [33]:
# Run every Spanish token through the Snowball stemmer, keeping the order.
words_spa_stem = list(map(stemmer2.stem, words_spa))

In [35]:
# One stem per token, so the length matches the Spanish token list.
len(words_spa_stem)

130

In [37]:
# Number of distinct Spanish stems (duplicates removed through a set).
len(set(words_spa_stem))

79

In [36]:
# First ten tokens on one line and their stems on the next, for comparison.
print(words_spa[:10], words_spa_stem[:10], sep="\n")

['La', 'unión', 'de', 'dos', 'almas', 'sinceras', 'no', 'admite', 'impedimentos', '.']
['la', 'union', 'de', 'dos', 'almas', 'sincer', 'no', 'admit', 'impediment', '.']
