## Setup

In [1]:
import nltk
from nltk import word_tokenize, UnigramTagger as ut, BigramTagger as bt
nltk.download("punk")
nltk.download("averaged_perceptron_tagger")
nltk.download("tagsets")

# Spanish
nltk.download("cess_esp")
from nltk.corpus import cess_esp as cess
import stanza
stanza.download('es')

[nltk_data] Error loading punk: Package 'punk' not found in index
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package cess_esp to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 43.5MB/s]                    
2023-02-06 17:20:17 INFO: Downloading default packages for language: es (Spanish) ...
Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.4.1/models/default.zip: 100%|██████████| 603M/603M [01:14<00:00, 8.11MB/s] 
2023-02-06 17:21:37 

## Tagging with NLTK

In [2]:
string = "And now here I am enjoying with bebe"
text = word_tokenize(string)
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('here', 'RB'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('enjoying', 'VBG'),
 ('with', 'IN'),
 ('bebe', 'NN')]

In [3]:
for tag in ["CC", "RB", "PRP", "VBP"]:
    print(nltk.help.upenn_tagset(tag))

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
None
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
None
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
None
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...
None


In [4]:
string = "They do not permit other people to get residence permit"
text = word_tokenize(string)
nltk.pos_tag(text)

[('They', 'PRP'),
 ('do', 'VBP'),
 ('not', 'RB'),
 ('permit', 'VB'),
 ('other', 'JJ'),
 ('people', 'NNS'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('residence', 'NN'),
 ('permit', 'NN')]

## Tagging in Spanish

In [5]:
cess_sents = cess.tagged_sents()
fraction = int(len(cess_sents) * 0.9)
uni_tagger = ut(cess_sents[:fraction])
uni_tagger.evaluate(cess_sents[fraction:])

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  uni_tagger.evaluate(cess_sents[fraction:])


0.8068832283915284

In [6]:
uni_tagger.tag("Estoy con mi bebe aqui cocinando".split(" "))

[('Estoy', 'vmip1s0'),
 ('con', 'sps00'),
 ('mi', 'dp1css'),
 ('bebe', None),
 ('aqui', None),
 ('cocinando', 'vmg0000')]

In [7]:
bi_tagger = bt(cess_sents[:fraction])
bi_tagger.evaluate(cess_sents[fraction:])

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bi_tagger.evaluate(cess_sents[fraction:])


0.10983113909559244

In [8]:
bi_tagger.tag("Estoy con mi bebe aqui cocinando".split(" "))

[('Estoy', None),
 ('con', None),
 ('mi', None),
 ('bebe', None),
 ('aqui', None),
 ('cocinando', None)]

## Stanza

In [11]:
nlp = stanza.Pipeline("es", processors="tokenize, pos")
doc = nlp("Estoy aprendiendo a procesar lenguaje natural")


2023-02-06 17:29:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 12.7MB/s]                    
2023-02-06 17:29:22 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |

2023-02-06 17:29:22 INFO: Use device: cpu
2023-02-06 17:29:22 INFO: Loading: tokenize
2023-02-06 17:29:22 INFO: Loading: mwt
2023-02-06 17:29:22 INFO: Loading: pos
2023-02-06 17:29:22 INFO: Done loading processors!


In [12]:
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.pos)

Estoy AUX
aprendiendo VERB
a ADP
procesar VERB
lenguaje NOUN
natural ADJ
