# Stanza

https://stanfordnlp.github.io/stanza/

In [17]:
import os
import stanza
from stanza.utils.conll import CoNLL

* Download and setup the French pipeline:

## Parse text

In [12]:
# Load French model tokenizer, tagger, parser and NER
nlp = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma,depparse,ner')

2024-02-07 07:36:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …



Downloading https://huggingface.co/stanfordnlp/stanza-fr/resolve/v1.7.0/models/depparse/combined_charlm.pt:   …

2024-02-07 07:37:41 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |
| ner       | wikiner           |

2024-02-07 07:37:41 INFO: Using device: cpu
2024-02-07 07:37:41 INFO: Loading: tokenize
2024-02-07 07:37:41 INFO: Loading: mwt
2024-02-07 07:37:41 INFO: Loading: pos
2024-02-07 07:37:41 INFO: Loading: lemma
2024-02-07 07:37:41 INFO: Loading: depparse
2024-02-07 07:37:41 INFO: Loading: ner
2024-02-07 07:37:42 INFO: Done loading processors!


In [13]:
# Process text
text = "ABYDE ou ABYDOS, sub. Ville maritime de Phrygie vis-à-vis de Sestos."
text += "Xercès joignit ces deux endroits éloignés l'un de l'autre de sept stades, par le pont qu'il jetta sur l'Hellespont."

doc = nlp(text)

* Print annotations per token:

In [19]:
for sent in doc.sentences:
    for word in sent.words:
        print(f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}', sep='\n')

id: 1	word: ABYDE	head id: 0	head: root	deprel: root
id: 2	word: ou	head id: 3	head: ABYDOS	deprel: cc
id: 3	word: ABYDOS	head id: 1	head: ABYDE	deprel: conj
id: 4	word: ,	head id: 5	head: sub	deprel: punct
id: 5	word: sub	head id: 1	head: ABYDE	deprel: discourse
id: 6	word: .	head id: 1	head: ABYDE	deprel: punct
id: 1	word: Ville	head id: 9	head: joignit	deprel: nsubj
id: 2	word: maritime	head id: 1	head: Ville	deprel: amod
id: 3	word: de	head id: 4	head: Phrygie	deprel: case
id: 4	word: Phrygie	head id: 1	head: Ville	deprel: nmod
id: 5	word: vis-à-vis	head id: 1	head: Ville	deprel: advmod
id: 6	word: de	head id: 7	head: Sestos	deprel: case
id: 7	word: Sestos	head id: 5	head: vis-à-vis	deprel: obl:arg
id: 8	word: .Xercès	head id: 7	head: Sestos	deprel: flat:name
id: 9	word: joignit	head id: 0	head: root	deprel: root
id: 10	word: ces	head id: 12	head: endroits	deprel: det
id: 11	word: deux	head id: 12	head: endroits	deprel: nummod
id: 12	word: endroits	head id: 9	head: joignit	deprel: 

* Print the named entities:

In [7]:
print(doc.entities)

[{
  "text": "ABYDE",
  "type": "LOC",
  "start_char": 0,
  "end_char": 5
}, {
  "text": "ABYDOS",
  "type": "LOC",
  "start_char": 9,
  "end_char": 15
}, {
  "text": "Phrygie",
  "type": "LOC",
  "start_char": 40,
  "end_char": 47
}, {
  "text": "Sestos",
  "type": "LOC",
  "start_char": 61,
  "end_char": 67
}, {
  "text": "Hellespont",
  "type": "LOC",
  "start_char": 172,
  "end_char": 182
}]


* Print annotations in CONLL format:

In [15]:
print("{:C}".format(doc))

# text = ABYDE ou ABYDOS, sub.
# sent_id = 0
1	ABYDE	ABYDE	PROPN	_	_	0	root	_	start_char=0|end_char=5|ner=S-LOC
2	ou	ou	CCONJ	_	_	3	cc	_	start_char=6|end_char=8|ner=O
3	ABYDOS	ABYDOS	PROPN	_	_	1	conj	_	start_char=9|end_char=15|ner=S-LOC
4	,	,	PUNCT	_	_	5	punct	_	start_char=15|end_char=16|ner=O
5	sub	sub	INTJ	_	_	1	discourse	_	start_char=17|end_char=20|ner=O
6	.	.	PUNCT	_	_	1	punct	_	start_char=20|end_char=21|ner=O

# text = Ville maritime de Phrygie vis-à-vis de Sestos.Xercès joignit ces deux endroits éloignés l'un de l'autre de sept stades, par le pont qu'il jetta sur l'Hellespont.
# sent_id = 1
1	Ville	ville	NOUN	_	Gender=Fem|Number=Sing	9	nsubj	_	start_char=22|end_char=27|ner=O
2	maritime	maritime	ADJ	_	Gender=Fem|Number=Sing	1	amod	_	start_char=28|end_char=36|ner=O
3	de	de	ADP	_	_	4	case	_	start_char=37|end_char=39|ner=O
4	Phrygie	Phrygie	PROPN	_	_	1	nmod	_	start_char=40|end_char=47|ner=S-LOC
5	vis-à-vis	vis-à-vis	ADV	_	_	1	advmod	_	start_char=48|end_char=57|ner=O
6	de	de	ADP	_	_	7

* Write the doc to a file (conll format):

In [18]:
CoNLL.write_doc2conll(doc, os.path.join('output', 'sample_stanza.conllu'))