# Linguistic Features

In [None]:
import stanza

# Fetch the default English model package; the download log below shows where
# the files are saved locally.
stanza.download(lang='en')

# Build the default English neural pipeline. The load log below lists the
# processors it brings up (tokenize, mwt, pos, lemma, constituency, depparse,
# sentiment, ner).
nlp = stanza.Pipeline(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-25 09:45:02 INFO: Downloaded file to C:\Users\erina\stanza_resources\resources.json
2025-11-25 09:45:02 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.11.0/models/default.zip:   0%|          | …

2025-11-25 09:45:45 INFO: Downloaded file to C:\Users\erina\stanza_resources\en\default.zip
2025-11-25 09:45:50 INFO: Finished downloading models and saved to C:\Users\erina\stanza_resources
2025-11-25 09:45:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-25 09:45:50 INFO: Downloaded file to C:\Users\erina\stanza_resources\resources.json
2025-11-25 09:45:52 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2025-11-25 09:45:52 INFO: Using device: cpu
2025-11-25 09:45:52 INFO: Loading: tokenize
2025-11-25 09:45:55 INFO: Loading: mwt
2025-11-25 09:45:55 INFO: Loading: pos
2025-11-25 09:45:57 INFO: Loading: lemma
2025-11-25 09:45:58 INFO: Loading: constituency
2025-11-25 09:45:59 INFO: Loading: depparse
2025-11-25 09:45:59 INFO: Loading: sentiment
2025-11-25 09:46:00 INFO: Loading: ner

In [None]:
# Annotate one sentence with the English pipeline built in the first cell.
# The resulting `doc` is what the following cells inspect (doc.sentences,
# doc.entities).
doc = nlp("Barack Obama was born in Hawaii.") # run annotation over a sentence

In [3]:
# Dump the per-token annotations of the document, followed on the next line by
# the entity spans attached to it.
print(doc, doc.entities, sep="\n")

[
  [
    {
      "id": 1,
      "text": "Barack",
      "lemma": "Barack",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 4,
      "deprel": "nsubj:pass",
      "start_char": 0,
      "end_char": 6,
      "ner": "B-PERSON",
      "multi_ner": [
        "B-PERSON"
      ]
    },
    {
      "id": 2,
      "text": "Obama",
      "lemma": "Obama",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 1,
      "deprel": "flat",
      "start_char": 7,
      "end_char": 12,
      "ner": "E-PERSON",
      "multi_ner": [
        "E-PERSON"
      ]
    },
    {
      "id": 3,
      "text": "was",
      "lemma": "be",
      "upos": "AUX",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "head": 4,
      "deprel": "aux:pass",
      "start_char": 13,
      "end_char": 16,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "b

In [4]:
# One line per word: surface text, lemma and POS tag, separated by single spaces.
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"{word.text} {word.lemma} {word.pos}")

Barack Barack PROPN
Obama Obama PROPN
was be AUX
born bear VERB
in in ADP
Hawaii Hawaii PROPN
. . PUNCT


In [5]:
# For each sentence, show its entity spans and then, on the next line, its
# dependency relations (printed below as head / relation / dependent triples).
for sentence in doc.sentences:
    print(sentence.ents, sentence.dependencies, sep="\n")

[{
  "text": "Barack Obama",
  "type": "PERSON",
  "start_char": 0,
  "end_char": 12
}, {
  "text": "Hawaii",
  "type": "GPE",
  "start_char": 25,
  "end_char": 31
}]
[({
  "id": 4,
  "text": "born",
  "lemma": "bear",
  "upos": "VERB",
  "xpos": "VBN",
  "feats": "Tense=Past|VerbForm=Part|Voice=Pass",
  "head": 0,
  "deprel": "root",
  "start_char": 17,
  "end_char": 21
}, 'nsubj:pass', {
  "id": 1,
  "text": "Barack",
  "lemma": "Barack",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": "Number=Sing",
  "head": 4,
  "deprel": "nsubj:pass",
  "start_char": 0,
  "end_char": 6
}), ({
  "id": 1,
  "text": "Barack",
  "lemma": "Barack",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": "Number=Sing",
  "head": 4,
  "deprel": "nsubj:pass",
  "start_char": 0,
  "end_char": 6
}, 'flat', {
  "id": 2,
  "text": "Obama",
  "lemma": "Obama",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": "Number=Sing",
  "head": 1,
  "deprel": "flat",
  "start_char": 7,
  "end_char": 12
}), ({
  "id": 4,
  "text": "b

## Multiple Documents

In [6]:
# Raw texts to annotate together in one pipeline call.
documents = [
    "This is a test document.",
    "I wrote another document for fun.",
]

# Wrap every raw text in a stanza.Document (empty sentence list, text attached)
# so the pipeline can take the whole batch at once.
in_docs = [stanza.Document([], text=raw_text) for raw_text in documents]

# Run the neural pipeline over the list; the result is again a list of
# stanza.Document objects, one per input Document.
out_docs = nlp(in_docs)

# Show the annotations of the second input text.
print(out_docs[1])

[
  [
    {
      "id": 1,
      "text": "I",
      "lemma": "I",
      "upos": "PRON",
      "xpos": "PRP",
      "feats": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
      "head": 2,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 1,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "wrote",
      "lemma": "write",
      "upos": "VERB",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
      "head": 0,
      "deprel": "root",
      "start_char": 2,
      "end_char": 7,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "another",
      "lemma": "another",
      "upos": "DET",
      "xpos": "DT",
      "feats": "PronType=Ind",
      "head": 4,
      "deprel": "det",
      "start_char": 8,
      "end_char": 15,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "document",
  

## Visualizations

In [2]:
# IPython display helpers, plus the package itself so its version can be reported.
# NOTE(review): HTML and display are not used in the cells visible here — confirm
# they are still needed.
from IPython.display import HTML, display
import IPython
# Print the installed IPython version (the saved output shows 9.7.0).
print(IPython.__version__)

9.7.0


In [3]:
# NER visualizer.
# NOTE(review): a later cell imports a different function with the same name
# from dependency_visualization, so `visualize_strings` refers to whichever of
# the two import cells ran last.
from stanza.utils.visualization.ner_visualization import visualize_strings

# Two sample texts for the NER visualizer: a multi-sentence paragraph and a
# short two-sentence example.
# The continuation lines of the triple-quoted string are indented, and that
# leading whitespace (together with the newlines) is part of the text passed on.
# NOTE(review): "War and Piece" looks like a typo for "War and Peace"; it is left
# untouched here because editing it would change the input text.
en_strings = ['''Samuel Jackson, a Christian man from Utah, went to the JFK Airport for a flight to New York.
                 He was thinking of attending the US Open, his favorite tennis tournament besides Wimbledon.
                 That would be a dream trip, certainly not possible since it is $5000 attendance and 5000 miles away.
                 On the way there, he watched the Super Bowl for 2 hours and read War and Piece by Tolstoy for 1 hour.
                 In New York, he crossed the Brooklyn Bridge and listened to the 5th symphony of Beethoven as well as
                 "All I want for Christmas is You" by Mariah Carey.''', 
              "Barack Obama was born in Hawaii. He was elected President of the United States in 2008"]
    
# Visualize both texts with the English models; the log below shows a
# tokenize + mwt + ner pipeline being loaded for this call.
visualize_strings(en_strings, "en")

2025-11-25 10:02:41 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-25 10:02:41 INFO: Downloaded file to C:\Users\erina\stanza_resources\resources.json
2025-11-25 10:02:42 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2025-11-25 10:02:42 INFO: Using device: cpu
2025-11-25 10:02:42 INFO: Loading: tokenize
2025-11-25 10:02:51 INFO: Loading: mwt
2025-11-25 10:02:51 INFO: Loading: ner
2025-11-25 10:02:54 INFO: Done loading processors!


In [14]:
# Dependency visualizer; it shares its function name with the NER visualizer
# imported in the previous cell but comes from a different module.
from stanza.utils.visualization.dependency_visualization import visualize_strings

# Processed interview transcript to visualize (relative path).
en_file = "data/processed/desantis_ron/bipartisan_and_other_speeches/nbc_interview_2023_processed.txt"

# Read the whole transcript into a single string.
with open(en_file, mode="r", encoding="utf-8") as handle:
    data = handle.read()

# The visualizer takes a list of strings, so the transcript is passed as a
# one-element list.
visualize_strings([data], "en")

2025-11-25 10:18:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-25 10:18:47 INFO: Downloaded file to C:\Users\erina\stanza_resources\resources.json
2025-11-25 10:18:48 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-25 10:18:48 INFO: Using device: cpu
2025-11-25 10:18:48 INFO: Loading: tokenize
2025-11-25 10:18:48 INFO: Loading: mwt
2025-11-25 10:18:48 INFO: Loading: pos
2025-11-25 10:18:51 INFO: Loading: lemma
2025-11-25 10:18:51 INFO: Loading: depparse
2025-11-25 10:18:52 INFO: Done loading processors!
