In [1]:
import stanza
stanza.download('et')  # Download the English models

nlp = stanza.Pipeline('et', processors='tokenize,pos,lemma,depparse')

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 166MB/s]                     
2024-10-21 15:19:32 INFO: Downloading default packages for language: et (Estonian) ...
Downloading https://huggingface.co/stanfordnlp/stanza-et/resolve/v1.6.0/models/default.zip: 100%|██████████| 212M/212M [00:09<00:00, 23.5MB/s] 
2024-10-21 15:19:44 INFO: Finished downloading models and saved to /home/maria/stanza_resources.
2024-10-21 15:19:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 62.3MB/s]                    
2024-10-21 15:19:45 INFO: Loading these models for language: et (Estonian):
| Processor | Package      |
-----------

In [6]:
def extract_subject_of_verb(doc, target_verb):
    """
    Extracts the subject of a specified verb from a parsed Stanza document.

    Parameters:
    - doc: Parsed Stanza Document
    - target_verb: The verb whose subject needs to be extracted

    Returns:
    - subject_phrase: The extracted subject as a string
    """
    subject_phrase = ""
    
    for sent in doc.sentences:
        # Iterate through each word to find the target verb
        for word in sent.words:
            print(word)
            if word.text.lower() == target_verb.lower() and word.upos == 'VERB':
                verb_id = word.id
                # Find subjects connected to the verb
                subjects = [w for w in sent.words if w.head == verb_id and w.deprel in ('obj')]
                
                for subj in subjects:
                    # Collect all modifiers related to the subject
                    subject_tokens = [subj]
                    for w in sent.words:
                        if w.head == subj.id and w.deprel in ('det', 'amod', 'compound', 'neg', 'nummod', 'appos'):
                            subject_tokens.append(w)
                    
                    # Sort tokens based on their position in the sentence
                    subject_tokens = sorted(subject_tokens, key=lambda x: x.id)
                    
                    # Concatenate the tokens to form the subject phrase
                    subject_phrase = ' '.join([w.text for w in subject_tokens])
    
    return subject_phrase

doc=nlp("Kontrakt tehti tana.")
subject = extract_subject_of_verb(doc, 'tehti')
subject


{
  "id": 1,
  "text": "Kontrakt",
  "lemma": "kontrakt",
  "upos": "NOUN",
  "xpos": "S",
  "feats": "Case=Nom|Number=Sing",
  "head": 2,
  "deprel": "obj",
  "start_char": 0,
  "end_char": 8
}
{
  "id": 2,
  "text": "tehti",
  "lemma": "tegema",
  "upos": "VERB",
  "xpos": "V",
  "feats": "Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Pass",
  "head": 0,
  "deprel": "root",
  "start_char": 9,
  "end_char": 14
}
{
  "id": 3,
  "text": "tana",
  "lemma": "tana",
  "upos": "ADV",
  "xpos": "D",
  "head": 2,
  "deprel": "advmod",
  "start_char": 15,
  "end_char": 19
}
{
  "id": 4,
  "text": ".",
  "lemma": ".",
  "upos": "PUNCT",
  "xpos": "Z",
  "head": 2,
  "deprel": "punct",
  "start_char": 19,
  "end_char": 20
}


'Kontrakt'