# File 02/4

# DESCRIPTION:
Mark verbs as negated if they fall within the scope of a negation particle.
Record the negation particle in the column Negation_Marker.

# INPUT_FILE:

./OUTPUTS/dataframe_02_3.csv

# OUTPUT_FILE:
./OUTPUTS/dataframe_02_4.csv

In [1]:
from collections import defaultdict
import pandas as pd

In [2]:
# Read csv
csv_path = 'OUTPUTS/dataframe_02_3.csv'
df = pd.read_csv(csv_path, low_memory=False)

# Drop columns beginning with "Unnamend ..." 

In [3]:
def drop_unnamed_columns(df):
    # Delete columns named "Unnamed"
    return df.loc[:, ~df.columns.str.startswith("Unnamed")]
df = drop_unnamed_columns(df)

In [4]:
print(df.columns)
df.File.unique()

Index(['File', 'Text Title', 'Language', 'Sentence ID', 'Token ID', 'Form',
       'Lemma', 'Lemma_norm', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'source', 'place', 'region'],
      dtype='object')


array(['mst', 'mstislav-col', 'birchbark', 'pskov', 'const', 'luk-koloc',
       'lav', 'smol-pol-lit', 'nov-sin', 'avv', 'kiev-hyp', 'peter',
       'vest-kur', 'spi', 'zadon', 'rusprav', 'pskov-ivan',
       'rig-smol1281', 'drac', 'sergrad', 'nov-list', 'ostromir-col',
       'varlaam', 'afnik', 'dux-grjaz', 'ust-vlad', 'riga-goth', 'domo',
       'usp-sbor', 'schism', 'nov-marg', 'suz-lav', 'novgorod-jaroslav',
       'pvl-hyp'], dtype=object)

In [5]:
df_cols_before = df.columns

In [6]:
df.head(2)

Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,POS,Morphology,...,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place,region
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,се,I-,---------n,...,,"вот, это","behold, here is",OR,12,1130.0,OR,,Novgorod,East Slavic
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,аз_,Pp,1s---mn--i,...,,я,I,OR,12,1130.0,OR,,Novgorod,East Slavic


# Define "Negation" particles; parse Head and Token IDs as Integers

In [7]:
# 1. Parse IDs as Integers
df['Token ID'] = pd.to_numeric(df['Token ID'], errors='coerce').astype('Int64')
df['Head ID']  = pd.to_numeric(df['Head ID'],  errors='coerce').astype('Int64')

# 2. Initiate column "Negation" with values of Type Bool 
# Set default to "False"
df['Negation'] = False
df['Negation_Marker'] = ''

# 3. Define negation particles
negations = {'не','ни'}

In [8]:
# Check if the new cols were added 
df_cols_new = df.columns
df_cols_added = df_cols_new.difference(df_cols_before)
df_cols_added

Index(['Negation', 'Negation_Marker'], dtype='object')

## Rearrange the order of columns in the DataFrame, delete Column "source"

In [9]:
cols_at_end = ["Russian Translation ", "English Translation ", "place", "trans", "source"]
df = df[[c for c in df if c not in cols_at_end] 
        + [c for c in cols_at_end if c in df and c != "source"]]

In [10]:
df.columns

Index(['File', 'Text Title', 'Language', 'Sentence ID', 'Token ID', 'Form',
       'Lemma', 'Lemma_norm', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'region', 'Negation',
       'Negation_Marker', 'place'],
      dtype='object')

# Create Morphology Tree

## FUNCTION CALLS in 
- build_phrase
- build_children_map
- process_sentence

1. call process_sentence(group) # "process_sentence"
    

In [11]:
def build_children_map(group):
    """
    Create a Tree containing the relations between Token ID, Head ID and Relation

    Output: 
        Verb's "Token ID"                   {'attribute1': ['Token ID'], 'attribute2': ['Token ID'], ...  }
    Output Example:                   
        2157784 defaultdict(<class 'list'>, {'sub': [2157774], 'xadv': [2157778], 'aux': [2157785], 'obl': [2157786], 'xobj': [2157789]})
    """
    # create dict[dict[list]]
    children = defaultdict(lambda: defaultdict(list))
    # for all VERBS (i.e. "group")
    for _, row in group.iterrows():
        # determine Token ID, Head ID, Relation
        token_id = int(row['Token ID'])
        head_id = row['Head ID']
        rel = row['Relation']
        # exclude sentences where no Head ID exists 
        if pd.isna(head_id) or head_id == 0:
            continue
        parent_id = int(head_id)
        # for each verb's "Head ID"  -> 
        children[parent_id][rel].append(token_id)

    return children


In [12]:
def build_phrase(token_id, children, group):
    """
    Return a list of partial sentences (phrase_parts) containing 
    [Subject] [быти‐Aux + eventually Negation + Verb] [Objects + its Attributes]
    """

    # 1. Zeile für dieses Token holen
    row = group.loc[group['Token ID'] == token_id].iloc[0]
    form = row['Form']
    if pd.isna(form) and not pd.isna(row.get('empty-token-sort')):
        form = f"<{row['empty-token-sort']}-V>"
    elif pd.isna(form):
        form = ''

    # 2.1 Subjekt (sub, xsub)
    subs = (children.get(token_id, {}).get('sub', []) +
            children.get(token_id, {}).get('xsub', []))
    subject_phrase = ''
    if subs:
        parts_sub = []
        for sub_id in subs:
            # rekursiver Aufruf für Unter-Satz/Subjekt
            parts_sub.append(' '.join(build_phrase(sub_id, children, group)))
        subject_phrase = ' '.join(parts_sub)

    # 2.2a Hilfsverb 'быти' (aux)
    bytis = []
    for aux_id in children.get(token_id, {}).get('aux', []):
        aux_row = group.loc[group['Token ID'] == aux_id].iloc[0]
        if aux_row['Lemma'] == 'быти':
            bytis.append(aux_row['Form'])  # z.B. 'ѥсмь'

    # 2.2b ggf. Negation (aux + Lemma in {"не","ни"})
    negs = []
    for aux_id in children.get(token_id, {}).get('aux', []):
        aux_row = group.loc[group['Token ID'] == aux_id].iloc[0]
        if aux_row['Lemma'] in {'не', 'ни'}:
            negs.append(aux_row['Form'])

    # 2.3 Objekt(e) (obj, xobj)
    obj_ids = (children.get(token_id, {}).get('obj', []) +
               children.get(token_id, {}).get('xobj', []))
    object_phrases = []
    for obj_id in obj_ids:
        # rekursiver Aufruf für Objekt‐Phrase
        object_phrases.append(' '.join(build_phrase(obj_id, children, group)))

    # 2.4 Attribute (atr)
    atr_ids = children.get(token_id, {}).get('atr', [])
    attr_phrases = []
    for atr_id in atr_ids:
        # rekursiver Aufruf für Attribute
        attr_phrases.append(' '.join(build_phrase(atr_id, children, group)))

    # 2.5 Zusammengesetztes Verb (Aux + Negation + Verb‐Form)
    parts_verb = []
    if bytis:
        parts_verb += bytis        # z.B. ['ѥсмь']
    if negs:
        parts_verb += negs         # z.B. ['не']
    parts_verb.append(form)       # z.B. ['ѥсмь', 'не', 'повелѣлъ'] oder ['ѥсмь', 'повелѣлъ']
    verb_phrase = ' '.join(parts_verb)

    # 2.6 Falls Attribute (Atr) zum Verb selbst existieren, anhängen
    if attr_phrases:
        # Mehrere Attribute in Klammern, durch Komma getrennt
        verb_phrase += ' (' + ', '.join(attr_phrases) + ')'

    # 2.7 Endgültige Phrasen‐Liste zusammenbauen
    phrase_parts = []
    if subject_phrase:
        phrase_parts.append(subject_phrase)
    phrase_parts.append(verb_phrase)
    if object_phrases:
        phrase_parts.append(', '.join(object_phrases))

    return phrase_parts

In [13]:
def process_sentence(group):
    """
    - Check if a sentence contains negation elemenets, mark the sentence with 
    bool "Negation" and – if sentence is negated – write the Negation Marker(s) to
    column "Negation_Marker"
    - Get the sentence's tree structure 

    Calls function "build_children_map"
    Calls function "build_phrase"
    """
    # --- A) Check for negation and WRITE in the NEGATED VERB's COLUMN "Negation" == True
    # and "Negation_Marker": the negation marker ---
    negations = {'не','ни'}
    group = group.copy()
    # MASK: contains all verbs (df[POS]="V-")
    mask_verbs = group['POS'].fillna('').str.startswith('V-')
    for idx, verb in group[mask_verbs].iterrows():
        # get "Token ID"
        token_id = verb['Token ID']
        # Filter -> get negations corresponding to the verb:  
        negs = group.loc[
            # conditions for the negation marker: 
            # Lemma must be in the negations list 
            (group['Lemma'].isin(negations)) &
            # negation marker's "Head ID" must be the corresponding verb's token_id
            (group['Head ID'] == token_id) &
            # the negation marker's "Relation" must be "aux" (i.e. "auxiliary")
            (group['Relation'].isin(['aux'])),
            :
        ]
        # if the "negs" filter conditions are True: 
        if not negs.empty:
            # in the column of the negated verb: 
            # mark col "Negation" as True
            group.at[idx, 'Negation'] = True
            # write the Lemma(s) of "negs" to the negated verb's col "Negation_Marker" 
            group.at[idx, 'Negation_Marker'] = ', '.join(negs['Lemma'].dropna())

    # --- B) Get the Tree structure by calling "build_children_map" for each VERB ---
    children = build_children_map(group)

    # DEBUG: PRINT
    #print("CHILDREN")
    #for k,v in children.items():
    #    print(k, v)
    
    # --- C) Determine the root verb ---
    # Search for Token where "Relation" is "pred" (predicate) AND which "Head ID" is 
    # none or NaN 
    root = None
    for _, row in group.iterrows():
        if row['Relation'] == 'pred' and (pd.isna(row['Head ID']) or int(row['Head ID']) == 0):
            root = int(row['Token ID'])
            break
    
    # As sometimes a root cannot be found in the sentences of the DF: 
    # If no root is found, assume the predicate is head of the sentence, i.e. 
    # where Relation="pred"
    if root is None:
        possible = group.loc[group['Relation']=='pred']
        if not possible.empty:
            root = int(possible.iloc[0]['Token ID'])
    
    # --- D) Create the tree ---
    sentence_str = ''
    if root is not None:
        phrase_parts = build_phrase(root, children, group)
        # phrase_parts is stored as a list of strings 
        sentence_str = ' '.join([p for p in phrase_parts if p.strip() != ''])
    
    # --- E) Store the Tree Diagram in a separate Column ---
    group['Sentence_Text'] = sentence_str
    
    return group

In [14]:
# Apply fn "process_sentence" to all verbs
df = (
    df
    .groupby('Sentence ID', group_keys=False)
    .apply(process_sentence)
)

# 5. Write df to file
df.to_csv('OUTPUTS/data_with_nested_phrases.csv', index=False)

  .apply(process_sentence)


# Example for a sentence with negation:

In [15]:
# 1. Find the first Sentence ID for a sentence containing negation 
first_neg_sent = df.loc[df["Negation"], "Sentence ID"].iloc[4]

# 2. Filter all cols with this Sentence ID 
first_sentence_df = df[df["Sentence ID"] == first_neg_sent]

In [16]:
first_sentence_df[["Form", "Lemma","Sentence ID", "Token ID","POS", "Head ID", "Negation", "Negation_Marker"]]

Unnamed: 0,Form,Lemma,Sentence ID,Token ID,POS,Head ID,Negation,Negation_Marker
597,а,а,210175,2287614,C-,2287829.0,False,
598,четъ,чьто,210175,2287615,G-,2287829.0,False,
599,ѡмьшҍ,омешь,210175,2287616,Nb,2287617.0,False,
600,пришлю,присълати,210175,2287617,V-,2287615.0,False,
601,и,и,210175,2287618,C-,,False,
602,вꙑ,вы,210175,2287619,Pp,2287624.0,False,
603,имъ,и,210175,2287620,Pp,2287624.0,False,
604,къне,конь,210175,2287621,Nb,2287624.0,False,
605,мъи,мои,210175,2287622,Pp,2287621.0,False,
606,голубꙑи,голубыи,210175,2287623,A-,2287621.0,False,


In [17]:
df.to_csv("OUTPUTS/dataframe_02_4.csv")