# File 02/6

# DESCRIPTION:
For each sentence in the DataFrame:
- The main verb of a compound tense construction is identified and marked (`Is_Compound_Main`).
- Auxiliary verbs are assigned to their main verb via `Compound_Aux_Forms` and `Compound_Aux_IDs`
  (token IDs of the auxiliary verbs).
- Auxiliary verbs participating in compound tense constructions are marked (`Is_Compound_Aux`).

## INPUT:
- ./OUTPUST/dataframe_02_5.csv
## OUTPUT: 
- ./OUTPUTS/dataframe_02_6.csv

In [1]:
import pandas as pd

In [2]:
# Read CSV
csv_path="OUTPUTS/dataframe_02_5.csv"
df = pd.read_csv(csv_path,
                 dtype={"Russian Translation": "string", "English Translation": "string"})

In [3]:
print(df.columns)

Index(['Unnamed: 0', 'File', 'Text Title', 'Language', 'Sentence ID',
       'Token ID', 'Form', 'Lemma', 'Lemma_norm', 'V_yva', 'V_nuti', 'V_COMB',
       'V_prefix', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'region', 'Negation',
       'Negation_Marker', 'place', 'Sentence_Text'],
      dtype='object')


In [4]:
def mark_compound_verbs(df):
    """
    For every sentence: mark those main verbs that have an auxiliary verb (aux) "byti". 
    Add the following three new cols to the DataFrame to the MAIN VERB: 
      - 'Is_Compound_Main'    (True, if this token is the main verb as part of a compound verb)
      - 'Compound_Aux_Forms'  (List of all auxiliary forms of 'быти' that belong to the compound verb)
      - 'Compound_Aux_IDs'    (List of all Token-IDs of the auxiliaries of 'быти' found for the according main verb)
    """
    df = df.copy()
    # Initialise new columns 
    df['Is_Compound_Main']   = False
    df['Compound_Aux_Forms'] = [[] for _ in range(len(df))]
    df['Compound_Aux_IDs']   = [[] for _ in range(len(df))]

    # group after sentence id -> so that the search will be done inside of the separate sentences 
    for sid, group in df.groupby('Sentence ID'):
        # 1) find all auxiliaries 'byti' (defined by properties: relation == "aux"; Lemma == 'быти' (='byti'), POS = "V-" )
        aux_byti = group.loc[
            (group['Lemma'] == 'быти') &
            (group['Relation'] == 'aux'),
            # For each entry: create a PANDAS DF containing the columns 'Token ID', 'Form' and 'Head ID'
            ['Token ID', 'Form', 'Head ID']
        ]

        # Mark auxiliary verbs of 'byti' in the DataFrame:
        # iterate over the Token IDs in the auxiliary DataFrame (aux_byti),
        # locate the corresponding rows in the main DataFrame (df),
        # and set 'Is_Compound_Aux' to True for those tokens
        for aux_tid in aux_byti['Token ID']:
            df.loc[df['Token ID'] == aux_tid, 'Is_Compound_Aux'] = True

        # 2) Build two maps:
        #    aux_map_forms:  main_verb_id -> list of auxiliary forms
        #    aux_map_ids:    main_verb_id -> list of auxiliary verbs' token IDs
        aux_map_forms = {}
        aux_map_ids   = {}
        for _, row in aux_byti.iterrows():
            main_id      = int(row['Head ID'])    # the auxiliary verb's ID
            form_aux     = row['Form']            # e.g. 'ѥсмь' ("jesm'")
            aux_token_id = int(row['Token ID'])   # e.g. 2157785

            aux_map_forms.setdefault(main_id, []).append(form_aux)
            aux_map_ids.setdefault(main_id, []).append(aux_token_id)

        # 3) Mark the main verbs in aux_map_forms.keys()
        for _, row in group.iterrows():
            tid = int(row['Token ID'])
            if tid in aux_map_forms:
                # Main verb's 'Is_Compound_Main' == True
                df.at[row.name, 'Is_Compound_Main']   = True
                # for main verb's row: store the list of Forms of the auxiliary verb(s) in col 'Compound_Aux_Forms'
                df.at[row.name, 'Compound_Aux_Forms'] = aux_map_forms[tid]
                # for main verb's row: store the list of IDs of the auxiliary verb(s) in col 'Compound_Aux_IDs'
                df.at[row.name, 'Compound_Aux_IDs']   = aux_map_ids[tid]

    return df

In [5]:
df = mark_compound_verbs(df)

In [6]:
df.head(10)
df_sub = df[df['Is_Compound_Main'] == True]

In [7]:
df_sub 

Unnamed: 0.1,Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,V_yva,...,lang,region,Negation,Negation_Marker,place,Sentence_Text,Is_Compound_Main,Compound_Aux_Forms,Compound_Aux_IDs,Is_Compound_Aux
11,11,mst,Mstislav’s letter,orv,189407,2157784,повелѣлъ,повелѣти,повелети,False,...,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))",True,[ѥсмь],[2157785],
116,116,mst,Mstislav’s letter,orv,189417,2157888,далъ,дати,дати,False,...,OR,East Slavic,False,,Novgorod,"ꙗ ѥсмь далъ блюдо (серебрьно, въ)",True,[ѥсмь],[2157889],
127,127,mst,Mstislav’s letter,orv,189702,2157899,велѣлъ,велѣти,велети,False,...,OR,East Slavic,False,,Novgorod,ѥсмь велѣлъ бити,True,[ѥсмь],[2157900],
287,287,mstislav-col,Colophon to Mstislav’s Gospel book,orv,213363,2305796,казалъ,казати,казати,False,...,OR,East Slavic,False,,Novgorod,Азъ съпьсахъ,True,[бѧшеть],[2305795],
487,487,birchbark,109,orv,210149,2287509,кѹпилъ,купити,купити,False,...,birchbark,East Slavic,False,,Novgorod,еси кѹпилъ робѹ,True,[еси],[2287510],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234374,234374,pvl-hyp,6420,orv,202882,2249801,въпрошалъ,въпрошати,в_прошати,False,...,Novgorod,East Slavic,False,,Pskov,бѣ въпрошалъ,True,[бѣ],[2249798],
234381,234381,pvl-hyp,6420,orv,202883,2249808,оумьрети,умрѣти,умрети,False,...,Novgorod,East Slavic,False,,Pskov,ми есть оумьрети,True,[есть],[2249807],
234456,234456,pvl-hyp,6420,orv,204114,2249884,рекъли,рещи,рещи,False,...,Novgorod,East Slavic,False,,Pskov,"помѧну конь (свои, волъстви бѧху рекъли оумрети)",True,[бѧху],[2249883],
234471,234471,pvl-hyp,6420,orv,202899,2249899,поставилъ,поставити,поставити,False,...,Novgorod,East Slavic,False,,Pskov,"конь (мои, бѣхъ поставилъ егоже) есть кде",True,[бѣхъ],[2249898],


In [8]:
df.columns

Index(['Unnamed: 0', 'File', 'Text Title', 'Language', 'Sentence ID',
       'Token ID', 'Form', 'Lemma', 'Lemma_norm', 'V_yva', 'V_nuti', 'V_COMB',
       'V_prefix', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'region', 'Negation',
       'Negation_Marker', 'place', 'Sentence_Text', 'Is_Compound_Main',
       'Compound_Aux_Forms', 'Compound_Aux_IDs', 'Is_Compound_Aux'],
      dtype='object')

In [9]:
assert (
    df.loc[df["Is_Compound_Main"], "Compound_Aux_IDs"]
      .apply(lambda ids: len(ids) > 0)
      .all()
), "Some compound main verbs have no auxiliary IDs"

In [10]:
assert (
    df.loc[df["Is_Compound_Main"], "Compound_Aux_Forms"]
      .apply(lambda ids: len(ids) > 0)
      .all()
), "Some compound main verbs have no auxiliary forms"

In [11]:
# get an indexed series (True/False) for condition:
# List in col "Compound_Aux_IDs" is not empty
mask_has_aux = df["Compound_Aux_IDs"].apply(lambda ids: len(ids) > 0)

# assert for these entries: 
# elements in List of col "Compound_Aux_IDs" and col "Compound_Aux_Forms"
# have the same length 
assert (
    df.loc[mask_has_aux, ["Compound_Aux_IDs", "Compound_Aux_Forms"]]
      .apply(lambda row: len(row["Compound_Aux_IDs"]) == len(row["Compound_Aux_Forms"]), axis=1)
      .all()
), "Mismatch between number of auxiliary IDs and forms"

In [12]:
df.to_csv("./OUTPUTS/dataframe_02_6.csv")

In [13]:
df.loc[mask_has_aux]

Unnamed: 0.1,Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,V_yva,...,lang,region,Negation,Negation_Marker,place,Sentence_Text,Is_Compound_Main,Compound_Aux_Forms,Compound_Aux_IDs,Is_Compound_Aux
11,11,mst,Mstislav’s letter,orv,189407,2157784,повелѣлъ,повелѣти,повелети,False,...,OR,East Slavic,False,,Novgorod,"азъ ѥсмь повелѣлъ ѿдати бѹицѣ (и (съ, съ, съ))",True,[ѥсмь],[2157785],
116,116,mst,Mstislav’s letter,orv,189417,2157888,далъ,дати,дати,False,...,OR,East Slavic,False,,Novgorod,"ꙗ ѥсмь далъ блюдо (серебрьно, въ)",True,[ѥсмь],[2157889],
127,127,mst,Mstislav’s letter,orv,189702,2157899,велѣлъ,велѣти,велети,False,...,OR,East Slavic,False,,Novgorod,ѥсмь велѣлъ бити,True,[ѥсмь],[2157900],
287,287,mstislav-col,Colophon to Mstislav’s Gospel book,orv,213363,2305796,казалъ,казати,казати,False,...,OR,East Slavic,False,,Novgorod,Азъ съпьсахъ,True,[бѧшеть],[2305795],
487,487,birchbark,109,orv,210149,2287509,кѹпилъ,купити,купити,False,...,birchbark,East Slavic,False,,Novgorod,еси кѹпилъ робѹ,True,[еси],[2287510],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234374,234374,pvl-hyp,6420,orv,202882,2249801,въпрошалъ,въпрошати,в_прошати,False,...,Novgorod,East Slavic,False,,Pskov,бѣ въпрошалъ,True,[бѣ],[2249798],
234381,234381,pvl-hyp,6420,orv,202883,2249808,оумьрети,умрѣти,умрети,False,...,Novgorod,East Slavic,False,,Pskov,ми есть оумьрети,True,[есть],[2249807],
234456,234456,pvl-hyp,6420,orv,204114,2249884,рекъли,рещи,рещи,False,...,Novgorod,East Slavic,False,,Pskov,"помѧну конь (свои, волъстви бѧху рекъли оумрети)",True,[бѧху],[2249883],
234471,234471,pvl-hyp,6420,orv,202899,2249899,поставилъ,поставити,поставити,False,...,Novgorod,East Slavic,False,,Pskov,"конь (мои, бѣхъ поставилъ егоже) есть кде",True,[бѣхъ],[2249898],


In [14]:
mask_has_aux.value_counts()

Compound_Aux_IDs
False    234166
True       1109
Name: count, dtype: int64