# File 02/3

## DESCRIPTION:
- Creates a new column "Lemma_norm". This column contains a graphematically simplified string based on the column "Lemma".

### Background: 
- The graphematic realization of one and the same word is not uniform across Old East Slavonic texts.
- For example, the realization of soft signs changed over time and was often not applied uniformly.
- In addition, the same letter may have multiple graphematic variants. This script aims to normalize such graphematic variation.
- Homonym indexing added by the editors is removed as part of the normalization process.

### INPUT_FILE:
 - "./OUTPUTS/dataframe_02_2.csv"

### OUTPUT_FILE: 
 - "./OUTPUTS/dataframe_02_3.csv"

In [1]:
%load_ext autoreload
%autoreload 2

from utils.metadata import get_meta_data
from utils.uniq_files import get_uniq_files

In [2]:
import pandas as pd 
import numpy as np
import re

In [3]:
# Load the input file named in the notebook header. No dtype conversion is
# performed here; low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-type inference per column.
df = pd.read_csv(
    "./OUTPUTS/dataframe_02_2.csv",
    low_memory=False,
)

In [4]:
def drop_unnamed_columns(df):
    """
    Return ``df`` restricted to the columns whose name does not
    begin with "Unnamed"; all rows are kept.
    """
    unnamed_mask = df.columns.str.startswith("Unnamed")
    keep_mask = ~unnamed_mask
    return df.loc[:, keep_mask]

# Rebind `df` so that all later cells work without the "Unnamed*" columns.
df = drop_unnamed_columns(df)

In [5]:
# Project helper from utils.metadata: prints an overview of the frame (see the
# output below) and its return value is rebound to `df`.
# NOTE(review): whether the helper returns the frame unchanged is not visible here — confirm.
df = get_meta_data(df)

Columns:

 ['File', 'Text Title', 'Language', 'Sentence ID', 'Token ID', 'Form', 'Lemma', 'POS', 'Morphology', 'Head ID', 'Relation', 'Presentation After', 'Russian Translation', 'English Translation', 'Type', 'century', 'exact', 'lang', 'source', 'place', 'region']

Amount of rows:	 235275
Amount of cols:	 21

Unique Files: 34



Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,POS,Morphology,Head ID,...,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place,region
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,I-,---------n,,...,,"вот, это","behold, here is",OR,12,1130.0,OR,,Novgorod,East Slavic
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,Pp,1s---mn--i,2157784.0,...,,я,I,OR,12,1130.0,OR,,Novgorod,East Slavic
2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,Ne,-s---mn--i,2157774.0,...,,Мстислав,Mstislav,OR,12,1130.0,OR,,Novgorod,East Slavic
3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,A-,-s---mnpsi,2157777.0,...,,Владимира,Vladimir's,OR,12,1130.0,OR,,Novgorod,East Slavic
4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,Nb,-s---mn--i,2157775.0,...,,сын,son,OR,12,1130.0,OR,,Novgorod,East Slavic


In [6]:
# Project helper from utils.uniq_files; with print_me=True it prints its result
# (see the output below: file names and a per-file list of language codes).
# NOTE(review): exact return structure is defined in utils.uniq_files — confirm there.
language_by_file = get_uniq_files(df,print_me=True)  

language_by_file: 34

{'spi', 'pskov', 'vest-kur', 'mstislav-col', 'novgorod-jaroslav', 'const', 'pvl-hyp', 'rusprav', 'schism', 'ostromir-col', 'kiev-hyp', 'afnik', 'avv', 'birchbark', 'luk-koloc', 'nov-list', 'lav', 'peter', 'ust-vlad', 'mst', 'drac', 'smol-pol-lit', 'usp-sbor', 'nov-marg', 'nov-sin', 'sergrad', 'dux-grjaz', 'riga-goth', 'zadon', 'rig-smol1281', 'domo', 'varlaam', 'pskov-ivan', 'suz-lav'}


{'mst': ['orv'], 'mstislav-col': ['orv'], 'birchbark': ['orv'], 'pskov': ['orv'], 'const': ['orv'], 'luk-koloc': ['orv'], 'lav': ['orv'], 'smol-pol-lit': ['orv'], 'nov-sin': ['orv'], 'avv': ['orv'], 'kiev-hyp': ['orv'], 'peter': ['orv'], 'vest-kur': ['orv'], 'spi': ['orv'], 'zadon': ['orv'], 'rusprav': ['orv'], 'pskov-ivan': ['orv'], 'rig-smol1281': ['orv'], 'drac': ['orv'], 'sergrad': ['orv'], 'nov-list': ['orv'], 'ostromir-col': ['orv'], 'varlaam': ['orv'], 'afnik': ['orv'], 'dux-grjaz': ['orv'], 'ust-vlad': ['orv'], 'riga-goth': ['orv'], 'domo': ['orv'], 'usp-sbor': ['orv'], 's

In [7]:
def get_list_of_base_form_verbs(column, data=None):
    """
    Return the sorted, de-duplicated values of ``column`` for all verb rows.

    Parameters
    ----------
    column : str
        Name of the column to read, e.g. "Lemma" or "Lemma_norm".
    data : pandas.DataFrame, optional
        Frame to read from; it must contain a "POS" column and ``column``.
        When omitted, the notebook-level ``df`` is looked up at call time,
        which is what the function always did before this parameter existed.

    Returns
    -------
    list
        Unique, non-missing values of ``column`` taken from the rows where
        POS == "V-", sorted in ascending order.
    """
    # Only fall back to the global frame when no frame was passed explicitly.
    frame = df if data is None else data

    # Filter: keep only entries where POS == "V-"
    is_verb = frame["POS"] == "V-"

    # Missing values are dropped: a NaN among the strings would make
    # sorted() raise a TypeError (float vs. str comparison).
    values = frame.loc[is_verb, column].dropna()

    # De-duplicate, then sort
    return sorted(set(values.tolist()))

# Collect the sorted, unique verb lemmas (POS == "V-") from column "Lemma".
base_forms_list = get_list_of_base_form_verbs("Lemma")

In [8]:
# Overview of the list: total number of unique verb lemmas,
# followed by the first and the last 20 entries in sort order.
print(len(base_forms_list), "\n")
print("first 20:", base_forms_list[:20], "\n")
print("last 20:", base_forms_list[-20:])

3804 

first 20: ['*рашити', 'алъкати', 'бабити', 'безаконьновати', 'безмълвьствовати', 'беречи', 'бесчиньствовати', 'бесчьствовати', 'бесѣдовати', 'бити', 'благоволити', 'благовѣстити', 'благовѣстовати', 'благодарити', 'благодѣтьствовати', 'благодѣяти', 'благоизволити', 'благоисправляти', 'благословити', 'благословляти'] 

last 20: ['шибати', 'шинути', 'ширити', 'ширяти', 'шити', 'шпыняти', 'шумѣти', 'шьствовати', 'щадѣти', 'щемити', 'щупати', 'явити', 'являти', 'явьствовати', 'язвити', 'ясти', 'яти', 'ѣздити', 'ѣсти', 'ѣхати']


## Create file "base_form_list.csv"

In [9]:
def create_base_form_list(base_form_list, words=None):
    """
    Write a list of base forms to a text file, one form per line.

    Parameters
    ----------
    base_form_list : str
        Path of the output file. Despite its name this is NOT the list of
        words; the name is kept so that existing calls keep working.
    words : iterable of str, optional
        The base forms to write. When omitted, the notebook-level
        ``base_forms_list`` is looked up at call time, which is what the
        function always did before this parameter existed.

    The file is opened in "w" mode (an existing file is overwritten) and
    encoded as UTF-8.
    """
    # Only fall back to the global list when no words were passed explicitly.
    entries = base_forms_list if words is None else words

    with open(base_form_list, "w", encoding="utf-8") as f:
        for word in entries:
            f.write(word + "\n")

# Write the verb lemmas to "base_form_list.csv" (relative path, i.e. in the
# current working directory), one lemma per line.
create_base_form_list("base_form_list.csv")

In [10]:
def get_all_letters_used_for_verbs(base_forms_list):
    """
    Collect every distinct character that occurs in the given words.

    Prints the sorted characters and their count, then returns the
    characters as a set.
    """
    letters = {char for word in base_forms_list for char in word}
    print("Used letters in verbs:", sorted(letters))
    print("Amount of unique letters:", len(letters))
    return letters

# Character inventory of the un-normalized verb lemmas; kept for the
# before/after comparison further down.
all_letters_in_verbs_set = get_all_letters_used_for_verbs(base_forms_list)

Used letters in verbs: [' ', '#', '*', '1', '2', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'ю', 'я', 'ѣ', 'ѧ', 'ҍ']
Amount of unique letters: 37


## Normalize the letters in all lemmas -> save the simplified form in "Lemma_norm"

In [11]:
# Rebind `df` to an independent copy before a column is added below.
# NOTE(review): presumably meant to avoid a SettingWithCopyWarning, since `df`
# was earlier produced by a `.loc` column selection — confirm.
df = df.copy()

# 1. One single placeholder char for both JER variants, i.e. <ь> and <ъ>
JER = "_"

# 2. Sequences that are collapsed with str.replace BEFORE the per-character
#    translation in step 3 ("оу" is two characters and cannot be expressed
#    in a one-to-one character table).
DIGRAPHS = {
    "оу": "у",
    "ѹ": "у",
}

# 3. Mapping table: one input character -> one output character.
#    An empty string deletes the character ("#", " ", "1", "2");
#    "?" is mapped to the same "_" that is used as the jer placeholder.
CHAR_MAP = {
    "#": "", " ": "", "1": "", "2": "", "?": "_",
    "ѣ": "е", "ҍ":"е","ѥ": "е",
    "ѧ": "я", "ѩ": "я", "ꙗ": "я",
    "ꙑ": "ы",
    "ѫ": "у","s":"з","ѕ":"з",
    # JER replacement
    "ъ": JER, "ь": JER,
}

# NOTE(review): this pattern is not used by normalize_letters (or anywhere
# else in the visible notebook); it is kept only so that the name stays defined.
ASCII_S = re.compile(r"s")

# Translation table for step 3, built once here instead of on every call:
# normalize_letters runs once per row. Re-run this cell after editing CHAR_MAP.
_CHAR_TABLE = str.maketrans(CHAR_MAP)


def normalize_letters(text):
    """
    Return a graphematically simplified version of ``text``.

    Parameters
    ----------
    text : object
        The string to normalize. Any non-string value (e.g. NaN or None)
        is returned unchanged.

    Returns
    -------
    object
        For strings: ``text`` with the DIGRAPHS sequences replaced first and
        the CHAR_MAP substitutions/deletions applied afterwards.
        For everything else: the input itself.
    """
    if not isinstance(text, str):
        return text

    # a) Replace DIGRAPHS
    for old, new in DIGRAPHS.items():
        text = text.replace(old, new)

    # b) All other letters where one letter corresponds to
    #    one single other letter (or is removed)
    return text.translate(_CHAR_TABLE)

# 1) Create (or overwrite) column 'Lemma_norm' by normalizing every value of
#    'Lemma' — all rows, not only verbs. Non-string values pass through unchanged.
df.loc[:, 'Lemma_norm'] = df['Lemma'].apply(normalize_letters)

# 2) Move 'Lemma_norm' directly to the right of 'Lemma':
#    remember the position of 'Lemma', take the new column out of the frame,
#    and re-insert it one position after 'Lemma'.
pos = df.columns.get_loc("Lemma")
norm = df.pop("Lemma_norm")
df.insert(pos+1, "Lemma_norm", norm)

In [12]:
# Visual check: "Lemma_norm" should now follow "Lemma" directly.
df.columns

Index(['File', 'Text Title', 'Language', 'Sentence ID', 'Token ID', 'Form',
       'Lemma', 'Lemma_norm', 'POS', 'Morphology', 'Head ID', 'Relation',
       'Presentation After', 'Russian Translation', 'English Translation',
       'Type', 'century', 'exact', 'lang', 'source', 'place', 'region'],
      dtype='object')

In [13]:
# Sanity checks after the transformation.
# a) The new column exists.
assert "Lemma_norm" in df.columns, "Column 'Lemma_norm' does not exist in DataFrame"
# b) Both columns have the same NUMBER of missing values. This compares counts
#    only; it does not verify that the missing values sit in the same rows.
assert df["Lemma"].isna().sum() == df["Lemma_norm"].isna().sum(), \
    "Not all lemmas in 'Lemma' have entry in 'Lemma_norm'"

In [14]:
# Show the first two rows to compare "Lemma" with "Lemma_norm".
df[:2]

Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,POS,Morphology,...,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place,region
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,се,I-,---------n,...,,"вот, это","behold, here is",OR,12,1130.0,OR,,Novgorod,East Slavic
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,аз_,Pp,1s---mn--i,...,,я,I,OR,12,1130.0,OR,,Novgorod,East Slavic


## Inspect the newly created column "Lemma_norm"

In [15]:
# Get the sorted, unique verb forms from "Lemma_norm"
input_col = "Lemma_norm"
base_forms_list_new = get_list_of_base_form_verbs(input_col)

# Number of unique normalized verb forms and the first 20 of them
print(len(base_forms_list_new))
print(base_forms_list_new[:20], "\n")

# Print list of all unique letters in "Lemma_norm"
# in order to see if any unintended letters remain
all_letters_in_verbs_set_new = get_all_letters_used_for_verbs(base_forms_list_new)

3778
['*рашити', 'ал_кати', 'б_дети', 'б_др_ствовати', 'б_рати', 'бабити', 'бегати', 'бегнути', 'бед_ствовати', 'бедити', 'бежати', 'безакон_новати', 'безм_лв_ствовати', 'беречи', 'бес_новати', 'беседовати', 'бесити', 'бесч_ствовати', 'бесчин_ствовати', 'бити'] 

Used letters in verbs: ['*', '_', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'х', 'ц', 'ч', 'ш', 'щ', 'ы', 'ю', 'я']
Amount of unique letters: 29


In [16]:
# Characters that occur in the original verb lemmas but no longer in the
# normalized ones, i.e. those that were removed or replaced by another character.
all_letters_in_verbs_set.difference(all_letters_in_verbs_set_new)

{' ', '#', '1', '2', 'ъ', 'ь', 'ѣ', 'ѧ', 'ҍ'}

## Write file to CSV 

In [17]:
# Final look at the frame before it is written to disk.
df

Unnamed: 0,File,Text Title,Language,Sentence ID,Token ID,Form,Lemma,Lemma_norm,POS,Morphology,...,Presentation After,Russian Translation,English Translation,Type,century,exact,lang,source,place,region
0,mst,Mstislav’s letter,orv,189407,2157773,Се,се,се,I-,---------n,...,,"вот, это","behold, here is",OR,12,1130.0,OR,,Novgorod,East Slavic
1,mst,Mstislav’s letter,orv,189407,2157774,азъ,азъ,аз_,Pp,1s---mn--i,...,,я,I,OR,12,1130.0,OR,,Novgorod,East Slavic
2,mst,Mstislav’s letter,orv,189407,2157775,мьстиславъ,мьстиславъ,м_стислав_,Ne,-s---mn--i,...,,Мстислав,Mstislav,OR,12,1130.0,OR,,Novgorod,East Slavic
3,mst,Mstislav’s letter,orv,189407,2157776,володимирь,володимирь,володимир_,A-,-s---mnpsi,...,,Владимира,Vladimir's,OR,12,1130.0,OR,,Novgorod,East Slavic
4,mst,Mstislav’s letter,orv,189407,2157777,сн҃ъ,сынъ,сын_,Nb,-s---mn--i,...,,сын,son,OR,12,1130.0,OR,,Novgorod,East Slavic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235270,pvl-hyp,6429,orv,203056,2250675,у҃,четырисъта,четырис_та,Ma,---------n,...,.,400,400,OR,12,,Novgorod,,Pskov,East Slavic
235271,pvl-hyp,6429,orv,203056,2250676,к҃,дъвадесяти,д_вадесяти,Ma,---------n,...,,20,20,OR,12,,Novgorod,,Pskov,East Slavic
235272,pvl-hyp,6429,orv,203056,2257914,ѳ,девять,девят_,Ma,---------n,...,·:·,9,9,OR,12,,Novgorod,,Pskov,East Slavic
235273,pvl-hyp,6429,orv,203056,2257915,,,,,,...,,,,OR,12,,Novgorod,,Pskov,East Slavic


In [18]:
# Persist the frame, including the new "Lemma_norm" column.
# NOTE(review): the row index is written as well (pandas default index=True),
# which shows up as an "Unnamed: 0" column when the file is read back without
# index_col. Passing index=False would avoid that — confirm that no downstream
# step relies on the index column before changing it.
df.to_csv("OUTPUTS/dataframe_02_3.csv")