In [8]:
import collections

In [57]:
def _typeA_split_stem(stem):
    '''
    Input: A non-empty verb stem (a word with its past/perfective affix already removed)
    Output: A (root, extender) tuple
    '''
    extender = stem[-1] #by default the extender is the last character of the stem
    #geminates like ɲɲ or nn (and clusters starting with 'ന') span three characters: consonant + virama + consonant.
    #Only look three characters back when the stem is that long; otherwise stem[-3] raises IndexError.
    if len(stem) >= 3 and (extender == stem[-3] or stem[-3] == 'ന'):
        extender = stem[-3:]
    #the extender is by construction a suffix of the stem, so slicing removes exactly its last occurrence
    #(this matters for words like 'തൊഴുതു' where 'ത' occurs twice)
    return stem[:-len(extender)], extender


def typeA_dict(string):
    '''
    Input: A string of whitespace-separated words
    Output: A dictionary with keys as verb roots and values as Type-A stem extenders

    Two kinds of word are recognised: past forms ending in -u and perfective forms ending in -iʈʈunɖe.
    A word that consists of nothing but the affix is skipped, and stems shorter than three characters
    get a single-character extender instead of raising IndexError.
    '''
    perfective = 'ിട്ടുണ്ട്' #the -iʈʈunɖe affix
    typeA_words = {} #dictionary with roots as keys and extenders as values
    for word in string.split():
        #Type-A past forms end in -u; reject Type-B shapes: 'ക്ക' sitting right before a five-character ending,
        #or the Type-B endings -unnu / -ukayaanu anywhere in the word
        if word[-1] == 'ു' and word[-8:-5] != 'ക്ക' and 'ുന്നു' not in word and 'ുകയാണു' not in word:
            stem = word[:-1] #stem is the word minus the past affix
        elif word.endswith(perfective):
            stem = word[:-len(perfective)]
            #drop an epenthetic /y/ or /v/ inserted before the affix; the slice is safe on an empty stem
            if stem[-1:] in ('യ', 'വ'):
                stem = stem[:-1]
        else:
            continue
        if not stem: #the word was only an affix, there is nothing to analyse
            continue
        root, extender = _typeA_split_stem(stem)
        #a root seen again (e.g. from both its past and perfective form) simply overwrites its entry
        typeA_words[root] = extender
    return typeA_words
#Note: dictionary keys are unique, so if both 'ചെയ്തു' and 'ചെയ്തിട്ടുണ്ട്' occur in the input, they collapse into a
#single entry for their shared root 'ചെയ്' (the later word overwrites the earlier one); this serves our purpose.

In [58]:
def typeB_dict(string):
    '''
    Input: A string of whitespace-separated words
    Output: A dictionary with keys as verb roots and values as Type-B stem extenders,
    either 'ക്ക' (kk) or '0' for the null extender

    Only words that END in one of the Type-B endings are analysed; words that do not are ignored.
    '''
    #Type-B endings. 'ുക' used to be written with a leading space, which can never match a token produced by split()
    typeB_ending = ('ുന്നു', 'ും', 'ുക', 'ുകയാണു')
    kk = 'ക്ക'
    typeB_words = {} #dictionary with roots as keys and extenders as values
    for word in string.split():
        #none of the endings is a suffix of another, so at most one of them can match
        ending = next((e for e in typeB_ending if word.endswith(e)), None)
        if ending is None:
            continue
        #slice the suffix off: str.strip() treats its argument as a SET of characters and eats from both
        #ends, which mangled roots such as 'നട' in 'നടക്കുന്നു'
        stem = word[:-len(ending)]
        #drop an epenthetic /y/ or /v/ inserted before the ending; the slice is safe on an empty stem
        if stem[-1:] in ('യ', 'വ'):
            stem = stem[:-1]
        if not stem: #the word was only an ending, there is nothing to analyse
            continue
        if stem.endswith(kk): #two kinds of type-B extenders, either kk or null
            extender = kk
            root = stem[:-len(kk)] #remove exactly one 'ക്ക', not every trailing 'ക'/virama as rstrip() would
        else:
            extender = '0'
            root = stem
        typeB_words[root] = extender
    return typeB_words

In [59]:
def sort_dict(dict):
    '''
    Input: Takes a dictionary in any order
    Output: Returns a collections.OrderedDict holding the same items, ordered by ascending value;
    items with equal values keep the relative order they had in the input
    '''
    by_value = collections.OrderedDict()
    #sorted() is stable, so ties on the value stay in their original order
    for key, value in sorted(dict.items(), key=lambda pair: pair[1]):
        by_value[key] = value
    return by_value

In [64]:
def match_items(typeA_words, typeB_words):
    '''
    Input: Takes two dictionaries
    Output: Returns a tuple of two new dictionaries holding only the keys present in both inputs,
    each with the values from its own input. Both results list the shared keys in the order in
    which they appear in typeA_words.
    '''
    #walk typeA_words in insertion order instead of iterating a set intersection: the iteration order of a
    #set of strings changes between interpreter runs (hash randomisation), which made the order of the
    #results - and anything printed or stably sorted from them - irreproducible across kernel restarts
    shared_roots = [root for root in typeA_words if root in typeB_words]
    matchtypeA = {root: typeA_words[root] for root in shared_roots}
    matchtypeB = {root: typeB_words[root] for root in shared_roots}
    return (matchtypeA, matchtypeB)


In [63]:
def score(typeA, typeB):
    '''
    Input: Takes two dictionaries with keys as stems and values as extenders; every key of typeA
    must also be a key of typeB (KeyError otherwise)
    Output: Returns the number of groups of stems that share a type-A extender AND all have the same
    type-B extender. Groups with a single stem are not counted. The result is the raw count; it is
    not divided by the number of groups.
    '''
    groups = {} #dictionary with key as type-A extender and value as the list of stems with that extender
    for stem, extA in typeA.items():
        groups.setdefault(extA, []).append(stem)

    match = 0
    for stems in groups.values():
        #type-B extenders for this group. For ex: for a group [cheyy, peyy], extB_same = [0,0]
        extB_same = [typeB[stem] for stem in stems]
        if len(extB_same) < 2: #a group of one has nothing to agree with
            continue
        #every element must agree; any() was always true here because the first element equals itself
        if all(ext == extB_same[0] for ext in extB_same):
            match += 1
    return match

In [67]:
'''
Demo input: a mixed string of verb forms, Type-A and Type-B alike.
Transliteration of the words in `string`, in order, with hyphens marking root-extender-affix:
cheyy-t̪-u, peyy-t̪-u, ira-nn-u, upadeʃi-ch-u, para-nn-u, no-nt̪-u, ve-nt̪-u, para-kk-unnu, chuma-kk-unnu,
peyy-unnu, chuma-nn-u, upadeʃi-kk-unnu, ira-kk-unnu, chuma-kk-unnu, cheyy-t̪-iʈʈunɖe, cheyy-ukayaanu, ada-ɲɲ-iʈʈunɖe
'''
string = 'ചെയ്തു പെയ്തു ഇരന്നു ഉപദേശിച്ചു പറന്നു നൊന്തു വെന്തു പറക്കുന്നു ചുമക്കുന്നു പെയ്യുന്നു ചുമന്നു ഉപദേശിക്കുന്നു ഇരക്കുന്നു ചുമക്കുന്നു ചെയ്തിട്ടുണ്ട് ചെയ്യുകയാണു അടഞ്ഞിട്ടുണ്ട്'

#extract the root -> extender mapping of each type from the raw text
typeA_words = typeA_dict(string)
typeB_words = typeB_dict(string)

#the score only looks at roots found with both a Type-A and a Type-B form
matchtypeA, matchtypeB = match_items(typeA_words, typeB_words)
typeA = sort_dict(matchtypeA)
typeB = sort_dict(matchtypeB)

#the two listings show every extracted root, not only the matched ones
print("List of Type-A roots and extenders", typeA_words)
print("List of Type-B roots and extenders", typeB_words)
print("Score = " , score(typeA, typeB))

Lists of Type-A roots and extenders {'ചെയ്': 'ത', 'പെയ്': 'ത', 'ഇര': 'ന്ന', 'ഉപദേശി': 'ച്ച', 'പറ': 'ന്ന', 'നൊ': 'ന്ത', 'വെ': 'ന്ത', 'ചുമ': 'ന്ന', 'അട': 'ഞ്ഞ'}
Lists of Type-B roots and extenders {'പറ': 'ക്ക', 'ചുമ': 'ക്ക', 'പെയ്': '0', 'ഉപദേശി': 'ക്ക', 'ഇര': 'ക്ക', 'ചെയ്': '0'}
Score =  2
