In [1]:
# header
import pandas as pd
from GenerateLexDir import literal_eval_col
from FindBib import read_date

In [2]:
# load data
flexicon = pd.read_csv('FlexiconMERGE.csv', index_col='entry_id', keep_default_na=False)
merges = pd.read_csv('merge_matches.csv', index_col='entry_id', keep_default_na=False)

In [3]:
# take things literally
literal_eval_col(flexicon, 'note')
literal_eval_col(flexicon, 'other_sources')
literal_eval_col(flexicon, 'sense')
literal_eval_col(flexicon, 'these_vars')
literal_eval_col(flexicon, 'variant_of')

literal_eval_col(merges, 'matches')

In [4]:
# add a new column to merges to reflect exit status on entry pairs that couldn't be merge
merges.loc[:,'merge_error'] = ['']*len(merges)

In [5]:
# make a backup of flexicon
backup = flexicon.copy()

In [6]:
def get_bib(bib):
    bib = bib.lower()
    if "weir" in bib:
        return "Weir"
    
    elif "martins" in bib:
        return "Martins"
    
    elif "barbosa" in bib:
        return "Barbosa"
    
    elif "epps" in bib:
        if '18' in bib:
            return "Epps/Obert Fieldnotes 2018"
        elif '19' in bib:
            return "Epps/Obert Fieldnotes 2019"
        else:
            return None
    
    elif "sil" in bib:
        return "Sil Dict"
    
    else:
        return ''

In [7]:
# write method to compare entries and merge data automatically
# returns False to indicate that the entries could not be merged
def merge_entries(id1, id2):
    entry1 = flexicon.loc[id1]
    entry2 = flexicon.loc[id2]
    
    new_entry = dict()
    new_entry['other_sources'] = {}
    
    # bibliography & headword
    # probz most complicated
    bib1 = get_bib(entry1['bibliography'])
    bib2 = get_bib(entry2['bibliography'])
    
    if bib1 == bib2:
        new_entry['bibliography'] = bib1
        hdwd1 = entry1['headword'].strip()
        hdwd2 = entry2['headword'].strip()
        if hdwd1 == hdwd2:
            new_entry['headword'] = hdwd1
        else:    
            new_entry['headword'] = hdwd1 + ' %OR% ' + hdwd2
    elif 'Epps' in bib1:
        new_entry['bibliography'] = bib1
        new_entry['other_sources'][bib2] = entry2['headword']
        new_entry['headword'] = entry1['headword']
    elif 'Epps' in bib2:
        new_entry['bibliography'] = bib2
        new_entry['other_sources'][bib1] = entry1['headword']
        new_entry['headword'] = entry2['headword']
    else:
        return (False, 'Neither entry is from fieldnotes.')
    
    # date
    # prefer newer one
    date1 = entry1['date']
    date2 = entry2['date']
    new_entry['date'] = max(date1, date2)
    
    if entry1['date_modified']:
        new_entry['date_modified'] = entry1['date_modified']
    elif entry2['date_modified']:
        new_entry['date_modified'] = entry2['date_modified']
    else:
        new_entry['date_modified'] = new_entry['date']
        
    # motph_type
    # doesn't matter either
    new_entry['morph_type'] = entry1['morph_type']
    
    # note
    # try to merge keys into single dict
    # return False if have overlapping keys
    new_note = entry1['note'] if entry1['note'] else {}
    note2 = entry2['note'] if entry2['note'] else {}
    
    for k, v in note2.items():
        if k not in new_note:
            new_note[k] = v
        elif 'Predicted phonemic' in new_note[k]:
            new_note[k] = v
        elif 'Predicted phonemic' in v:
            pass
        elif new_note[k] == v:
            pass
        else:
            return (False, 'Note field has conflicting data.')
    new_entry['note'] = new_note
        
    # other_sources
    # ditto
    new_srcs = entry1['other_sources'] if entry1['other_sources'] else {}
    srcs2 = entry2['other_sources'] if entry2['other_sources'] else {}
    
    for k, v in srcs2.items():
        if k not in new_srcs:
            new_srcs[k] = v
        elif new_srcs[k] == v:
            pass
        else:
            return (False, 'other_sources field has conflicting data.')
    new_entry['other_sources'] = new_srcs
        
    # these_vars
    # same spiel
    new_vars = entry1['these_vars'] if entry1['these_vars'] else {}
    vars2 = entry2['these_vars'] if entry2['these_vars'] else {}
    
    for k, v in vars2.items():
        if k not in new_vars:
            new_vars[k] = v
        elif new_vars[k] == v:
            pass
        else:
            return (False, 'these_vars field has conflicting data.')
    new_entry['these_vars'] = new_vars
    
    # variant_of
    # keep on keepin' on
    new_varf = entry1['variant_of'] if entry1['variant_of'] else {}
    varf2 = entry2['variant_of'] if entry2['variant_of'] else {}
    
    for k, v in varf2.items():
        if k not in new_note:
            new_varf[k] = v
        elif new_varf[k] == v:
            pass
        else:
            return (False, 'variant_of field has conflicting data.')
        
    # sense
    # union of both lists
    new_entry['sense'] = entry1['sense']+entry2['sense']
    
    # pronunciation
    # preserve both if need be
    # I plan on cleaning pronunciation later
    pronc1 = entry1['pronunciation']
    pronc2 = entry2['pronunciation']
    if pronc1 and pronc2:
        new_entry['pronunciation'] = pronc1 + ' %OR% ' + pronc2
    elif pronc1:
        new_entry['pronunciation'] = pronc1
    elif pronc2:
        new_entry['pronunciation'] = pronc2
    else:
        new_entry['pronunciation'] = ''
        
    return new_entry

In [8]:
# use merge_entries function to merge entries
# (i'm sorry if you expected something else)

# start by making df w/ all merges that can be merged w/o extra steps
# (indicated by a note in the status col)
merges_to_merge = [x=='merge' for x in merges['status']]
merges_to_merge = merges[merges_to_merge]

for index, row in merges_to_merge.iterrows():
    result = None
    for id2 in row['matches'].keys():
        result = merge_entries(index, id2)
        if type(result) is tuple:
            break
        else:
            entry1 = result
    # take advantage of python's weird for-else syntax
    else:
        # success case
        flexicon.loc[index] = entry1
        for id in row['matches'].keys():
            flexicon = flexicon.drop(id)
        merges = merges.drop(index)
        continue
    # failure case
    merges.at[index, 'merge_error'] = result[1]

AttributeError: 'Series' object has no attribute 'lower'

In [None]:

merges[merges['merge_error'] != '']

In [None]:
print(backup.shape)
print(flexicon.shape)

In [None]:
backup.to_csv('FlexiconMERGE-OLD.csv')
flexicon.to_csv('FlexiconMERGE.csv')
merges.to_csv('merge_matchesREMAINING.csv')