In [48]:
# find all entries spread across FlexiconCLEAN and Flexicon_NewData that are uncertain
# entries originating from texts (these are notoriously messy and contain a lot of
# duplicates and redundancies)
# remove from FlexiconCLEAN.csv, new_data_matches.CSV, new_entries_matchesNEW.csv, and
# Flexicon_NewData.csv

# header
import pandas as pd
from GenerateLexDir import literal_eval_col

In [49]:
# initialize dfs
flexicon = pd.read_csv('flexiconCLEAN.csv', index_col='entry_id', keep_default_na=False)
new_data = pd.read_csv('Flexport_2_1/Flexicon_NewData.csv', index_col='entry_id', keep_default_na=False)
new_matches  = pd.read_csv('new_entries_matchesNEW.csv', index_col='entry_id', keep_default_na=False)
edit_matches = pd.read_csv('new_data_matches.csv', index_col='entry_id', keep_default_na=False)
senses = pd.read_csv('flex_senses.csv', index_col='sense_id', keep_default_na=False)
new_senses = pd.read_csv('Flexport_2_1/Senses_NewData.csv', index_col='sense_id', keep_default_na=False)

In [50]:
# find cartilha entries
# should be easy, as all entries will have 'cartilha' or 'lição/lições' somewhere in the
# note or variant column
# create list of entry_ids so we can remove it from all df's easily

# don't call literal_eval yet so we can easily search thru row data as strings
cartilha_idcs = set()
cartilha_df = pd.DataFrame(columns = ['headword', 'morph_type', 'pronunciation',\
'variant_of', 'these_vars', 'note', 'sense', 'date', 'date_modified', 'other_sources'])

for index, row in flexicon.copy().iterrows():
    if 'cartilha' in row['note'] or 'liç' in row['note']\
    or 'cartilha' in row['variant_of'] or 'liç' in row['variant_of']:
        cartilha_idcs.add(index)
        flexicon = flexicon.drop(index)
        row['date_modified'] = row['date']
        cartilha_df.loc[index] = row

for index, row in new_data.iterrows():
    if 'cartilha' in row['note'] or 'liç' in row['note']\
    or 'cartilha' in row['variant_of'] or 'liç' in row['variant_of']:
        cartilha_idcs.add(index)
        new_data = new_data.drop(index)
        row['other_sources'] = {}
        cartilha_df.loc[index] = row

len(cartilha_idcs)

123

In [51]:
# next step is to identify entries Emily identified from texts
# entries w/ a question mark in the PoS (or no PoS) are one
no_pos = [(not x) or ('?' in x) for x in new_senses['pos']]

# ditto for gloss and definition
# however, since often only one of gloss and definition
# fields contain data, concatenating both strs before
# checking

def_and_gloss = [d+g for d, g in zip(new_senses['gloss'], new_senses['def'])]
no_def = [(not x) or ('?' in x) for x in def_and_gloss]

text_senses = [x or y for x, y in zip(no_pos, no_def)]
text_senses = new_senses.index[text_senses]
text_senses

Index(['6e8e07f3-28a9-4adb-b230-f819c944b12a',
       '1a7f5cab-b082-42eb-8cac-7781db2f1151',
       '9e409799-d500-44e1-97b4-66925497edde',
       '3954f8c6-283b-4e9f-a9b1-5e9ea6096fcc',
       'a8675526-039d-459c-b4c8-c3434645200d',
       '76dbc6d9-b69a-428f-8404-501dd5aa7637',
       'baa0255c-d216-40c6-9314-c2b8ae598c65',
       '9590b51f-1870-494d-872b-0b8ee8932571',
       '93710c4c-822f-4475-9c65-0f1886aa7db0',
       'b76b93ca-1032-4a40-9a95-1031ce3ca8d9',
       ...
       '9efb291e-c7ac-420a-8553-1d28627f81ec',
       '9459cf68-19cc-45a7-8abd-b9d5d0d37887',
       '3cf869fa-9f12-4d60-af32-0530c32fce8a',
       '1ea498e1-3443-461b-aa55-d2d5b01ba48f',
       '268369b7-3c64-46c9-8966-1b427339bd10',
       'acd92b03-d069-4843-ae87-ad2288915bce',
       '4713b0d1-e327-4b22-a41b-7db7430fe70e',
       'b0db2efd-e199-42a1-969e-94d3131b1479',
       'cb2b1a3f-9d04-468c-a497-8395bd836a75',
       '34fe7ac1-e02e-46ae-b874-ca2c7cf8a244'],
      dtype='object', name='sense_id', length=32

In [52]:
# find entries in new_data originating from texts
# using text_senses to identify them
text_entries = [any(t_s in x for t_s in text_senses) for x in new_data['sense']]
keep = [not x for x in text_entries]
text_entries = new_data[text_entries]
new_data = new_data[keep]

In [53]:
# cover our steps- in theory, the tests we used to find text data from the recent import
# should return no results for flexiconCLEAN - since we already segmented out Cartilha
# data
# let's make sure
no_pos = [(not x) or ('?' in x) for x in senses['pos']]
no_pos.count(True)

def_and_gloss = [d+g for d, g in zip(senses['gloss'], senses['def'])]
no_def = [(not x) or ('?' in x) for x in def_and_gloss]
            
text_senses = [x or y for x, y in zip(no_pos, no_def)]
text_senses = senses.index[text_senses]
len(text_senses)

193

In [54]:
flexicon_text_entries = [any(t_s in x for t_s in text_senses) for x in flexicon['sense']]
keep = [not x for x in flexicon_text_entries]
flexicon_text_entries = flexicon[flexicon_text_entries]
flexicon = flexicon[keep]
flexicon_text_entries.shape

(169, 10)

In [55]:
# well, that didn't go as expected.
# might as well aggregate all three df's into one
# and write to csv

text_entries.loc[:,'other_sources'] = [{} for i in text_entries.iterrows()]
flexicon_text_entries.loc[:, 'date_modified'] = flexicon_text_entries['date']

all_text_entries = pd.concat( [cartilha_df, flexicon_text_entries, text_entries] )
all_text_entries.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


(534, 11)

In [57]:
all_text_entries.to_csv('all_text_entries.csv')
new_data.to_csv('Flexport_2_1/Flexicon_NewDataPART.csv')
flexicon.to_csv('flexiconPART.csv')