In [30]:
# header
import pandas as pd
import ReadLift
import MatchHeadwords
import AddIPAFlex
from datetime import datetime
from FindBib import read_date
from ast import literal_eval
from GenerateLexDir import literal_eval_col

In [4]:
# update file pointers in ReadLift for new export folder
in_file = 'Flexport_2_1/Flexport_new.lift'
out_file = 'Flexport_2_1/Flexicon_NewData.csv'
senses_file = 'Flexport_2_1/Senses_NewData.csv'
ReadLift.set_filenames(in_file, out_file, senses_file)

In [5]:
# call main (note tends to take ~20sec to execute)
ReadLift.main()

<function get_entries_df at 0x7f28982b1830> 6.996680974960327
<function get_these_vars at 0x7f28982b1a70> 1.7989861965179443
<function get_senses_df at 0x7f28982b1950> 4.092197418212891
<function get_these_vars at 0x7f28982b1a70> 1.4295411109924316
<function main at 0x7f28982afcb0> 14.350279092788696


In [6]:
# read Flexicon_NewData.csv
# drop all data that's earlier than recent field trip
flex_df = pd.read_csv(out_file, keep_default_na=False, index_col='entry_id')
cutoff = datetime(2019, 11, 3)
new_data = [read_date(t) > cutoff for t in flex_df['date_modified']]
new_data = flex_df[new_data]
del flex_df
new_data.shape

(382, 10)

In [7]:
# read cleaned data
flexicon = pd.read_csv('flexiconCLEAN.csv', keep_default_na=False, index_col='entry_id')
flexicon.shape

(1501, 10)

In [8]:
# find rows in common
in_flexicon = [i in flexicon.index for i in new_data.index]
new_data.loc[:, 'in_flexicon'] = in_flexicon
in_flexicon.count(True)

32

In [9]:
# read old data
old_flexicon = pd.read_csv('flexicon.csv', keep_default_na=False, index_col='entry_id')
old_flexicon.shape

(1759, 10)

In [10]:
# rows edited by Karol and found in old_flexicon
in_old = [i in old_flexicon.index for i in new_data.index]
new_data.loc[:, 'in_old'] = in_old
in_old.count(True)

65

In [11]:
# entries that Karol has edited but I dropped while cleaning
edited_but_dropped = [old and not new for old, new in zip(in_old, in_flexicon)]
new_data.loc[:, 'edited_but_dropped'] = edited_but_dropped
edited_but_dropped.count(True)

33

In [12]:
edited_but_dropped = new_data[new_data['edited_but_dropped']]
edited_but_dropped['headword'].values


array(['edʹuuk', 'erét', 'tsananaa', 'waa', 'naga hẽ', 'asok', 'hõm',
       'nuu me', 'ji', 'tä', 'hʹỹỹb', 'mäh', 'hẽnh', 'ra', 'wapad',
       'mahang', 'takʹëp', 'aha', 'naëënh', 'né hẽ', 'soo', 'hadoo', 'sa',
       'adyyk', 'is', 'kyyh', 'biin', 'manäh', 'kapỹỹj', 'ỹỹm', 'ahỹỹh',
       'asoop', 'jajé'], dtype=object)

In [13]:
# find any words in edited_but_dropped that might be copies of entries in flexiconCLEAN

flexicon.loc[:,'ipa'] = [AddIPAFlex.to_ipa(x, bib='eppsob') for x in flexicon['headword']]
for index, row in flexicon.copy().iterrows():
    if 'Predicted phonemic form from source' in row['note']:
        flexicon.loc[index, 'ipa'] = row['headword']

edited_but_dropped.loc[:,'ipa'] = [AddIPAFlex.to_ipa(x, bib='sil') for x in edited_but_dropped['headword']]
        
matches = MatchHeadwords.match_dfs(edited_but_dropped, flexicon)
matches.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


(33, 1)

In [14]:
# save output to local machine
matches.to_csv('new_data_matches.csv')
del matches

In [15]:
# now check for words that have been edited and are present in flexiconCLEAN
updated = new_data[ [new for new in in_flexicon] ]

In [16]:
# and for new entries (not found in either flexicon)
new_entries = new_data[ [not old for old in in_old] ]
assert True not in new_entries['in_flexicon']
new_entries.shape

(317, 13)

In [23]:
# I'll have to ask Karol what's going on with the entry <tim'>, since that seems to violate
# Nadeb orthography. For now, just deleting the apostrophe so that it doesn't trigger an
# error in IPA conversion.

idx = new_entries.index[new_entries['headword'] == "tim'"].tolist()
assert len(idx) == 1
new_entries.loc[idx, 'headword'] = 'tim'

In [24]:
# match new entries to flexiconCLEAN, in case of duplicates
new_entries.loc[:, 'ipa'] = [AddIPAFlex.to_ipa(x, bib='eppsob') for x in new_entries['headword']]

matches = MatchHeadwords.match_dfs(new_entries, flexicon)
matches.shape

(317, 1)

In [34]:
not_empty = [bool(x) for x in matches['matches']]
matches = matches[not_empty]
matches.shape

(198, 1)

In [35]:
# save output to local machine
matches.to_csv('new_entries_matches.csv')
del matches