Skip to content

Commit

Permalink
Merge pull request #34 from JingL1014/readerfix
Browse files Browse the repository at this point in the history
fix errors in reading actor dictionary
  • Loading branch information
johnb30 committed Nov 11, 2016
2 parents e779006 + bf2b264 commit b868127
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 30 deletions.
88 changes: 58 additions & 30 deletions petrarch2/PETRreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1804,16 +1804,21 @@ def read_actor_dictionary(actorfile):
current_acts = []
datelist = []
while len(line) > 0:
#print(line)
if line[0] == '[': # Date
data = line[1:-1].split()
code = data[0]
try:
if '-' in data[1]:
dates = data[1].split('-')
else:
dates = [data[1]]
except:
if len(data)==1:
dates = []
else:
try:
datetemp = ("").join(data[1:])
if '-' in datetemp:
dates = datetemp.split('-')
else:
dates = [datetemp]
except:
dates = []
datelist.append((code, dates))
else:
if line[0] == '+': # Synonym
Expand All @@ -1824,46 +1829,69 @@ def read_actor_dictionary(actorfile):
current_acts) > 0: # store the root phrase if we're only to use it
datelist.append(current_acts[0])
for targ in current_acts:
list = PETRglobals.ActorDict
actordict = PETRglobals.ActorDict
while targ != []:
if targ[0] in [' ', '']:
targ = targ[1:]
continue
if not isinstance(list, dict):
print("BADNESS", list)
if not isinstance(actordict, dict):
print("BADNESS", actordict)
exit()
list = list.setdefault(targ[0], {})

if targ[0] not in actordict:
actordict = actordict.setdefault(targ[0], {})
else:
actordict = actordict[targ[0]]
targ = targ[1:]
list["#"] = datelist
if "#" not in actordict:
actordict["#"] = []

actordict["#"].extend(datelist)

datelist = [] # reset for the new actor
current_acts = []
temp = line.split('\t')
if len(temp)==1:
temp = line.split(" ")
if len(temp)>1:
datestring = temp[1].strip().replace("\n","").split(']')
for i in range(len(datestring)):
if len(datestring[i])==0:
continue
temp = line.split()

data = datestring[i][datestring[i].find('[')+1:].split()
code = data[0].replace(']','')
if len(temp)==1:
actortemp = line
datetemp = ""
elif len(temp)==2:
actortemp = temp[0]
datetemp = temp[1]
elif len(temp)>2:
if line.find('[')==-1:
actortemp = line
datetemp = ""
else:
actortemp = line[0:line.find('[')]
datetemp = line[line.find('['):]

try:
date = data[1].replace(']','')
if '-' in date:
dates = date.split('-')
else:
dates = [date]
except:
dates = []
datestring = datetemp.strip().replace("\n","").split(']')
for i in range(len(datestring)):
if len(datestring[i])==0:
continue

data = datestring[i][datestring[i].find('[')+1:].split()
code = data[0].replace(']','')

try:
date = data[1].replace(']','')
if '-' in date:
dates = date.split('-')
else:
dates = [date]
except:
dates = []

datelist.append((code, dates))

actor = actortemp.replace("_", ' ').split()

datelist.append((code, dates))

#print(datelist)
actor = temp[0].replace("_", ' ').split()
current_acts.append(actor)


line = read_FIN_line().strip()

Expand Down
28 changes: 28 additions & 0 deletions petrarch2/tests/test_petrarch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,34 @@ def test_version():
def test_read():
    """Check that "RUSSIA" is present in PETRglobals.ActorDict."""
    actors = PETRglobals.ActorDict
    assert "RUSSIA" in actors

def test_actorDict_read():
    """Compare selected nodes of PETRglobals.ActorDict with expected values.

    Each case pairs the chain of keys leading to a node of the actor
    dictionary with the sub-dictionary expected at that node.
    """
    cases = [
        # "CROATIA" appears multiple times in the dictionary; all of its
        # codes should be stored.
        (('CROATIA',),
         {u'#': [(u'YUGHRV', [u'<910625']), (u'HRVUNR', [u'911008', u'920115']), (u'HRV', [u'>920115']), (u'HRV', [])]}),

        # Multiple codes on one line:
        # UFFE_ELLEMANN_JENSEN_ [IGOEUREEC 820701-821231][IGOEUREEC 870701-871231] # president of the CoEU
        (('UFFE',),
         {u'ELLEMANN': {u'JENSEN': {u'#': [(u'IGOEUREEC', [u'820701', u'821231']), (u'IGOEUREEC', [u'870701', u'871231'])]}}}),

        # Extra space inside the date:
        # +EL_SISI_
        # [EGYMIL 770101-120812]
        # [EGYGOVMIL 120812-140326]
        # [EGYGOV > 140608]
        # [EGYELI]
        (('EL', 'SISI'),
         {u'#': [(u'EGYMIL', [u'770101', u'120812']), (u'EGYGOVMIL', [u'120812', u'140326']), (u'EGYGOV', [u'>140608']), (u'EGYELI', [])]}),

        # Phrase and code separated by different whitespace characters.
        (('BOKO',), {u'HARAM': {u'#': [(u'NGAREB', [])]}}),      # one space
        (('SOLARWINDS',), {u'INC': {u'#': [(u'MNCUSA', [])]}}),  # two spaces
        (('KIRIBATI',), {u'#': [(u'KIR', [])]}),                 # one tab
    ]

    for key_path, expected in cases:
        # Walk down from the root of the actor dictionary one key at a time.
        node = PETRglobals.ActorDict
        for key in key_path:
            node = node[key]
        assert node == expected


###################################
#
Expand Down

0 comments on commit b868127

Please sign in to comment.