Merge pull request #34 from JingL1014/readerfix

fix errors in reading actor dictionary
openeventdata · Nov 11, 2016 · b868127 · b868127
2 parents e779006 + bf2b264
commit b868127
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 30 deletions.
diff --git a/petrarch2/PETRreader.py b/petrarch2/PETRreader.py
@@ -1804,16 +1804,21 @@ def read_actor_dictionary(actorfile):
     current_acts = []
     datelist = []
     while len(line) > 0:
+        #print(line)
         if line[0] == '[':  # Date
             data = line[1:-1].split()
             code = data[0]
-            try:
-                if '-' in data[1]:
-                    dates = data[1].split('-')
-                else:
-                    dates = [data[1]]
-            except:
+            if len(data)==1:
                 dates = []
+            else:
+                try:
+                    datetemp = ("").join(data[1:])
+                    if '-' in datetemp:
+                        dates = datetemp.split('-')
+                    else:
+                        dates = [datetemp]
+                except:
+                    dates = []
             datelist.append((code, dates))
         else:
             if line[0] == '+':  # Synonym
@@ -1824,46 +1829,69 @@ def read_actor_dictionary(actorfile):
                         current_acts) > 0:  # store the root phrase if we're only to use it
                     datelist.append(current_acts[0])
                 for targ in current_acts:
-                    list = PETRglobals.ActorDict
+                    actordict = PETRglobals.ActorDict
                     while targ != []:
                         if targ[0] in [' ', '']:
                             targ = targ[1:]
                             continue
-                        if not isinstance(list, dict):
-                            print("BADNESS", list)
+                        if not isinstance(actordict, dict):
+                            print("BADNESS", actordict)
                             exit()
-                        list = list.setdefault(targ[0], {})
+
+                        if targ[0] not in actordict:
+                            actordict = actordict.setdefault(targ[0], {})
+                        else:
+                            actordict = actordict[targ[0]]
                         targ = targ[1:]
-                    list["#"] = datelist
+                    if "#" not in actordict:
+                        actordict["#"] = []
+
+                    actordict["#"].extend(datelist)
 
                 datelist = []  # reset for the new actor
                 current_acts = []
                 temp = line.split('\t')
                 if len(temp)==1:
-                    temp = line.split("  ")
-                if len(temp)>1:
-                    datestring = temp[1].strip().replace("\n","").split(']')
-                    for i in range(len(datestring)):
-                        if len(datestring[i])==0:
-                            continue
+                    temp = line.split()
 
-                        data = datestring[i][datestring[i].find('[')+1:].split()
-                        code = data[0].replace(']','')
+                if len(temp)==1:
+                    actortemp = line
+                    datetemp = ""
+                elif len(temp)==2:
+                    actortemp = temp[0]
+                    datetemp = temp[1]
+                elif len(temp)>2:
+                    if line.find('[')==-1:
+                        actortemp = line
+                        datetemp = ""
+                    else:
+                        actortemp = line[0:line.find('[')]
+                        datetemp = line[line.find('['):]
 
-                        try:
-                            date = data[1].replace(']','')
-                            if '-' in date:
-                                dates = date.split('-')
-                            else:
-                                dates = [date]
-                        except:
-                            dates = []
+                datestring = datetemp.strip().replace("\n","").split(']')
+                for i in range(len(datestring)):
+                    if len(datestring[i])==0:
+                        continue
+
+                    data = datestring[i][datestring[i].find('[')+1:].split()
+                    code = data[0].replace(']','')
+
+                    try:
+                        date = data[1].replace(']','')
+                        if '-' in date:
+                            dates = date.split('-')
+                        else:
+                            dates = [date]
+                    except:
+                        dates = []
+
+                    datelist.append((code, dates))
+
+                actor = actortemp.replace("_", ' ').split()
 
-                        datelist.append((code, dates))
 
-                #print(datelist) 
-                actor = temp[0].replace("_", ' ').split()
             current_acts.append(actor)
+
 
         line = read_FIN_line().strip()
 

diff --git a/petrarch2/tests/test_petrarch.py b/petrarch2/tests/test_petrarch.py
@@ -20,6 +20,34 @@ def test_version():
 def test_read():
     assert "RUSSIA" in PETRglobals.ActorDict
 
+def test_actorDict_read():
+    # actorDict1: "CROATIA" appears multiple times in the dictionary, so the codes from every occurrence must all be stored
+    actorDict1 = {u'#': [(u'YUGHRV', [u'<910625']), (u'HRVUNR', [u'911008', u'920115']), (u'HRV', [u'>920115']), (u'HRV', [])]}
+
+    # actorDict2: multiple codes given on a single dictionary line, e.g.:
+    # UFFE_ELLEMANN_JENSEN_ [IGOEUREEC 820701-821231][IGOEUREEC 870701-871231] # president of the CoEU
+    actorDict2 = {u'ELLEMANN': {u'JENSEN': {u'#': [(u'IGOEUREEC', [u'820701', u'821231']), (u'IGOEUREEC', [u'870701', u'871231'])]}}}
+
+    # actorDict3: an extra space inside the date restriction (see "[EGYGOV > 140608]" below), e.g.:
+    # +EL_SISI_
+    # [EGYMIL 770101-120812]
+    # [EGYGOVMIL 120812-140326]
+    # [EGYGOV > 140608]
+    # [EGYELI]
+    actorDict3 = {u'#': [(u'EGYMIL', [u'770101', u'120812']), (u'EGYGOVMIL', [u'120812', u'140326']), (u'EGYGOV', [u'>140608']), (u'EGYELI', [])]}
+
+    # actorDict4-6: the phrase and the code are separated by different whitespace characters
+    actorDict4 = {u'HARAM': {u'#': [(u'NGAREB', [])]}}  # one space
+    actorDict5 = {u'INC': {u'#': [(u'MNCUSA', [])]}}  # two spaces
+    actorDict6 = {u'#': [(u'KIR', [])]}  # one tab
+
+    assert PETRglobals.ActorDict['CROATIA'] == actorDict1
+    assert PETRglobals.ActorDict['UFFE'] == actorDict2
+    assert PETRglobals.ActorDict['EL']['SISI'] == actorDict3
+    assert PETRglobals.ActorDict['BOKO'] == actorDict4
+    assert PETRglobals.ActorDict['SOLARWINDS'] == actorDict5
+    assert PETRglobals.ActorDict['KIRIBATI'] == actorDict6
+
 
 ###################################
 #