In [1]:
# Make the notebook's `.container` element span the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Annotation to JSON 

In [13]:
import re
# Patterns for the annotation line kinds handled in this notebook.
# Raw strings are used so that `\s`, `\*` etc. reach the regex engine as regex
# escapes instead of being (invalid) Python string escapes.

# Entity: variable, type, two integers, then one or more whitespace-separated
# words made of letters and quote characters, up to the end of the line.
regexp_entity = re.compile(r'(?P<variable>[A-Z][0-9]+)\s+(?P<type>[A-Z][A-Za-z_]*)\s+([0-9]+)\s+([0-9]+)(?P<name>(\s+[A-Za-z\'\"]+)+$)')
# Attribute: variable, type, target variable, alphabetic value.
regexp_attribute = re.compile(r'(?P<variable>[A-Z][0-9]+)\s+(?P<type>[A-Z][A-Za-z_]*)\s+(?P<target_variable>[A-Z][0-9]+)\s+(?P<value>[A-Za-z]+)')
# Relation: variable followed by one or more "Role:Target" pairs. The target
# accepts any number of digits so references such as T10 are not truncated.
regexp_relation = re.compile(r'(?P<variable>[A-Z][0-9]+)\s+(([A-Z][A-Za-z_0-9]*:[A-Z][0-9]+\s*)+)')

# Transitive: a literal `*`, a type, and two variable references.
regexp_transitive = re.compile(r'(?P<arity>\*)\s+(?P<type>[A-Z][A-Za-z_]*)\s+(?P<refersTo1>[A-Z][0-9]+)\s+(?P<refersTo2>[A-Z][0-9]+)\s*')

In [14]:
# Story text, one clause per line, stored as a raw triple-quoted string.
# NOTE(review): the integer pairs in the annotation cell look like character
# offsets into this string (e.g. 0-18 covers "patty the milkmaid") — confirm.
text = r"""patty the milkmaid was going to market carrying her milk in a pail on her head
as she went along she began calculating what she would do with the money she would get for the milk
i 'll buy some fowls from farmer brown
said she
and they will lay eggs each morning which i will sell to the parson 's wife
with the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat
and when i go to market wo n't all the young men come up and speak to me
polly shaw will be that jealous
but i do n't care
i shall just look at her and toss my head like this
as she spoke she tossed her head back the pail fell off it and all the milk was spilt
so she had to go home and tell her mother what had occurred
ah my child
"""

# Annotation records, one per line, with a tab after the record id.
# Line kinds as consumed by the regexes in this notebook:
#   T<n>  entity:    type, two integers, covered text
#   A<n>  attribute: type, target variable, value
#   E<n>  relation:  space-separated Role:Variable pairs
# NOTE(review): the layout resembles brat standoff annotation — confirm the source tool.
annotation = r"""T1	Character 0 18	patty the milkmaid
A1	Gender T1 Female
T2	Says 218 222	said
E1	Says:T2 WHAT:T3 WHO:T1 WHAT2:T4 WHAT3:T5 WHAT4:T6 WHAT5:T7 WHAT6:T8 WHAT7:T9
T3	Character_Line 179 217	i 'll buy some fowls from farmer brown
T4	Character_Line 227 302	and they will lay eggs each morning which i will sell to the parson 's wife
T5	Character_Line 303 407	with the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat
T6	Character_Line 408 480	and when i go to market wo n't all the young men come up and speak to me
T7	Character_Line 481 512	polly shaw will be that jealous
T8	Character_Line 513 530	but i do n't care
T9	Character_Line 531 582	i shall just look at her and toss my head like this"""

# Echo the raw annotation so the records can be checked by eye.
print(annotation)

T1	Character 0 18	patty the milkmaid
A1	Gender T1 Female
T2	Says 218 222	said
E1	Says:T2 WHAT:T3 WHO:T1 WHAT2:T4 WHAT3:T5 WHAT4:T6 WHAT5:T7 WHAT6:T8 WHAT7:T9
T3	Character_Line 179 217	i 'll buy some fowls from farmer brown
T4	Character_Line 227 302	and they will lay eggs each morning which i will sell to the parson 's wife
T5	Character_Line 303 407	with the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat
T6	Character_Line 408 480	and when i go to market wo n't all the young men come up and speak to me
T7	Character_Line 481 512	polly shaw will be that jealous
T8	Character_Line 513 530	but i do n't care
T9	Character_Line 531 582	i shall just look at her and toss my head like this


In [15]:
# One annotation record per newline-separated line; shown as the cell result.
tups = annotation.split(sep='\n')
tups

['T1\tCharacter 0 18\tpatty the milkmaid',
 'A1\tGender T1 Female',
 'T2\tSays 218 222\tsaid',
 'E1\tSays:T2 WHAT:T3 WHO:T1 WHAT2:T4 WHAT3:T5 WHAT4:T6 WHAT5:T7 WHAT6:T8 WHAT7:T9',
 "T3\tCharacter_Line 179 217\ti 'll buy some fowls from farmer brown",
 "T4\tCharacter_Line 227 302\tand they will lay eggs each morning which i will sell to the parson 's wife",
 "T5\tCharacter_Line 303 407\twith the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat",
 "T6\tCharacter_Line 408 480\tand when i go to market wo n't all the young men come up and speak to me",
 'T7\tCharacter_Line 481 512\tpolly shaw will be that jealous',
 "T8\tCharacter_Line 513 530\tbut i do n't care",
 'T9\tCharacter_Line 531 582\ti shall just look at her and toss my head like this']

In [26]:
class Character_Line:
    """A line of text taken from an entity annotation, optionally tied to a speaker.

    Attributes:
        varname: annotation variable of the line (e.g. ``"T3"``).
        txt: text of the line.
        character: speaker object exposing a ``name`` attribute, or ``None``
            until `set_character` is called.
        pos: ``(pos_start, pos_end)`` tuple.
    """

    def __init__(self, varname, txt, pos_start, pos_end):
        self.varname = varname
        self.txt = txt
        self.character = None
        self.pos = (pos_start, pos_end)

    def set_character(self, character):
        """Attach the speaker object (or ``None`` to clear it)."""
        self.character = character

    def set_pos(self, start, end):
        """Replace the stored ``(start, end)`` position."""
        self.pos = (start, end)

    def _speaker_name(self):
        """Return the speaker's name, or ``None`` when no speaker is attached."""
        # `character` starts as None, so guard instead of raising AttributeError.
        return None if self.character is None else self.character.name

    def __str__(self):
        return "{}: {}".format(self._speaker_name(), self.txt)

    def to_json(self):
        """Return a JSON-ready dict; ``name`` is ``None`` when no speaker is attached."""
        return {"type":"cast_line", "name":self._speaker_name(), "line":self.txt}
    
class Character:
    """A cast member parsed from an entity annotation.

    Attributes:
        varname: annotation variable of the character (e.g. ``"T1"``).
        name: character name.
        gender: gender string, ``''`` while unset.
        age: age string, ``''`` while unset.
    """

    def __init__(self, varname, name):
        self.name = name
        self.varname = varname
        self.gender = ''
        self.age = ''

    def set_gender(self, gender):
        """Store the character's gender."""
        self.gender = gender

    def set_age(self, age):
        """Store the character's age."""
        self.age = age

    def __str__(self):
        # Delegate to __repr__ via the built-in; the bare name `__repr__` is not
        # defined at module level and raised NameError.
        return repr(self)

    def __repr__(self):
        return "[{}:{}/{}/{}]".format(self.varname, self.name, self.gender, self.age)

    def to_json(self):
        """Return a JSON-ready dict; ``gender`` and ``age`` are included only when set."""
        json_dict = {"type":"cast_definition", "name":self.name}
        if self.gender != '':
            json_dict['gender'] = self.gender
        if self.age != '':
            json_dict['age'] = self.age

        return json_dict
    

In [27]:
# Module-level registries (two independent lists), reset each time this cell runs.
characters, character_lines = [], []


def find_character(var):
    """Return the first entry of `characters` whose ``varname`` equals `var`, else ``None``."""
    return next(
        (candidate for candidate in characters if candidate.varname == var),
        None,
    )
def find_character_lines(var):
    """Return the first entry of `character_lines` whose ``varname`` equals `var`, else ``None``."""
    hits = (entry for entry in character_lines if entry.varname == var)
    return next(hits, None)
        
# Pass 1: build Character / Character_Line objects from entity records.
for tup in tups:
    matches = regexp_entity.findall(tup)
    print(matches)
    if not matches:
        continue

    # findall yields one tuple per match: (variable, type, start, end, name,
    # last repetition of the inner name group); only the first five are used.
    variable, kind, start, end, label = matches[0][:5]
    variable, label = variable.strip(), label.strip()

    if kind == 'Character':
        characters.append(Character(variable, label))
    elif kind == 'Character_Line':
        character_lines.append(
            Character_Line(variable, label, int(start), int(end))
        )
# Pass 2: apply Gender / Age attribute records to the character they target.
attribute_setters = {'Gender': 'set_gender', 'Age': 'set_age'}
for tup in tups:
    matches = regexp_attribute.findall(tup)
    if not matches:
        continue

    # Tuple layout: (variable, type, target_variable, value).
    _, kind, target_var, value = matches[0]
    setter_name = attribute_setters.get(kind)
    if setter_name is None:
        # Other attribute types are ignored.
        continue

    target = find_character(target_var.strip())
    getattr(target, setter_name)(value.strip().lower())
            
# Pass 3: resolve "Says" relations and attach the speaker to each quoted line.
for tup in tups:
    groups = regexp_relation.findall(tup)
    # groups[0][1] is the whole "Role:Var Role:Var ..." list; the first pair
    # names the relation itself, so only lists starting with "Says" are handled.
    if not groups or not groups[0][1].startswith('Says'):
        continue
    refs = groups[0][1].split()[1:]

    # Store who and whats
    whats = []
    who = None

    for ref in refs:
        type_, var = ref.split(':')
        if type_.startswith('WHAT'):
            whats.append(var)
        elif type_.startswith('WHO'):
            who = find_character(var)

    # Find character lines. A WHAT reference that does not resolve to a known
    # Character_Line gives None and is dropped here; previously it crashed
    # with AttributeError on `None.set_character`.
    clines = [cl for cl in map(find_character_lines, whats) if cl is not None]

    # Assign characters
    for cl in clines:
        cl.set_character(who)

[('T1', 'Character', '0', '18', '\tpatty the milkmaid', ' milkmaid')]
[]
[('T2', 'Says', '218', '222', '\tsaid', '\tsaid')]
[]
[('T3', 'Character_Line', '179', '217', "\ti 'll buy some fowls from farmer brown", ' brown')]
[('T4', 'Character_Line', '227', '302', "\tand they will lay eggs each morning which i will sell to the parson 's wife", ' wife')]
[('T5', 'Character_Line', '303', '407', "\twith the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat", ' hat')]
[('T6', 'Character_Line', '408', '480', "\tand when i go to market wo n't all the young men come up and speak to me", ' me')]
[('T7', 'Character_Line', '481', '512', '\tpolly shaw will be that jealous', ' jealous')]
[('T8', 'Character_Line', '513', '530', "\tbut i do n't care", ' care')]
[('T9', 'Character_Line', '531', '582', '\ti shall just look at her and toss my head like this', ' this')]


In [7]:
# Inspect the WHAT targets left over from the last "Says" relation processed
# by the relation loop (the variable is a leftover of that loop).
whats

['T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9']

In [8]:
# Attribute pattern: variable, type, target variable, alphabetic value.
# Same pattern as the definition near the top of the notebook; written as a raw
# string so `\s` is a regex escape rather than an invalid Python string escape.
regexp_attribute = re.compile(r'(?P<variable>[A-Z][0-9]+)\s+(?P<type>[A-Z][A-Za-z_]*)\s+(?P<target_variable>[A-Z][0-9]+)\s+(?P<value>[A-Za-z]+)')


In [21]:
# Print the JSON-ready dict of every parsed character.
for character in characters:
    character_json = character.to_json()
    print(character_json)


{'type': 'cast_definition', 'name': 'patty the milkmaid', 'gender': 'female'}


In [22]:
# Print the JSON-ready dict of every parsed character line.
for line in character_lines:
    line_json = line.to_json()
    print(line_json)

{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': "i 'll buy some fowls from farmer brown"}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': "and they will lay eggs each morning which i will sell to the parson 's wife"}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': "with the money that i get from the sale of these eggs i 'll buy myself a new dimity frock and a chip hat"}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': "and when i go to market wo n't all the young men come up and speak to me"}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': 'polly shaw will be that jealous'}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': "but i do n't care"}
{'type': 'cast_line', 'name': 'patty the milkmaid', 'line': 'i shall just look at her and toss my head like this'}


In [11]:
# Stand-alone check of the relation pattern against the sample "Says" record.
# Raw string: `\s` is a regex escape, not an invalid Python string escape.
# `[0-9]+` on the target lets references carry more than one digit (e.g. T10),
# which the single-digit form silently truncated.
regexp_relation = re.compile(r'(?P<variable>[A-Z][0-9]+)\s+(([A-Z][A-Za-z_0-9]*:[A-Z][0-9]+\s*)+)')
regexp_relation.findall("E1	Says:T2 WHAT:T3 WHO:T1 WHAT2:T4 WHAT3:T5 WHAT4:T6 WHAT5:T7 WHAT6:T8 WHAT7:T9")

[('E1',
  'Says:T2 WHAT:T3 WHO:T1 WHAT2:T4 WHAT3:T5 WHAT4:T6 WHAT5:T7 WHAT6:T8 WHAT7:T9',
  'WHAT7:T9')]