In [122]:
import nltk
import Elan2Str
import os
import re

# Functions:

## POS Tagger:

In [123]:
def pos_tag(sentence, dic):
    """Assign a part-of-speech tag to every token of a sentence.

    Parameters:
        sentence: iterable of token strings.
        dic: mapping from word form to a collection of candidate POS tags.

    Returns:
        A list holding one ``{'word': token, 'pos': tag}`` dict per token,
        in the original token order.
    """
    tagged = []
    for token in sentence:
        if token not in dic:
            # Out-of-lexicon tokens go to the rule-based guesser.
            label = pos_guess_unknown(token)
        else:
            # Materialise the candidates as a list so a single tag can be
            # taken out by position.
            candidates = list(dic[token])
            if len(candidates) == 1:
                label = candidates[0]
            else:
                # Several candidates: let the disambiguator pick one.
                label = pos_guess_fromset([token, candidates])
        tagged.append({'word': token, 'pos': label})
    return tagged

In [124]:
def pos_guess_fromset(word):
    """Pick one POS tag for a word that has several lexicon candidates.

    Parameters:
        word: a two-item sequence ``(form, candidate_tags)`` where
            ``candidate_tags`` is a non-empty collection of POS tag strings.

    Returns:
        The chosen tag. By default this is the candidate with the highest
        weight in the module-level ``freqdic`` table (candidates missing
        from the table weigh 0; on ties the first candidate wins). When
        'VERB' is among the candidates, verbal morphology or the form 'gat'
        overrides that default.
    """
    form, candidates = word[0], word[1]

    # Frequency-based default.
    tag = max(candidates, key=lambda value: freqdic.get(value, 0))

    if 'VERB' in candidates:
        if re.match(".+[eiu]m", form):
            # Transitive morphology (-em / -im / -um).
            tag = 'VERB'
        elif re.match("^(.*)(doan|aot|raon|bak|ap)$", form):
            # Directional suffix.
            tag = 'VERB'
        elif form == 'gat':
            # Exact comparison: ``form in ('gat')`` would be a substring
            # test against the string 'gat' and also accept 'a', 'at', ...
            tag = 'VERB'

    # Other morphological guesses are possible here.

    return tag

In [125]:
def pos_guess_unknown(word):
    """Guess a POS tag for a token that is not in the lexicon.

    The rules are tried in order and the first match wins:

    1. title-cased token               -> 'PROPN'
    2. token containing a digit        -> 'NUM'
    3. token without any letter        -> 'PUNCT'
    4. 'a', 'aa', ... or 'ah'          -> '<hes>' (hesitation)
    5. transitive or directional shape -> 'VERB'
    6. 'b.long' variants               -> 'ADP'
    7. anything else                   -> 'NOUN'

    Parameters:
        word: the token string.

    Returns:
        The guessed tag as a string.
    """
    if word.istitle():
        # Capitalised tokens are assumed to be proper nouns.
        return 'PROPN'
    if re.search("[0-9]", word):
        # Checked before punctuation so that pure numbers such as "123"
        # are not swallowed by the no-letters rule below.
        return 'NUM'
    if not any(ch.isalpha() for ch in word):
        # No letters at all (this includes the empty string).
        return 'PUNCT'
    if re.match('^a+$', word) or re.match('^ah$', word):
        # A run of a's, or "ah", is treated as a hesitation.
        return '<hes>'
    if re.match(".+[eiu]m", word):
        # Transitive morphology (-em / -im / -um).
        return 'VERB'
    if re.match("^(.*)(doan|aot|raon|bak|ap)$", word):
        # Directional suffix.
        return 'VERB'
    if re.match("b.long", word):
        # Common spelling variants of "blong".
        return 'ADP'
    return 'NOUN'

## Lemma Tagger:

In [126]:
def lemma_tag(sentence, dic):
    """Add a 'lemma' entry to every POS-tagged word of a sentence.

    Parameters:
        sentence: list of word dicts, each with at least 'word' and 'pos'.
        dic: lexicon; only membership of candidate stems is tested.

    Returns:
        The same list; every dict is updated in place with a 'lemma' key.

    Rules, applied in this order:
        * ADJ / NUM ending in "-fala": strip the suffix. If the stem plus
          'f' is a lexicon entry (X-fala vs Xf-ala allomorphy), that entry
          is the lemma.
        * ADJ / VERB made of one string repeated twice: keep a single copy.
        * VERB ending in a directional (doan, aot, raon, bak, ap): strip it
          when the remainder is a lexicon entry.
        * '<hes>' and 'PUNCT' always get the lemma '_'.
    """
    redup = re.compile(r"^(.*)\1$")                      # whole-word reduplication
    directional = re.compile(r"^(.*)(doan|aot|raon|bak|ap)$")

    for word in sentence:
        wordform = word['word']
        pos = word['pos']

        # ``pos == ('ADJ' or 'NUM')`` only ever compares against 'ADJ';
        # a membership test is what is meant.
        if pos in ('ADJ', 'NUM') and re.match(".+fala$", wordform):
            stem = wordform[:-4]
            if stem + 'f' in dic:
                wordform = stem + 'f'
            else:
                wordform = stem

        if pos in ('ADJ', 'VERB'):
            doubled = redup.match(wordform)
            if doubled:
                wordform = doubled.group(1)

        if pos == 'VERB':
            suffixed = directional.match(wordform)
            if suffixed and suffixed.group(1) in dic:
                wordform = suffixed.group(1)

        if pos in ('<hes>', 'PUNCT'):
            wordform = '_'

        word.update({'lemma': wordform})
    return sentence

## Rule Based Parser:

### nominal deps:

In [127]:
def nominal_dependency(sentence):
    """Annotate dependencies inside nominal phrases.

    Parameters:
        sentence: list of word dicts. Each needs a 'pos' entry and a 'head'
            entry, where the value '_' means "not annotated yet".

    Returns:
        The same list, with the word dicts updated in place.

    Head values written here are either the list index of the head plus
    one, or the placeholder string 'local_head', which marks a nominal head
    that still has to be attached.

    Steps:
        1. Head tagging: every pronoun, the last proper noun of a PROPN run
           and the last noun of a NOUN run become 'local_head'. Earlier
           proper nouns in a run are attached to the last one as 'flat'.
        2. A numeral and/or determiner directly after a pronoun is attached
           to that pronoun.
        3. Remaining ADJ / NUM / NOUN words are attached to the next noun
           head and DET to the preceding noun head (only when the sentence
           has a noun head). If no such head exists, the word's head is set
           to 'local_head'.
        4. Each ADP is attached as 'case' to the next nominal head, which
           also gets ``feats = 'loc'``.
    """
    pron_head_list = []
    noun_head_list = []
    nom_head_list = []
    last_index = len(sentence) - 1

    # --- 1. head tagger -------------------------------------------------
    for i, word in enumerate(sentence):
        pos = word['pos']
        next_pos = sentence[i + 1]['pos'] if i < last_index else None

        if pos == 'PRON':
            word.update({'head': 'local_head'})
            pron_head_list.append(i)
            nom_head_list.append(i)

        elif pos == 'PROPN':
            if next_pos == 'PROPN':
                # Inside a PROPN+PROPN* run: find the last name of the run
                # and attach this one to it.
                run_end = i + 1
                while run_end < last_index and sentence[run_end + 1]['pos'] == 'PROPN':
                    run_end += 1
                word.update({'head': run_end + 1, 'deprel': 'flat'})
            else:
                # Last (or only) proper noun of the run heads the phrase.
                word.update({'head': 'local_head'})
                nom_head_list.append(i)

        elif pos == 'NOUN':
            # In an N+N* sequence only the final noun is the head.
            if next_pos != 'NOUN':
                word.update({'head': 'local_head'})
                noun_head_list.append(i)
                nom_head_list.append(i)

    # --- 2. numeral / determiner right after a pronoun -------------------
    for head in pron_head_list:
        saw_numeral = False
        for offset, word in enumerate(sentence[head + 1:head + 3]):
            if word['head'] != '_':
                continue                     # already annotated
            if offset == 0:
                if word['pos'] == 'NUM':
                    word.update({'head': head + 1, 'deprel': 'nummod'})
                    saw_numeral = True
                elif word['pos'] == 'DET':
                    word.update({'head': head + 1, 'deprel': 'det'})
            elif saw_numeral and word['pos'] == 'DET':
                # PRON + NUM + DET: the determiner also belongs to the pronoun.
                word.update({'head': head + 1, 'deprel': 'det'})

    # --- 3. modifiers of noun heads --------------------------------------
    if noun_head_list:
        forward_relations = {'ADJ': 'amod', 'NUM': 'nummod', 'NOUN': 'nmod'}
        for i, word in enumerate(sentence):
            if word['head'] != '_':
                continue
            pos = word['pos']
            if pos in forward_relations:
                nexthead = min((k for k in noun_head_list if k > i), default=None)
                word.update({
                    'head': 'local_head' if nexthead is None else nexthead + 1,
                    'deprel': forward_relations[pos],
                })
            elif pos == 'DET':
                priorhead = max((k for k in noun_head_list if k < i), default=None)
                word.update({
                    'head': 'local_head' if priorhead is None else priorhead + 1,
                    'deprel': 'det',
                })

    # --- 4. adpositions ---------------------------------------------------
    if nom_head_list:
        for i, word in enumerate(sentence):
            if word['pos'] != 'ADP':
                continue
            nexthead = min((k for k in nom_head_list if k > i), default=None)
            if nexthead is not None:
                word.update({'head': nexthead + 1, 'deprel': 'case'})
                sentence[nexthead].update({'feats': 'loc'})

    return sentence


### Verbal deps

Attach auxiliaries (AUX), particles (PART) and adverbs (ADV) to the appropriate verb.


- Multifunction verbs
    - Lexical listing



- Complex verbs
    - V + V sequences



- clause final verbs and other speech act

Multifunctionality of any syntax related elements

aux - preverbal
save
sae
stap
kanduit
wantem
kam
go
mas
bin
jas
sud

post-verbal:
nating
finis (post verbal or after object)
yet 
gogo 
mo


In [128]:
def verbal_dependency(sentence):
    """Annotate verbs and the auxiliaries, particles and adverbs around them.

    Parameters:
        sentence: list of word dicts with at least 'word' and 'pos'.

    Returns:
        The same list, with the word dicts updated in place.

    Head values written here are either the list index of the head plus
    one, or the placeholder string 'verb_head', which marks a verbal head
    that still has to be attached.

    First pass (left to right):
        * A VERB directly after another VERB that is listed in ``svclist``
          becomes a 'compound:svc' dependent of the preceding verb. Every
          other VERB becomes a 'verb_head'.
        * A word from ``auxlist`` followed by a VERB is retagged AUX and
          attached to that verb. Otherwise it is treated as a verb head.
        * A word from ``postverbalmodifiers`` directly after a VERB is
          retagged AUX and attached to that preceding verb.

    Second pass:
        * 'i' / 'oli' (AUX or PART) followed by VERB/AUX/PART are attached
          to the next verb head; otherwise they become 'verb_head'.
        * Other AUX / PART words are attached to the next verb head, or to
          the last verb head when none follows.
        * ADV words are attached to the closest verb head.

    NOTE(review): when the sentence has no verb head at all, AUX/PART/ADV
    words are attached to word 1 (the ``default=0`` fallbacks below); this
    is kept as in the original and may need a better policy.
    """
    svclist = ['splitem', 'brekem', 'klinem', 'blokem', 'spolem', 'hipimap', 'fasem', 'flatem', 'finisim', 'panisim', 'meksave', 'haed', 'raf', 'stil', 'taet', 'redi']
    auxlist = ['save', 'sae', 'stap', 'kanduit', 'wantem', 'kam', 'go', 'mas', 'bin', 'jas', 'sud']
    postverbalmodifiers = ['nating', 'finis', 'yet', 'gogo', 'mo']

    verb_list = []            # indices of verb heads
    attached = set()          # post-verbal modifiers that already have their head
    last_index = len(sentence) - 1

    # ---- first pass: mark out verbs --------------------------------------
    for i, word in enumerate(sentence):
        # Guard i > 0: sentence[-1] would silently wrap to the last word.
        prev_is_verb = i > 0 and sentence[i - 1]['pos'] == 'VERB'

        if word['pos'] == 'VERB':
            if prev_is_verb and word['word'] in svclist:
                # Serial verb: depends on the preceding verb (index i-1, id i).
                word.update({'head': i, 'deprel': 'compound:svc'})
            else:
                verb_list.append(i)
                word.update({'head': 'verb_head'})

        if word['word'] in auxlist:                     # multifunction auxiliaries
            if i == last_index:
                # Nothing follows, so the word itself acts as the verb.
                if i not in verb_list:
                    verb_list.append(i)
                word.update({'head': 'verb_head'})
            elif sentence[i + 1]['pos'] == 'VERB':
                # An auxiliary is not a verb head: drop it from the list so
                # later attachments do not target it.
                if i in verb_list:
                    verb_list.remove(i)
                word.update({'pos': 'AUX', 'head': i + 2, 'deprel': 'aux'})
            else:
                word.update({'head': 'verb_head'})

        elif word['word'] in postverbalmodifiers and prev_is_verb:
            if i in verb_list:
                verb_list.remove(i)
            # Head is the preceding word (index i-1, id i).
            word.update({'pos': 'AUX', 'head': i, 'deprel': 'aux'})
            attached.add(i)

    # ---- second pass: attach auxiliaries, particles and adverbs ----------
    for i, word in enumerate(sentence):
        if i in attached:
            continue          # keep the preceding-verb attachment made above

        if word['pos'] in ('AUX', 'PART'):
            later_verbs = [k for k in verb_list if k > i]

            if word['word'] in ('i', 'oli'):
                nexthead = min(later_verbs) + 1 if later_verbs else 'verb_head'
                if i < last_index and sentence[i + 1]['pos'] in ('VERB', 'AUX', 'PART'):
                    word.update({'head': nexthead, 'deprel': 'aux'})
                else:
                    # No verbal material follows: the marker itself is the head.
                    word.update({'head': 'verb_head'})
            else:
                nexthead = min(later_verbs, default=max(verb_list, default=0)) + 1
                word.update({'head': nexthead, 'deprel': 'aux'})

        elif word['pos'] == 'ADV':
            # Closest verb head by distance (needs to do better, but a good start).
            closesthead = min(verb_list, key=lambda x: abs(x - i), default=0) + 1
            word.update({'head': closesthead, 'deprel': 'advmod'})

    return sentence

### head to head

local head to adposition head

adposition head to verbal head



In [129]:
def head_to_head_depedency(sentence):
    """Link the phrase heads to each other and pick the sentence root.

    Parameters:
        sentence: list of word dicts with 'word', 'pos', 'head', 'deprel'
            and 'feats' entries. Words whose 'head' is 'local_head' are
            treated as nominal heads, words whose 'head' is 'verb_head' as
            verbal heads.

    Returns:
        The same list, with the word dicts updated in place. Integer head
        values are the list index of the head plus one; the root word gets
        head 0 and deprel 'root'.

    Steps:
        1. A nominal head carrying 'loc' in 'feats' is attached to the
           preceding nominal head as 'nmod'.
        2. A head pronoun directly followed by ADJ / NOUN / NUM is attached
           to that following word as 'nmod:pos'.
        3. For every verbal head, the closest preceding nominal head becomes
           'nsubj'; for transitive-looking verbs and for 'i', the next
           nominal head becomes 'obj'.
        4. Relative clauses (sentences containing 'we').
        5. Adverbial clauses introduced by se / blong / long / blo / bl.
        6. Root selection, then every remaining placeholder is attached to
           the root.

    NOTE(review): steps 3 and 4 are kept as they were and look too broad.
    Step 3 uses the head lists computed before step 1, so one nominal can
    be re-assigned as subject of several verbs (the last one wins). Step 4
    fires for every verbal head as soon as 'we' occurs anywhere in the
    sentence and only inspects the first word of the span since the
    previous verbal head.
    """
    nominal_tags = ['ADJ', 'NOUN', 'NUM']

    def collect_heads():
        """Return (local, verbal, all) index lists from the current placeholders."""
        local, verbal, combined = [], [], []
        for position, entry in enumerate(sentence):
            if entry['head'] == 'local_head':
                local.append(position)
                combined.append(position)
            elif entry['head'] == 'verb_head':
                verbal.append(position)
                combined.append(position)
        return local, verbal, combined

    local_heads, verbal_heads, all_heads = collect_heads()

    # ---- 1. NPs marked by an adposition modify the prior NP --------------
    # (check this for verbs like "go")
    for head in local_heads:
        priorhead = max((k for k in local_heads if k < head), default=None)
        if priorhead is not None and 'loc' in sentence[head]['feats']:
            sentence[head].update({'head': priorhead + 1, 'deprel': 'nmod'})

    # ---- 2. pronoun directly before a nominal ----------------------------
    for head in all_heads:
        if sentence[head]['pos'] != 'PRON' or head + 1 >= len(sentence):
            continue
        if sentence[head + 1]['pos'] in nominal_tags:
            # The following word sits at index head+1, i.e. id head+2
            # (head+1 would be the pronoun's own id).
            sentence[head].update({'head': head + 2, 'deprel': 'nmod:pos'})

    # ---- 3. subjects of all verbs, objects of transitive verbs -----------
    if local_heads and verbal_heads:
        for head in verbal_heads:
            subject = max((k for k in local_heads if k < head), default=None)
            if subject is not None:
                sentence[subject].update({'head': head + 1, 'deprel': 'nsubj'})

            form = sentence[head]['word']
            # Transitivity is guessed from the -em/-im/-um morphology.
            looks_transitive = re.match("^(.+[eiu]m)(|doan$|aot$|raon$|bak$|ap$)", form)
            if looks_transitive or form == 'i':
                obj_index = min((k for k in local_heads if k > head), default=None)
                if obj_index is not None:
                    sentence[obj_index].update({'head': head + 1, 'deprel': 'obj'})

    # ---- 4. relative clauses ---------------------------------------------
    if any(entry['word'] == 'we' for entry in sentence):
        for n, head in enumerate(verbal_heads):
            start = verbal_heads[n - 1] if n else 0
            if start >= head:
                continue                  # nothing precedes this verb
            anchor = sentence[start]
            if anchor['deprel'] in ('local_head', 'nsubj', 'obj'):
                # Mark the clause as dependent on that NP.
                sentence[head]['head'] = start + 1
                sentence[head]['deprel'] = 'acl:relcl'
            else:
                # Otherwise treat it as a headless relative clause.
                sentence[head]['head'] = 'headless_relative'

    # ---- 5. adverbial clauses ---------------------------------------------
    if len(verbal_heads) > 1:
        marker = None
        for i, word in enumerate(sentence):
            if word['word'] in ('se', 'blong', 'long', 'blo', 'bl'):
                marker = i                # remember the most recent marker
            elif marker is not None and word['head'] == 'verb_head':
                sentence[marker]['head'] = i + 1
                sentence[marker]['deprel'] = 'mark'
                word['head'] = 'link_to_root'
                word['deprel'] = 'advcl'
                break

    # ---- 6. root selection -------------------------------------------------
    if not sentence:
        return sentence

    # The steps above resolved some placeholders, so recompute the heads.
    local_heads, verbal_heads, all_heads = collect_heads()

    if len(all_heads) == 1:
        # Just one possible head: it is the root.
        root_index = all_heads[0]
    elif verbal_heads:
        # The first verbal head is the root (could be better).
        root_index = verbal_heads[0]
    elif all_heads:
        # Several nominal heads and no verb: take the first one.
        root_index = all_heads[0]
    else:
        # No head left at all: prefer a clause placeholder, then any word
        # that is still unattached, then simply the first word.
        clause_heads = [i for i, entry in enumerate(sentence)
                        if entry['head'] in ('headless_relative', 'link_to_root')]
        unattached = [i for i, entry in enumerate(sentence)
                      if not isinstance(entry['head'], int)]
        root_index = (clause_heads or unattached or [0])[0]

    sentence[root_index].update({'head': 0, 'deprel': 'root'})
    root = root_index + 1

    # Attach every remaining placeholder to the root as an oblique or an
    # adverbial clause.
    for word in sentence:
        placeholder = word['head']
        if placeholder in ('local_head', 'headless_relative'):
            word['head'] = root
            word['deprel'] = 'obl'
        elif placeholder in ('verb_head', 'link_to_root'):
            word['head'] = root
            word['deprel'] = 'advcl'
        elif placeholder == '_':
            word['head'] = root
            word['deprel'] = 'adv'

    return sentence

In [130]:
def parse(sentence):
    """Run the three rule-based dependency passes over one sentence.

    Parameters:
        sentence: list of word dicts (as produced by the lemma tagger).

    Returns:
        The list returned by the last pass, after the nominal, verbal and
        head-to-head passes have run in that order.

    Before the passes run, every word that lacks a 'head', 'deprel' or
    'feats' entry gets one with the placeholder value '_'.
    """
    for token in sentence:
        for field in ('head', 'deprel', 'feats'):
            token.setdefault(field, '_')

    return head_to_head_depedency(verbal_dependency(nominal_dependency(sentence)))
    

## Code to run:

Load in lexicon:

In [131]:
# Lexicon: word form -> set of candidate POS tags.
bis_dictionary = {}

# The file is read as three comma-separated columns per line; the first is
# used as the word form and the second as a POS tag. The third column is
# not used. A line with a different number of columns still raises
# ValueError, so a malformed lexicon is noticed rather than skipped.
with open("../data/BIS_Dictionary_3col.replaced.csv", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue                      # tolerate blank lines
        (k, v1, _) = line.rstrip('\n').split(',')
        bis_dictionary.setdefault(k.strip(), set()).add(v1.strip())

# Weight per POS tag, used by pos_guess_fromset to rank competing candidate
# tags (the candidate with the highest weight wins).
freqdic = {'NOUN': 12, 'VERB' : 11, 'PRON' : 10, 'ADP' : 13, 'PART' : 13, 'ADV' : 4, 'ADJ' : 6, 'NUM': 5, 'CCONJ': 4, 'DET': 3, 'INTJ': 2, 'PROPN': 1, 'AUX': 14}

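Iterate over the files in the data directory, extract the text of each one, then tokenise, POS-tag, lemmatise and parse every sentence and write the result as CoNLL-U: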

In [132]:
# Driver: turn every file in ``datadir`` into CoNLL-U sentence blocks and
# write them all to 'Bislama_Tree.conllu'. Relies on pos_tag, lemma_tag,
# parse and bis_dictionary defined in the cells above.

datadir = '../data/BIS_20230512/'

dic = bis_dictionary

tree_parts = []                       # pieces of the output, joined once at the end

# os.listdir gives no guaranteed order; sorting keeps the output (and the
# sentence order in it) reproducible between runs.
for file in sorted(os.listdir(datadir)):
    filepath = os.path.join(datadir, file)
    text = Elan2Str.elan2str(filepath, 'default')
    sentences = nltk.sent_tokenize(text)
    for counter, sentence in enumerate(sentences, start=1):
        # Sentence-level comment lines; 'text' is the sentence before cleaning.
        tree_parts.append('# sent_id = ' + str(file) + '.' + str(counter) + '\n'
                          + '# text = ' + str(sentence) + '\n')

        # Remove tags marked with '%'. They are replaced by a space, not by
        # nothing, so that the words on either side are not glued together.
        sentence = re.sub(r"\s*\%.*?\%\s*", " ", sentence)
        sentence = sentence.strip('. ')                                  # removes final stops

        if not re.match(r"^\s*[\,\.\(\)\{\}\[\]]*\s*$", sentence):       # ignore empty / punctuation-only sentences
            token_sentence = nltk.word_tokenize(sentence)                # tokenise
            pos_tagged_sentence = pos_tag(token_sentence, dic)           # POS tag
            lemma_tagged_sentence = lemma_tag(pos_tagged_sentence, dic)  # lemmatise
            parsed_sentence = parse(lemma_tagged_sentence)               # dependency parser
            for word_counter, word in enumerate(parsed_sentence, start=1):
                # Ten tab-separated columns; positions 5, 6, 9 and 10 are
                # always written as '_'.
                tree_parts.append('\t'.join([
                    str(word_counter),
                    word['word'],
                    word['lemma'],
                    word['pos'],
                    '_',
                    '_',
                    str(word['head']),
                    word['deprel'],
                    '_',
                    '_',
                ]) + '\n')
        tree_parts.append('\n')           # blank line closes the sentence block

tree = ''.join(tree_parts)

with open('Bislama_Tree.conllu', 'w', encoding='utf-8') as output:
    output.write(tree)