# Conversion of the Atlas of Pidgin and Creole Languages language examples to Ligt model
## As submitted to LDK-2021
Data is taken from: https://github.com/cldf-datasets/apics/releases , label `v2013` (latest)

In order for this notebook to run properly, `data/glottolog_language_n3.gz` and `data/cldf.zip` must be unzipped

In [1]:
%cd './data/'

/Users/max/Projects/LiODi/ligt/stable/apics/data


In [2]:
# !pip install pycldf

## Parsing APiCS

In [3]:
import pycldf
import csv
import re

In [4]:
apics_dataset = pycldf.Wordlist.from_metadata('StructureDataset-metadata.json')

A sample data item from the `LanguageTable` collection:

In [5]:
next(iter(apics_dataset['LanguageTable']))

OrderedDict([('ID', '1'),
             ('Name', 'Early Sranan'),
             ('Macroarea', None),
             ('Latitude', Decimal('5.833333')),
             ('Longitude', Decimal('-55.6')),
             ('Glottocode', 'sran1240'),
             ('ISO639P3code', None),
             ('Description',
              'Over the years multiple historical documents in and on the English-base creole language of Suriname known as Sranan or Sranantongo have been uncovered, resulting in a substantial digitized corpus of eighteenth-century texts. These texts, stored in the Suriname Creole Archive at the Radboud University, Nijmegen, provide a unique window on the Sranan language as it was spoken in the eighteenth century, that is, at earlier stages of its development. In several historical sources phonological, grammatical, semantic, and pragmatic differences between varieties of the creole language are acknowledged, and some varieties appear to have been so different that they are known under dist

In [6]:
languages = {lang['ID']: (lang['Name'], lang['Glottocode'], lang['ISO639P3code']) for lang in apics_dataset['LanguageTable']}

In [7]:
languages['1']

('Early Sranan', 'sran1240', None)

A sample example sentence:

In [8]:
next(iter(apics_dataset['ExampleTable']))

OrderedDict([('ID', '1-1'),
             ('Language_ID', '1'),
             ('Primary_Text', 'Isredeh mi kau bringi wan mannpikin.'),
             ('Analyzed_Word',
              ['Isrede', 'mi', 'kau', 'bringi', 'wan', 'manpikin.']),
             ('Gloss',
              ['yesterday', '1SG', 'cow', 'deliver', 'a', 'male.young']),
             ('Translated_Text', 'Yesterday my cow delivered a bull calf.'),
             ('Meta_Language_ID', None),
             ('Comment', None),
             ('Source', ['1357[22]']),
             ('Audio', None),
             ('Type', 'written (dictionary)'),
             ('markup_text', 'Isredeh mi kau bringi wan mannpikin.'),
             ('markup_analyzed', 'Isrede mi kau bringi wan manpikin.'),
             ('markup_gloss', 'yesterday 1SG cow deliver a male.young'),
             ('markup_comment', None),
             ('source_comment', None),
             ('original_script', None),
             ('sort', '1'),
             ('alt_translation',
        

In [9]:
def align_glosses(morphs, glosses, example):
    if len(morphs) != len(glosses):
        print(morphs, glosses, example)
    return list(zip(morphs, glosses))

There are some examples that don't have gloss/word splitting, we skip them for now

In [10]:
sent_not_split = [example for example in apics_dataset['ExampleTable'] if len(example['Analyzed_Word']) == 1 and ' ' in example['Analyzed_Word'][0]]

print(len(sent_not_split))
sent_not_split

58


[OrderedDict([('ID', '75-8'),
              ('Language_ID', '75'),
              ('Primary_Text', 'la fiy opaapaawa'),
              ('Analyzed_Word', ['la fiy o-paapaa-wa']),
              ('Gloss', ['the.F (F) girl (F) 3.POSS-father-OBV (C)']),
              ('Translated_Text', "the girl's father"),
              ('Meta_Language_ID', None),
              ('Comment', 'F = from French; C = from Cree'),
              ('Source', []),
              ('Audio', None),
              ('Type', 'constructed by linguist'),
              ('markup_text', 'la fiy opaapaawa'),
              ('markup_analyzed', 'la fiy o-paapaa-wa'),
              ('markup_gloss', 'the.F (F) girl (F) 3.POSS-father-OBV (C)'),
              ('markup_comment', 'F = from French; C = from Cree'),
              ('source_comment', 'Own knowledge'),
              ('original_script', None),
              ('sort', '384'),
              ('alt_translation', None)]),
 OrderedDict([('ID', '21-12'),
              ('Language_ID', '21

In [11]:
sent_uri_template = 'http://apics-online.info/sentences/{ID}'

examples = [{'id': example['ID'], 
             'orig_id': sent_uri_template.format(ID=example['ID']), 
             'baseline': example['Primary_Text'],
             'glosses': align_glosses(example['Analyzed_Word'], example['Gloss'], example),
             'translation': example['Translated_Text'],
             'language': languages[example['Language_ID']],
             'meta_language': example['Meta_Language_ID'],
             'comment': example['Comment']
            } for example in apics_dataset['ExampleTable'] if not (len(example['Analyzed_Word']) == 1 and ' ' in example['Analyzed_Word'][0])]

In [12]:
len(examples)

18468

In [13]:
for item in examples[:20]:
    print (list(item['glosses']))

[('Isrede', 'yesterday'), ('mi', '1SG'), ('kau', 'cow'), ('bringi', 'deliver'), ('wan', 'a'), ('manpikin.', 'male.young')]
[('Da', 'DET.SG'), ('masra', 'master'), ('teki', 'take'), ('mi', '1SG'), ('wefi', 'wife'), ('na', 'in'), ('neti', 'night'), ('nanga', 'with'), ('tranga', 'strong'), ('ai.', 'eye')]
[('A', 'DET'), ('mama', 'mother'), ('fon', 'beat'), ('a', 'DET'), ('pikin.', 'child')]
[('A', 'DET'), ('boi', 'boy'), ('lobi', 'love'), ('a', 'DET'), ('umapikin.', 'girl')]
[('A', '3SG'), ('téi', 'take'), ('dí', 'DEF.SG'), ('páu', 'stick'), ('páá.', 'quick')]
[('Wojo', 'eye'), ('u', 'for'), ('mi', '1SG'), ('á', 'NEG'), ('sa', 'M'), ('kai', 'fall'), ('ku', 'with'), ('di', 'DEF.SG'), ('faja.', 'fire')]
[('Di', 'DEF.SG'), ('mujɛɛ', 'woman'), ('naki', 'hit'), ('di', 'DEF.SG'), ('womi.', 'man')]
[('Den', 'DET.PL'), ('pikinnenge', 'child'), ('e', 'IPFV'), ('lobi', 'love/like'), ('switi', 'sweet'), ('sii.', 'seeds')]
[('kooknot', 'coconut'), ('bring', 'bring.forth'), ('ail', 'oil')]
[('Shi', '3

## Creating RDF

In [14]:
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, DC, DCTERMS
from rdflib.term import URIRef, Literal

### Reading Glottolog

In [17]:
glottolog = rdflib.Graph()
glottolog.parse('glottolog_language.n3', format='n3')

lexvo = rdflib.Namespace('http://lexvo.org/ontology#')

In [18]:
def get_iso_code(glottocode):
    glottolog_template = 'http://glottolog.org/resource/languoid/id/{lang_id}'
    glottocode_uri = URIRef(glottolog_template.format(lang_id=glottocode))
    
    return glottolog.value(subject=glottocode_uri, predicate=lexvo.iso639P3PCode)

### Creating APiCS graph

In [19]:
g = rdflib.Graph(identifier='http://purl.org/liodi/ligt/apics')

In [20]:
sentences = rdflib.Namespace('http://apics-online.info/sentences/')
apics = rdflib.Namespace('http://purl.org/liodi/ligt/apics/')
ligt = rdflib.Namespace('http://purl.org/ligt/ligt-0.2#')
nif = rdflib.Namespace('http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#')

In [21]:
g.bind('ligt', ligt)
g.bind('nif', nif)
g.bind('rdfs', RDFS)
g.bind('owl', OWL)
g.bind('dct', DCTERMS)

g.bind('apics', apics)
apics_doc = URIRef(apics)

In [24]:
g.set((apics_doc, RDF.type, ligt.Document))
g.set((apics_doc, ligt.hasUtterances, apics.examples))

g.set((apics.examples, RDF.type, ligt.InterlinearCollection))
# Probably it should also be a `dc:bibliographicCitation`
g.set((apics.examples, RDFS.comment, Literal(apics_dataset.properties['dc:bibliographicCitation'], lang="en")))

In [25]:
print(g.serialize(format='turtle').decode('utf-8'))

@prefix apics: <http://purl.org/liodi/ligt/apics/> .
@prefix ligt: <http://purl.org/ligt/ligt-0.2#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

apics: a ligt:Document ;
    ligt:hasUtterances apics:examples .

apics:examples a ligt:InterlinearCollection ;
    rdfs:comment "Michaelis, Susanne Maria & Maurer, Philippe & Haspelmath, Martin & Huber, Magnus (eds.) 2013. Atlas of Pidgin and Creole Language Structures Online. Leipzig: Max Planck Institute for Evolutionary Anthropology."@en .




In [26]:
examples[165]

{'id': '72-1',
 'orig_id': 'http://apics-online.info/sentences/72-1',
 'baseline': 'Jintaku karungku i bin gedim kengkaru mirlarrangyawung.',
 'glosses': [('Jintaku', 'one'),
  ('karu-ngku', 'child-ERG'),
  ('i', '3SG.SBJ'),
  ('bin', 'PST'),
  ('ged-im', 'shoot-TR'),
  ('kengkaru', 'kangaroo'),
  ('mirlarrang-yawung.', 'spear-COM')],
 'translation': 'One kid got the kangaroo with a spear.',
 'language': ('Gurindji Kriol', 'guri1249', None),
 'meta_language': None,
 'comment': 'The pronoun-verb order is SVO, as is the nominal-verb order.'}

In [27]:
def split_morphs(gloss):
    morphs = gloss[0].split('-')
    glosses = gloss[1].split('-')
    
    if len(morphs) == len(glosses) and len(glosses) > 1:
        return list(zip(morphs, glosses))
    
    return [gloss]

In [28]:
split_morphs(examples[48]['glosses'][4])

[('kat', 'cut'), ('im', 'TR'), ('bat', 'PROG')]

In [29]:
lang_template = '{lexvo}-x-{glottolog}'
glottolog_template = 'http://glottolog.org/resource/languoid/id/{lang_id}'

for example in examples:
    lang_lexvo = example['language'][2] if example['language'][2] else get_iso_code(example['language'][1])
    lang = lang_template.format(lexvo=lang_lexvo, glottolog=example['language'][1]) if lang_lexvo else ''
    
    # Utterance node
    ex = apics + URIRef('ex_{}'.format(example['id']))
    g.add((apics.examples, ligt.subSegment, ex))
    
    # Utterance properties
    g.add((ex, RDF.type, ligt.Utterance))
    g.add((ex, OWL.sameAs, URIRef(example['orig_id'])))
    g.add((ex, RDFS.label, Literal(example['baseline'], lang=lang)))
    if example['comment']:
        g.add((ex, RDFS.comment, Literal(example['comment'], lang="en")))
    g.add((ex, ligt.translation, Literal(example['translation'], lang="en")))
    
    # Utterance metadata
    g.add((ex, DCTERMS.language, URIRef(glottolog_template.format(lang_id=example['language'][1]))))
    
    # Tiers
    ex_tier_phrase = URIRef('{}_tier_phrase'.format(ex))
    ex_tier_morphs = URIRef('{}_tier_morphs'.format(ex))
    ex_tier_words = URIRef('{}_tier_words'.format(ex))
    
    g.add((ex, ligt.hasTier, ex_tier_phrase))
    g.add((ex, ligt.hasMorphs, ex_tier_morphs))
    g.add((ex, ligt.hasWords, ex_tier_words))
    
    # Phrase
    phrase = URIRef('{}_item_phrase_1'.format(ex))
    g.add((ex_tier_phrase, RDF.type, ligt.Tier))
    g.add((ex_tier_phrase, ligt.item, phrase))
    
    # Glosses
    
    if len(example['glosses']):
        next_word = URIRef('{}_item_word_{}'.format(ex, 1))
        
    for i, gloss in enumerate(example['glosses']):
        word = next_word
        next_word = URIRef('{}_item_word_{}'.format(ex, i+2)) if i < len(example['glosses']) - 1 else None
        
        g.add((ex_tier_words, ligt.item, word))
        g.add((word, RDF.type, ligt.Word))
        g.add((word, nif.subString, phrase))
        g.add((word, RDFS.label, Literal(gloss[0].strip('\\.,'), lang=lang)))
        
        if next_word:
            g.add((word, ligt.next, next_word))
        
        next_morph = URIRef('{}_item_morph_{}_{}'.format(ex, i+1, 1))
        subglosses = split_morphs(gloss)
        
        for j, subgloss in enumerate(subglosses):
            morph = next_morph
            next_morph = URIRef('{}_item_morph_{}_{}'.format(ex, i+1, j+2)) if j < len(subglosses) - 1 else None
            
            g.add((ex_tier_morphs, ligt.item, morph))
            g.add((morph, RDF.type, ligt.Morph))
            g.add((morph, nif.subString, word))
            g.add((morph, RDFS.label, Literal(subgloss[0].strip('\\.,'), lang=lang)))
            g.add((morph, ligt.gloss, Literal(subgloss[1].strip('\\.,'), lang="en")))
            
            if next_morph:
                g.add((morph, ligt.next, next_morph))

In [30]:
g.serialize(format='turtle', destination='../apics_ligt.ttl', encoding='utf-8')

## Mapping

We will use the list of labels from the dataset:

In [34]:
gloss_abbr = {}

with open('glossabbreviations.csv') as inp_file:
    inp_file.readline()
    for line in inp_file:
        tag, val = line.strip('\n\r,').split(',', 1)
        gloss_abbr[tag] = val

In [35]:
gloss_abbr

{'ACC': 'accusative',
 'ADV': 'adverb(ial)',
 'FUT': 'future',
 'BEN': 'benefactive',
 'REL': 'relative',
 'COP': 'copula',
 'NEG': '"negation, negative"',
 'TOP': 'topic',
 'FOC': 'focus',
 'DET': 'determiner',
 'QUOT': 'quotative',
 'VOC': 'vocative',
 'ERG': 'ergative',
 'COND': 'conditional',
 'DEM': 'demonstrative',
 'INTR': 'intransitive',
 'PTCP': 'participle',
 'DIST': 'distal',
 'PASS': 'passive',
 'DU': 'dual',
 'COM': 'comitative',
 'AGR': 'agreement',
 'DEF': 'definite',
 'EXCL': 'exclusive',
 'DAT': 'dative',
 'PRF': 'perfect',
 'IRR': 'irrealis',
 'ANTIP': 'antipassive',
 'IPFV': 'imperfective',
 'CVB': 'converb',
 'SBJV': 'subjunctive',
 'TR': 'transitive',
 'PROG': 'progressive',
 'CAUS': 'causative',
 'RECP': 'reciprocal',
 'APPL': 'applicative',
 'Q': 'question particle/marker',
 'PRS': 'present',
 'RES': 'resultative',
 'INCL': 'inclusive',
 'GEN': 'genitive',
 'PL': 'plural',
 'A': 'agent-like argument of canonical transitive verb',
 'OBL': 'oblique',
 'OBJ': 'objec

### Matching with MMoOn

In [37]:
mmoon = rdflib.Graph()
mmoon.parse('mmoon-core.ttl', format='n3')

<Graph identifier=Na97802aad3ae48a6a91760db37d46c7e (<class 'rdflib.graph.Graph'>)>

In [38]:
MMOON = rdflib.Namespace("http://mmoon.org/core/")

In [39]:
list(mmoon.subject_predicates(rdflib.term.Literal('N-', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))))

[(rdflib.term.URIRef('http://mmoon.org/core/MorphemicGloss_N-'),
  rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'))]

In [41]:
matches_mmoon = {}
for gloss, val in gloss_abbr.items():
    items = list(mmoon.subject_predicates(rdflib.term.Literal(gloss, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))))
    if len(items) > 0:
        matches_mmoon[gloss] = list(mmoon.subjects(URIRef('http://mmoon.org/core/hasAbstractIdentity'), items[0][0]))[0]

In [42]:
matches_mmoon

{'ACC': rdflib.term.URIRef('http://mmoon.org/core/Accusative'),
 'ADV': rdflib.term.URIRef('http://mmoon.org/core/Adverb'),
 'FUT': rdflib.term.URIRef('http://mmoon.org/core/Future'),
 'BEN': rdflib.term.URIRef('http://mmoon.org/core/Benefactive'),
 'COP': rdflib.term.URIRef('http://mmoon.org/core/Copula'),
 'NEG': rdflib.term.URIRef('http://mmoon.org/core/Negation'),
 'DET': rdflib.term.URIRef('http://mmoon.org/core/Determiner'),
 'QUOT': rdflib.term.URIRef('http://mmoon.org/core/Quotative'),
 'VOC': rdflib.term.URIRef('http://mmoon.org/core/Vocative'),
 'ERG': rdflib.term.URIRef('http://mmoon.org/core/Ergative'),
 'COND': rdflib.term.URIRef('http://mmoon.org/core/Conditional'),
 'PASS': rdflib.term.URIRef('http://mmoon.org/core/Passive'),
 'DU': rdflib.term.URIRef('http://mmoon.org/core/Dual'),
 'COM': rdflib.term.URIRef('http://mmoon.org/core/Comitative'),
 'DEF': rdflib.term.URIRef('http://mmoon.org/core/Definite'),
 'EXCL': rdflib.term.URIRef('http://mmoon.org/core/Exclusive'),
 '

### Matching with OLiA

In [43]:
unimorph = rdflib.Graph()
unimorph.parse('olia-unimorph.ttl', format='n3')

<Graph identifier=N5da481b3f493495299f973e7c93086f1 (<class 'rdflib.graph.Graph'>)>

In [44]:
list(unimorph.subject_predicates(rdflib.term.Literal('NOM')))

[(rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#NOM'),
  rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#hasLabel'))]

In [45]:
matches_olia = {}
matches_olia_add = {}
for gloss, val in gloss_abbr.items():
    items = list(unimorph.subject_predicates(rdflib.term.Literal(gloss)))
    if len(items) > 0:
        matches_olia[gloss] = items[0][0]
    else:
        items = list(unimorph.subject_predicates(rdflib.term.Literal(val)))
    if len(items) > 0:
        matches_olia_add[gloss] = items[0][0]

In [46]:
matches_olia_add

{'ACC': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#ACC'),
 'ADV': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#ADV'),
 'FUT': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#FUT'),
 'BEN': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#BEN'),
 'REL': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#REL'),
 'NEG': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#NEG'),
 'TOP': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#TOP'),
 'FOC': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#FOC'),
 'QUOT': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#QUOT'),
 'VOC': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#VOC'),
 'ERG': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#ERG'),
 'COND': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#COND'),
 'INTR': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#INTR'),
 'PTCP': rdflib.term.URIRef('http://purl.org/olia/unimorph.owl#V_PTCP'),
 'DIST': rdflib.term.URI

Number of unique matches in total:

In [49]:
len(set(matches_olia.keys()) | set(matches_mmoon.keys()) | set(matches_olia_add.keys()))

123

Number of matches with OLiA-Unimorph:

In [173]:
len(set(matches_olia.keys()) | set(matches_olia_add.keys()))

81

Number of matches with MMoOn:

In [50]:
len(matches_mmoon)

91

### Adding mappings

In [51]:
hasValue = apics + URIRef('hasValue')
hasValue

rdflib.term.URIRef('http://purl.org/liodi/ligt/apics/hasValue')

In [52]:
for label, uri in matches_mmoon.items():
    g.add((uri, hasValue, Literal(label, lang="en")))

for label, uri in matches_olia.items():
    g.add((uri, hasValue, Literal(label, lang="en")))

for label, uri in matches_olia_add.items():
    g.add((uri, hasValue, Literal(label, lang="en")))

In [53]:
g.serialize(format='turtle', destination='../apics_ligt-mapped.ttl', encoding='utf-8')

Number of still unmatched abbreviations:

In [55]:
len(set(gloss_abbr.keys()) - set(matches_olia.keys()) - set(matches_olia_add.keys()) - set(matches_mmoon.keys()))

144