Skip to content

Commit

Permalink
Merge pull request #15 from letuananh/dev
Browse files (browse the repository at this point in the history)
rename ttlig to tig
  • Loading branch information
letuananh committed May 2, 2021
2 parents 8a73733 + 2dc41dd commit 628dbe0
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 67 deletions.
4 changes: 2 additions & 2 deletions speach/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-

'''
"""
speach - a Python library for managing, annotating, and converting natural language corpuses
using popular formats (CoNLL, ELAN, Praat, CSV, JSON, SQLite, VTT, Audacity, TTL, TIG, ISF)
'''
"""

# This code is a part of speach library: https://github.com/neocl/speach/
# :copyright: (c) 2018 Le Tuan Anh <tuananh.ke@gmail.com>
Expand Down
12 changes: 6 additions & 6 deletions speach/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from chirptext import chio
from chirptext.cli import CLIApp, setup_logging

from speach import ttl, TTLSQLite, ttlig, orgmode
from speach import ttl, TTLSQLite, tig, orgmode
from speach.elan import parse_eaf_stream

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -47,7 +47,7 @@ def get_doc_length(name, ctx):


def make_db(cli, args):
''' Convert TTL-TXT to TTL-SQLite '''
""" Convert TTL-TXT to TTL-SQLite """
print("Reading document ...")
ttl_doc = ttl.Document.read_ttl(args.ttl)
print("Sentences: {}".format(len(ttl_doc)))
Expand Down Expand Up @@ -78,7 +78,7 @@ def process_tig(cli, args):
sc = 0
ttl_writer = ttl.TxtWriter.from_path(args.output) if args.output else None
with chio.open(args.ttlig) as infile:
for sent in ttlig.read_stream_iter(infile):
for sent in tig.read_stream_iter(infile):
sc += 1
if ttl_writer is not None:
ttl_sent = sent.to_ttl()
Expand All @@ -94,7 +94,7 @@ def process_tig(cli, args):
output.print()
output.print()
with chio.open(args.ttlig) as infile:
for idx, sent in enumerate(ttlig.read_stream_iter(infile)):
for idx, sent in enumerate(tig.read_stream_iter(infile)):
sc += 1
output.print(sent.to_expex(default_ident=idx + 1))
output.print()
Expand All @@ -105,7 +105,7 @@ def process_tig(cli, args):


def jp_line_proc(line, iglines):
igrow = ttlig.text_to_igrow(line.replace('\u3000', ' ').strip())
igrow = tig.text_to_igrow(line.replace('\u3000', ' ').strip())
iglines.append(igrow.text)
iglines.append(igrow.tokens)
iglines.append("")
Expand Down Expand Up @@ -153,7 +153,7 @@ def make_text(sent, delimiter=' '):
for tk in sent:
furi = tk.find('furi', default=None)
if furi:
frags.append(ttlig.make_ruby_html(furi.label))
frags.append(tig.make_ruby_html(furi.label))
else:
frags.append(tk.text)
html_text = delimiter.join(frags) if frags else sent.text
Expand Down
41 changes: 17 additions & 24 deletions speach/ttlig.py → speach/tig.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

'''
"""
TTL Interlinear Gloss (TIG) format support
More information: https://en.wikipedia.org/wiki/Interlinear_gloss
Expand All @@ -11,7 +11,7 @@
a morphophonemic transliteration,
a word-by-word or morpheme-by-morpheme gloss, where morphemes within a word are separated by hyphens or other punctuation,
a free translation, which may be placed in a separate paragraph or on the facing page if the structures of the languages are too different for it to follow the text line by line.
'''
"""
# This code is a part of speach library: https://github.com/neocl/speach/
# :copyright: (c) 2018 Le Tuan Anh <tuananh.ke@gmail.com>
# :license: MIT, see LICENSE for more details.
Expand All @@ -29,15 +29,6 @@
from chirptext import ttl



# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------

def getLogger():
return logging.getLogger(__name__)


# ----------------------------------------------------------------------
# Models
# ----------------------------------------------------------------------
Expand All @@ -48,8 +39,10 @@ def getLogger():
# a conventional transliteration into the Latin alphabet,
# a phonetic transcription,
# a morphophonemic transliteration,
# a word-by-word or morpheme-by-morpheme gloss, where morphemes within a word are separated by hyphens or other punctuation,
# a free translation, which may be placed in a separate paragraph or on the facing page if the structures of the languages are too different for it to follow the text line by line.
# a word-by-word or morpheme-by-morpheme gloss, where morphemes within
# a word are separated by hyphens or other punctuation,
# a free translation, which may be placed in a separate paragraph or on the facing page
# if the structures of the languages are too different for it to follow the text line by line.
class IGRow(DataObject):
def __init__(self, text='', transliteration='', transcription='', morphtrans='', morphgloss='', wordgloss='', translation='', **kwargs):
"""
Expand Down Expand Up @@ -79,35 +72,35 @@ def to_ttl(self):
if self.morphtrans:
_morphtokens = tokenize(self.morphtrans)
if len(_morphtokens) != len(ttl_sent):
getLogger().warning("Morphophonemic transliteration line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
logging.getLogger(__name__).warning("Morphophonemic transliteration line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _morphtokens):
t.new_tag(m, tagtype='mtrans')
if self.pos:
_postokens = tokenize(self.pos)
if len(_postokens) != len(ttl_sent):
getLogger().warning("Part-of-speech line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
logging.getLogger(__name__).warning("Part-of-speech line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _postokens):
t.pos = m
if self.lemma:
_lemmas = tokenize(self.lemma)
if len(_lemmas) != len(ttl_sent):
getLogger().warning("Lemma line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
logging.getLogger(__name__).warning("Lemma line and tokens line are mismatched for sentence: {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _lemmas):
t.lemma = m
if self.morphgloss:
_glosstokens = tokenize(self.morphgloss)
if len(_glosstokens) != len(ttl_sent):
getLogger().warning("morpheme-by-morpheme gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
logging.getLogger(__name__).warning("morpheme-by-morpheme gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='mgloss')
if self.wordgloss:
_glosstokens = tokenize(self.wordgloss)
if len(_glosstokens) != len(ttl_sent):
getLogger().warning("word-by-word gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
logging.getLogger(__name__).warning("word-by-word gloss and tokens lines are mismatched for sentence {}".format(self.ident or self.ID or self.Id or self.id or self.text))
else:
for t, m in zip(ttl_sent, _glosstokens):
t.new_tag(m, tagtype='wgloss')
Expand All @@ -132,12 +125,12 @@ def to_expex(self, default_ident=''):
if tags:
lengths.append(make_expex_gloss(self.concept, glosses, tags.pop(0)))
else:
getLogger().warning("There are too many gloss lines in sentence {}. {}".format(sent_ident, self.text))
logging.getLogger(__name__).warning("There are too many gloss lines in sentence {}. {}".format(sent_ident, self.text))
# ensure that number of tokens are the same
if len(lengths) > 1:
for line_len in lengths[1:]:
if line_len != lengths[0]:
getLogger().warning("Inconsistent tokens and morphgloss for sentence {}. {} ({} v.s {})".format(sent_ident, self.text, line_len, lengths[0]))
logging.getLogger(__name__).warning("Inconsistent tokens and morphgloss for sentence {}. {} ({} v.s {})".format(sent_ident, self.text, line_len, lengths[0]))
break
lines.extend(glosses)
lines.append('\\glft \lit{{{}}}//'.format(escape_latex(self.text)))
Expand Down Expand Up @@ -259,7 +252,7 @@ def _parse_row(self, line_list, line_tags):
_tag = line[:tag_idx].strip()
_val = line[tag_idx + 1:].lstrip().rstrip('\r\n')
if _tag.lower() not in TTLIG.KNOWN_LABELS:
getLogger().warning("Unknown tag was used ({}): {}".format(_tag, _val))
logging.getLogger(__name__).warning("Unknown tag was used ({}): {}".format(_tag, _val))
line_dict[_tag] = _val
return IGRow(**line_dict)
else:
Expand All @@ -272,7 +265,7 @@ def read_iter(self, stream):
line_tags = self.row_format()
for tag in line_tags:
if tag.lower() not in TTLIG.KNOWN_LABELS + TTLIG.SPECIAL_LABELS:
getLogger().warning("Unknown label in header: {}".format(tag))
logging.getLogger(__name__).warning("Unknown label in header: {}".format(tag))
for row in IGStreamReader._iter_stream(stream):
yield self._parse_row(row, line_tags)

Expand All @@ -297,7 +290,7 @@ def _read_header(ig_stream):
key = m.group('key').strip()
value = m.group('value')
if key in meta:
getLogger().warning("Key {} is duplicated in the header".format(key))
logging.getLogger(__name__).warning("Key {} is duplicated in the header".format(key))
meta[key] = value
else:
# this line is weird
Expand Down Expand Up @@ -497,7 +490,7 @@ def parse(self, text):
if not chars.peep():
raise ValueError("Escape char ({}) cannot be the last character".format(self.escapechar))
elif chars.peep() and chars.peep().value not in (self.escapechar, self.delimiter):
getLogger().warning("Escape char ({}) should not be used for normal character ({}). This can be a potential bug in the data.".format(self.escapechar, chars.peep().value))
logging.getLogger(__name__).warning("Escape char ({}) should not be used for normal character ({}). This can be a potential bug in the data.".format(self.escapechar, chars.peep().value))
is_escaping = True
elif c == self.delimiter:
# flush
Expand Down

0 comments on commit 628dbe0

Please sign in to comment.