Commit

Merge pull request #1 from neocl/dev
Initial release
letuananh committed Apr 28, 2021
2 parents d7014c9 + b2c8807 commit b75818b
Showing 46 changed files with 4,045 additions and 3 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2021 New Computational Linguists
Copyright (c) 2018 Le Tuan Anh <tuananh.ke@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
include README.md
include requirements*.txt
recursive-include speach/data *.sql
84 changes: 82 additions & 2 deletions README.md
@@ -1,5 +1,85 @@
# speach

Managing, annotating, and converting natural language corpuses using popular formats (CoNLL, ELAN, Praat, CSV, JSON, SQLite, VTT, Audacity, TTL, TIG, ISF)
[![ReadTheDocs Badge](https://readthedocs.org/projects/speach/badge/?version=latest&style=plastic)](https://speach.readthedocs.io/)
[![Total alerts](https://img.shields.io/lgtm/alerts/g/letuananh/speach.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/letuananh/speach/alerts/)
[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/letuananh/speach.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/letuananh/speach/context:python)

Formerly: [texttaglib](https://pypi.org/project/texttaglib/)
Speach (formerly [texttaglib](https://pypi.org/project/texttaglib/)) is a Python 3 library for managing, annotating, and converting natural language corpora using popular formats (CoNLL, ELAN, Praat, CSV, JSON, SQLite, VTT, Audacity, TTL, TIG, ISF, etc.)

Main functions are:

- Text corpus management
- Manipulating [ELAN](https://archive.mpi.nl/tla/elan/download) transcription files directly in ELAN Annotation Format (EAF)
- TIG - a human-friendly interlinear gloss format for linguistic documentation
- Multiple storage formats (text, CSV, JSON, SQLite databases)

## Useful Links

- Speach documentation: https://speach.readthedocs.io/
- Source code: https://github.com/neocl/speach/

## Installation

Speach is available on [PyPI](https://pypi.org/project/speach/).

```bash
pip install speach
```

## ELAN support

The speach library includes a command-line tool for converting EAF files into CSV:

```bash
python -m speach eaf2csv input_elan_file.eaf -o output_file_name.csv
```

For more complex analyses, the speach Python API can be used to extract metadata and annotations from ELAN transcripts, for example:

```python
from speach import elan

# Test ELAN reader function in speach
eaf = elan.open_eaf('./test/data/test.eaf')

# accessing metadata
print(f"Author: {eaf.author} | Date: {eaf.date} | Format: {eaf.fileformat} | Version: {eaf.version}")
print(f"Media file: {eaf.media_file}")
print(f"Time units: {eaf.time_units}")
print(f"Media URL: {eaf.media_url} | MIME type: {eaf.mime_type}")
print(f"Media relative URL: {eaf.relative_media_url}")

# accessing tiers & annotations
for tier in eaf.tiers():
    print(f"{tier.ID} | Participant: {tier.participant} | Type: {tier.type_ref}")
    for ann in tier.annotations:
        print(f"{ann.ID.rjust(4, ' ')}. [{ann.from_ts.ts} -- {ann.to_ts.ts}] {ann.value}")
```

## Text corpus

```python
>>> from speach import ttl
>>> doc = ttl.Document('mydoc')
>>> sent = doc.new_sent("I am a sentence.")
>>> sent
#1: I am a sentence.
>>> sent.ID
1
>>> sent.text
'I am a sentence.'
>>> sent.import_tokens(["I", "am", "a", "sentence", "."])
>>> sent.tokens
[`I`<0:1>, `am`<2:4>, `a`<5:6>, `sentence`<7:15>, `.`<15:16>]
>>> doc.write_ttl()
```

The script above will generate the following corpus files:

```
-rw-rw-r--. 1 tuananh tuananh 0 3月 29 13:10 mydoc_concepts.txt
-rw-rw-r--. 1 tuananh tuananh 0 3月 29 13:10 mydoc_links.txt
-rw-rw-r--. 1 tuananh tuananh 20 3月 29 13:10 mydoc_sents.txt
-rw-rw-r--. 1 tuananh tuananh 0 3月 29 13:10 mydoc_tags.txt
-rw-rw-r--. 1 tuananh tuananh 58 3月 29 13:10 mydoc_tokens.txt
```
5 changes: 5 additions & 0 deletions cov.sh
@@ -0,0 +1,5 @@
#!/bin/bash

python3 -m coverage run --source speach --branch -m unittest discover -s test
python3 -m coverage html

68 changes: 68 additions & 0 deletions demo.py
@@ -0,0 +1,68 @@
import nltk
from speach import ttl
from speach.sqlite import TTLSQLite


# ------------------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------------------

def dump_sent(sent):
    ''' Print a sentence to console '''
    print("Raw: {}".format(sent.text))
    print("Tokens: {}".format(sent.tokens))
    print("Concepts: {}".format(sent.concepts))
    for c in sent.concepts:
        print(" > {}".format(c))
    print(sent.to_json())


# ------------------------------------------------------------------------------
# Demo script
# ------------------------------------------------------------------------------

# create a TTL database
db = TTLSQLite('data/demo.db')

# create a sample corpus (if needed)
encor = db.ensure_corpus(name='eng', title='English sentences')
# create a sample document in corpus 'eng' (if needed)
endoc = db.ensure_doc(name='eng1', title='English sample sentences #1', lang='eng', corpus=encor)

# get document by name
doc = db.doc.select_single('name=?', ('eng1',))
# if the document is empty, create a sample sentence inside
if not db.sent.select('docID=?', (doc.ID,)):
    sent = ttl.Sentence("I am a short sentence.")
    # tokenize the sentence with NLTK tokenizer
    tokens = nltk.word_tokenize(sent.text)
    sent.import_tokens(tokens)
    # add concepts
    sent.new_concept('01436003-a', 'short', tokens=[3])
    sent.new_concept('06285090-n', 'sentence', tokens=[4])
    # comment on sentences
    sent.comment = 'This is just an example to demonstrate how to use TTL.'
    # print it out
    dump_sent(sent)
    # save it to document 'eng1'
    sent.docID = doc.ID
    db.save_sent(sent)

# create a second sentence with MWE
calico_text = 'I like calico cat.'
calico_cat_synset = '02123242-n'
if not db.sent.select('text = ?', (calico_text,)):
    sent = ttl.Sentence(calico_text)
    sent.new_tag('三毛猫が好きです。', tagtype='jpn')
    sent.import_tokens(nltk.word_tokenize(sent.text))
    # create concepts
    sent.new_concept('01777210-v', 'like', tokens=[1])
    sent.new_concept(calico_cat_synset, 'calico cat', tokens=[2, 3])  # MWE -> tokens=[2,3]
    sent[2].new_tag('+', tagtype='MWE')
    sent[3].new_tag('+', tagtype='MWE')
    dump_sent(sent)
    # save it to database
    sent.docID = doc.ID
    db.save_sent(sent)

print("Done!")
23 changes: 23 additions & 0 deletions demo_batch_processing_eaf_files.py
@@ -0,0 +1,23 @@
from pathlib import Path
from speach import elan


transcript_folder = Path('./test/data/')
csv_data = []
# scan the transcript folder and count annotations mentioning BABYNAME in each EAF file
for child_file in transcript_folder.iterdir():
    if child_file.suffix.endswith('.eaf'):
        print(child_file.name)
        c = 0
        eaf = elan.open_eaf(child_file)
        for tier in eaf.roots:
            if tier.type_ref == 'Utterance':
                print(f" | {tier.ID} | Participant: {tier.participant} | Type: {tier.type_ref}")
                for ann in tier.annotations:
                    if 'BABYNAME' in ann.value:
                        c += 1
                        print(f" | -- {tier.ID} --> {tier.participant}: {ann.value}")
        print(c)
        csv_data.append((child_file.name, c))

# print a tab-separated summary of counts per file
for fn, c in csv_data:
    print(f"{fn}\t{c}")
29 changes: 29 additions & 0 deletions demo_elan.py
@@ -0,0 +1,29 @@
from speach import elan

# read an ELAN file
eaf = elan.open_eaf('./test/data/test.eaf')

# accessing metadata
print(f"Author: {eaf.author} | Date: {eaf.date} | Format: {eaf.fileformat} | Version: {eaf.version}")
print(f"Media file: {eaf.media_file}")
print(f"Time units: {eaf.time_units}")
print(f"Media URL: {eaf.media_url} | MIME type: {eaf.mime_type}")
print(f"Media relative URL: {eaf.relative_media_url}")

# accessing tiers & annotations
for tier in eaf.tiers():
    print(f"{tier.ID} | Participant: {tier.participant} | Type: {tier.type_ref}")
    for ann in tier.annotations:
        print(f"{ann.ID.rjust(4, ' ')}. [{ann.from_ts.ts} -- {ann.to_ts.ts}] {ann.value}")


# test parsing EAF files with nested tiers
elan2 = elan.open_eaf('./data/test_nested.eaf')
# accessing nested tiers
for tier in elan2.roots:
    print(f"{tier.ID} | Participant: {tier.participant} | Type: {tier.type_ref}")
    for ann in tier.annotations:
        print(f" -- {ann.ID.rjust(4, ' ')}. [{ann.from_ts.ts} -- {ann.to_ts.ts}] {ann.value}")
    for child_tier in tier.children:
        print(f" | {child_tier.ID} | Participant: {child_tier.participant} | Type: {child_tier.type_ref}")
        for ann in child_tier.annotations:
            print(f" |- {ann.ID.rjust(4, ' ')}. [{ann.from_ts.ts} -- {ann.to_ts.ts}] {ann.value}")
24 changes: 24 additions & 0 deletions docs/Makefile
@@ -0,0 +1,24 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


serve:
	python -m http.server 7000 --bind 127.0.0.1 --directory ${BUILDDIR}/dirhtml

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
42 changes: 42 additions & 0 deletions docs/api.rst
@@ -0,0 +1,42 @@
Speach APIs
===============

An overview of ``speach`` modules.

.. module:: speach

ELAN support
-------------

speach supports reading and manipulating multi-tier transcriptions from ELAN directly.

.. automodule:: speach.elan
   :members: open_eaf, parse_eaf_stream

.. autoclass:: ELANDoc
   :members:
   :member-order: groupwise

.. autoclass:: ELANTier
   :members:
   :member-order: groupwise
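
A minimal usage sketch (the path below points at the sample transcript under ``test/data``; the calls mirror the README example):

.. code-block:: python

   from speach import elan

   # open an ELAN transcription file
   eaf = elan.open_eaf('./test/data/test.eaf')
   # walk through every tier and its annotations
   for tier in eaf.tiers():
       print(f"{tier.ID} | Participant: {tier.participant} | Type: {tier.type_ref}")
       for ann in tier.annotations:
           print(f"[{ann.from_ts.ts} -- {ann.to_ts.ts}] {ann.value}")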

TTL Interlinear Gloss Format
----------------------------

TTLIG is a human-friendly interlinear gloss format that can be edited with any text editor.

.. module:: speach.ttlig

TTL SQLite
----------

TTL supports an SQLite storage format for managing large-scale corpora.

.. module:: speach.sqlite
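
A short sketch following ``demo.py`` from this release (the database path and the corpus/document names are examples only):

.. code-block:: python

   from speach import ttl
   from speach.sqlite import TTLSQLite

   # create or open a TTL SQLite database
   db = TTLSQLite('data/demo.db')
   # make sure a sample corpus and document exist
   corpus = db.ensure_corpus(name='eng', title='English sentences')
   db.ensure_doc(name='eng1', title='English sample sentences #1',
                 lang='eng', corpus=corpus)
   # fetch the document record and attach a new sentence to it
   doc = db.doc.select_single('name=?', ('eng1',))
   sent = ttl.Sentence("I am a short sentence.")
   sent.docID = doc.ID
   db.save_sent(sent)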

WebVTT
------

Speach supports manipulating the Web Video Text Tracks format (Web VTT).
Read more on the :ref:`page_vtt` page.
13 changes: 13 additions & 0 deletions docs/api_vtt.rst
@@ -0,0 +1,13 @@
.. _page_vtt:

Web VTT APIs
============

Speach supports Web VTT, the Web Video Text Tracks format.
Read more about it at: https://www.w3.org/2013/07/webvtt.html

APIs
----

.. automodule:: speach.vtt
   :members: sec2ts, ts2sec
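
A doctest-style sketch of the two documented helpers. Only the names ``sec2ts`` and ``ts2sec`` come from the module above; the exact timestamp formatting in the comments is an assumption:

.. code-block:: python

   from speach import vtt

   # convert a number of seconds into a WebVTT timestamp string and back again
   ts = vtt.sec2ts(90.5)    # assumed to yield something like '00:01:30.500'
   secs = vtt.ts2sec(ts)    # assumed to convert a timestamp string back to seconds
   print(ts, secs)
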
53 changes: 53 additions & 0 deletions docs/conf.py
@@ -0,0 +1,53 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../'))


# -- Project information -----------------------------------------------------

project = 'speach'
copyright = '2018, Le Tuan Anh <tuananh.ke@gmail.com>'
author = 'Le Tuan Anh'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest']
# -- Highlight code block -----------------
pygments_style = 'sphinx'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
