Skip to content

Commit

Permalink
Added specialized Europarl reader
Browse files Browse the repository at this point in the history
Adds a tiny bit of Europarl-specific postprocessing to the raw text
files reader.
  • Loading branch information
markgw committed Jun 12, 2020
1 parent f2f5dc7 commit 3d5a262
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 0 deletions.
Empty file.
79 changes: 79 additions & 0 deletions src/python/pimlico/modules/input/text/europarl/info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Input reader for raw, unaligned text from Europarl corpus. This does not cover the automatically
aligned versions of the corpus that are typically used for Machine Translation.
The module takes care of a bit of extra processing specific to cleaning up the Europarl data.
.. seealso::
:mod:`~pimlico.modules.input.text.raw_text_files`, which this extends with special postprocessing.
"""
import re

from pimlico.core.modules.inputs import iterable_input_reader
from pimlico.datatypes.corpora import invalid_document
from pimlico.datatypes.corpora.data_points import RawTextDocumentType
from pimlico.modules.input.text.raw_text_files.info import data_ready as raw_data_ready, \
corpus_len as raw_corpus_len, corpus_iter as raw_corpus_iter, ModuleInfo as raw_module_info

language_indicator_re = re.compile(r"\([A-Z]{2}\) ")


def filter_line(line):
"""
Filter applied to each line. It either returns an updated version of the line, or None, in which
case the line is left out altogether.
"""
if line.startswith("<"):
# Simply filter out all lines beginning with '<', which are metadata
return None

# Some metadata-like text is also included at the start of lines, followed by ". - "
if ". - " in line:
__, __, line = line.partition(". - ")

# Remove -s and spaces from the start of lines
# Not sure why they're often there, but it's just how the transcripts were formatted
line = line.lstrip("- ")

# Skip lines that are fully surrounded by brackets: they're typically descriptions of what happened
# E.g. (Applause)
if line.startswith("(") and line.endswith(")"):
return None

# It's common for a speaker's first utterance to start with a marker indicating the original language
line = language_indicator_re.sub("", line)
return line


def filter_text(doc):
text = doc.text
lines = [filter_line(line) for line in text.splitlines()]
lines = [line for line in lines if line is not None]
# If a document has only a couple of lines in it, it's not a debate transcript, but just a heading
# Better to skip these
if len(lines) < 4:
return invalid_document("europarl_reader",
error_info="Too few lines in document: not a debate transcript")
return doc.data_point_type(text="\n".join(lines))


def corpus_iter(reader):
# Use the raw text files iterator and just post-process the docs
for doc_name, doc in raw_corpus_iter(reader):
yield doc_name, filter_text(doc)


ModuleInfo = iterable_input_reader(
raw_module_info.module_options,
RawTextDocumentType(),
raw_data_ready, raw_corpus_len, corpus_iter,
module_type_name="europarl_reader",
module_readable_name="Europarl corpus reader",
)
11 changes: 11 additions & 0 deletions test/data/pipelines/input/europarl.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[pipeline]
name=europarl
release=latest

# Read in some Europarl raw files, using the special Europarl reader
[europarl]
type=pimlico.modules.input.text.europarl
files=%(test_data_dir)s/datasets/europarl_en_raw/*

[store]
type=pimlico.modules.corpora.store
3 changes: 3 additions & 0 deletions test/data/pipelines/input/raw_text_files.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ release=latest
[europarl]
type=pimlico.modules.input.text.raw_text_files
files=%(test_data_dir)s/datasets/europarl_en_raw/*

[store]
type=pimlico.modules.corpora.store

0 comments on commit 3d5a262

Please sign in to comment.