Skip to content

Commit

Permalink
Improve mods
Browse files Browse the repository at this point in the history
1. Avoid uppercase after possible abbreviations
2. Avoid removing double punctuation for Spanish subtitles
  • Loading branch information
vitiko98 committed Oct 30, 2023
1 parent 386ac22 commit b36b378
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions libs/subzero/modification/mods/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@


ENGLISH = Language("eng")
SPANISH = (Language("spa"), Language("spa", "MX"))


class CommonFixes(SubtitleTextModification):
Expand Down Expand Up @@ -105,12 +106,16 @@ class CommonFixes(SubtitleTextModification):

# uppercase after dot
NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()) if len(match.group(1)) > 4 else r"%s%s" % (match.group(1), match.group(2)),
name="CM_uppercase_after_dot"),

# remove double interpunction
NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"),
name="CM_double_interpunct",
# Double interpunction is valid in Spanish
# https://www.rae.es/duda-linguistica/es-correcto-combinar-los-signos-de-interrogacion-y-exclamacion
supported=lambda p: p.language not in SPANISH),

# remove spaces before punctuation; don't break spaced ellipses
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),
Expand Down

0 comments on commit b36b378

Please sign in to comment.