Skip to content

Commit

Permalink
feat(csv): support forcing dialect and skipping automatic detection
Browse files Browse the repository at this point in the history
This is useful in situations where the dialect is known, like in
WeblateOrg/weblate#11872
  • Loading branch information
nijel committed Jun 18, 2024
1 parent 4016f9a commit bfa290f
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 9 deletions.
26 changes: 26 additions & 0 deletions tests/translate/storage/test_csvl10n.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,32 @@ def test_utf_8(self):
assert store.units[0].source == "test"
assert store.units[0].target == "zkouška sirén"

def test_dialect(self):
    """Parse one CSV payload with automatic detection and with forced dialects.

    Every variant must yield the same single unit, regardless of whether
    the dialect is sniffed or passed explicitly via ``dialect=``.
    """
    payload = (
        '"location","source","target"\r\n'
        '"foo.c:1","test","zkouška sirén"\r\n'
    ).encode()

    # The empty mapping exercises automatic detection (no ``dialect``
    # argument at all); the remaining entries force a named dialect.
    parse_options = (
        {},
        {"dialect": "excel"},
        {"dialect": "unix"},
        {"dialect": "default"},
    )
    for options in parse_options:
        store = self.StoreClass()
        store.parse(payload, **options)
        assert len(store.units) == 1
        unit = store.units[0]
        assert unit.source == "test"
        assert unit.target == "zkouška sirén"

def test_utf_8_sig(self):
content = '"location";"source";"target"\r\n"foo.c:1";"test";"zkouška sirén"\r\n'.encode(
"utf-8-sig"
Expand Down
25 changes: 16 additions & 9 deletions translate/storage/csvl10n.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
or entire files (csvfile) for use with localisation.
"""

from __future__ import annotations

import csv

from translate.storage import base
Expand Down Expand Up @@ -291,7 +293,9 @@ def __init__(self, inputfile=None, fieldnames=None, encoding="auto"):
inputfile.close()
self.parse(csvsrc)

def parse(self, csvsrc, sample_length=1024):
def parse(
self, csvsrc, sample_length: int | None = 1024, *, dialect: None | str = None
):
if self._encoding == "auto":
text, encoding = self.detect_encoding(
csvsrc, default_encodings=["utf-8", "utf-16"]
Expand All @@ -305,14 +309,17 @@ def parse(self, csvsrc, sample_length=1024):
sniffer = csv.Sniffer()
sample = text[:sample_length] if sample_length else text

try:
self.dialect = sniffer.sniff(sample)
if self.dialect.quoting == csv.QUOTE_MINIMAL:
# HACKISH: most probably a default, not real detection
self.dialect.quoting = csv.QUOTE_ALL
self.dialect.doublequote = True
except csv.Error:
self.dialect = "default"
if dialect is not None:
self.dialect = dialect
else:
try:
self.dialect = sniffer.sniff(sample)
if self.dialect.quoting == csv.QUOTE_MINIMAL:
# HACKISH: most probably a default, not real detection
self.dialect.quoting = csv.QUOTE_ALL
self.dialect.doublequote = True
except csv.Error:
self.dialect = "default"

inputfile = csv.StringIO(text)
try:
Expand Down

0 comments on commit bfa290f

Please sign in to comment.