From 3a9844c6e3b28bf769760b5c67098291b687335c Mon Sep 17 00:00:00 2001 From: Charles Nepote Date: Wed, 7 Sep 2022 16:23:25 +0200 Subject: [PATCH] Fix broken CSV (#2426) Should solve #2426. The script itself has not been tested, but I have run some successful tests with: `perl -ne 'print if /(\xE2\x80\xA8|\xE2\x80\xA9|[\000-\007\013-\037])/' en.openfoodfacts.org.products.csv` I suggest checking the result in production. --- scripts/export_database.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/export_database.pl b/scripts/export_database.pl index 8e0855aead398..d17fb8692987c 100755 --- a/scripts/export_database.pl +++ b/scripts/export_database.pl @@ -78,6 +78,9 @@ ($) # VT (013), FF (014 or \f), CR (015 or \r), etc. # See https://en.wikipedia.org/wiki/ASCII # +# Also replace UTF-8 Line Separator (U+2028) and Paragraph Separator (U+2029): +# \xE2\x80\xA8 and \xE2\x80\xA9 +# # TODO? put it in ProductOpener::Data & use it to control data input and output # Q: Do we have to *always* delete \n? # TODO? Send an email if bad-chars? @@ -85,10 +88,10 @@ sub sanitize_field_content { my $content = (shift(@_) // ""); my $LOG = shift(@_); my $log_msg = (shift(@_) // ""); - if ($content =~ /[\000-\037]/) { + if ($content =~ /(\xE2\x80\xA8|\xE2\x80\xA9|[\000-\037])/) { print $LOG "$log_msg $content\n\n---\n" if (defined $LOG); # TODO? replace the bad char by a space or by nothing? - $content =~ s/[\000-\037]+/ /g; + $content =~ s/(\xE2\x80\xA8|\xE2\x80\xA9|[\000-\037])+/ /g; }; return $content; }