Permalink
Browse files

csv plugin: improve newline detection - closes #1497

  • Loading branch information...
1 parent 91a932d commit fca564614f10002e11afe3ceff6d175be1f5c6ba Dane Springmeyer committed Sep 21, 2012
Showing with 47 additions and 23 deletions.
  1. +2 −0 CHANGELOG.md
  2. +6 −20 plugins/input/csv/csv_datasource.cpp
  3. +39 −3 tests/python_tests/csv_test.py
View
@@ -8,6 +8,8 @@ For a complete change history, see the git log.
## Future
+- Improved detection of newlines in CSV files - now more robust in the face of mixed newline types (#1497)
+
- Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477)
- Support for encoding `literal` postgres types as strings 69fb17cd3/#1466
@@ -172,33 +172,19 @@ void csv_datasource::parse_csv(T & stream,
// autodetect newlines
char newline = '\n';
bool has_newline = false;
- int newline_count = 0;
- int carriage_count = 0;
- for (unsigned idx = 0; idx < file_length_; idx++)
+ for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++)
{
char c = static_cast<char>(stream.get());
- if (c == '\n')
+ if (c == '\r')
{
- ++newline_count;
+ newline = '\r';
has_newline = true;
+ break;
}
- else if (c == '\r')
+ if (c == '\n')
{
- ++carriage_count;
has_newline = true;
- }
- // read at least 2000 bytes before testing
- if (idx == file_length_-1 || idx > 4000)
- {
- if (newline_count > carriage_count)
- {
- break;
- }
- else if (carriage_count > newline_count)
- {
- newline = '\r';
- break;
- }
+ break;
}
}
@@ -206,7 +206,7 @@ def test_quoted_numbers(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
- def test_windows_newlines(**kwargs):
+ def test_reading_windows_newlines(**kwargs):
ds = get_csv_ds('windows_newlines.csv')
eq_(len(ds.fields()),3)
feats = ds.all_features()
@@ -222,8 +222,8 @@ def test_windows_newlines(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
- def test_mac_newlines(**kwargs):
- ds = get_csv_ds('windows_newlines.csv')
+ def test_reading_mac_newlines(**kwargs):
+ ds = get_csv_ds('mac_newlines.csv')
eq_(len(ds.fields()),3)
feats = ds.all_features()
eq_(len(feats),1)
@@ -238,6 +238,42 @@ def test_mac_newlines(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')
+ def check_newlines(filename):
+ ds = get_csv_ds(filename)
+ eq_(len(ds.fields()),3)
+ feats = ds.all_features()
+ eq_(len(feats),1)
+ fs = ds.featureset()
+ feat = fs.next()
+ eq_(feat['x'],0)
+ eq_(feat['y'],0)
+ eq_(feat['line'],'many\n lines\n of text\n with unix newlines')
+ desc = ds.describe()
+ eq_(desc['geometry_type'],mapnik.DataGeometryType.Point)
+ eq_(desc['name'],'csv')
+ eq_(desc['type'],mapnik.DataType.Vector)
+ eq_(desc['encoding'],'utf-8')
+
+ def test_mixed_mac_unix_newlines(**kwargs):
+ check_newlines('mac_newlines_with_unix_inline.csv')
+
+ def test_mixed_mac_unix_newlines_escaped(**kwargs):
+ check_newlines('mac_newlines_with_unix_inline_escaped.csv')
+
+ # Too hard to support this case
+ #def test_mixed_unix_windows_newlines(**kwargs):
+ # check_newlines('unix_newlines_with_windows_inline.csv')
+
+ # Too hard to support this case
+ #def test_mixed_unix_windows_newlines_escaped(**kwargs):
+ # check_newlines('unix_newlines_with_windows_inline_escaped.csv')
+
+ def test_mixed_windows_unix_newlines(**kwargs):
+ check_newlines('windows_newlines_with_unix_inline.csv')
+
+ def test_mixed_windows_unix_newlines_escaped(**kwargs):
+ check_newlines('windows_newlines_with_unix_inline_escaped.csv')
+
def test_tabs(**kwargs):
ds = get_csv_ds('tabs_in_csv.csv')
eq_(len(ds.fields()),3)

0 comments on commit fca5646

Please sign in to comment.