Skip to content

Commit

Permalink
csv plugin: improve newline detection - closes #1497
Browse files Browse the repository at this point in the history
  • Loading branch information
Dane Springmeyer committed Sep 21, 2012
1 parent 91a932d commit fca5646
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ For a complete change history, see the git log.


## Future ## Future


- Improved detection of newlines in CSV files - now more robust in the face of mixed newline types (#1497)

- Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477) - Allow style level compositing operations to work outside of featureset extents across tiled requests (#1477)


- Support for encoding `literal` postgres types as strings 69fb17cd3/#1466 - Support for encoding `literal` postgres types as strings 69fb17cd3/#1466
Expand Down
26 changes: 6 additions & 20 deletions plugins/input/csv/csv_datasource.cpp
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -172,33 +172,19 @@ void csv_datasource::parse_csv(T & stream,
// autodetect newlines // autodetect newlines
char newline = '\n'; char newline = '\n';
bool has_newline = false; bool has_newline = false;
int newline_count = 0; for (unsigned lidx = 0; lidx < file_length_ && lidx < 4000; lidx++)
int carriage_count = 0;
for (unsigned idx = 0; idx < file_length_; idx++)
{ {
char c = static_cast<char>(stream.get()); char c = static_cast<char>(stream.get());
if (c == '\n') if (c == '\r')
{ {
++newline_count; newline = '\r';
has_newline = true; has_newline = true;
break;
} }
else if (c == '\r') if (c == '\n')
{ {
++carriage_count;
has_newline = true; has_newline = true;
} break;
// read at least 2000 bytes before testing
if (idx == file_length_-1 || idx > 4000)
{
if (newline_count > carriage_count)
{
break;
}
else if (carriage_count > newline_count)
{
newline = '\r';
break;
}
} }
} }


Expand Down
42 changes: 39 additions & 3 deletions tests/python_tests/csv_test.py
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def test_quoted_numbers(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8') eq_(desc['encoding'],'utf-8')


def test_windows_newlines(**kwargs): def test_reading_windows_newlines(**kwargs):
ds = get_csv_ds('windows_newlines.csv') ds = get_csv_ds('windows_newlines.csv')
eq_(len(ds.fields()),3) eq_(len(ds.fields()),3)
feats = ds.all_features() feats = ds.all_features()
Expand All @@ -222,8 +222,8 @@ def test_windows_newlines(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8') eq_(desc['encoding'],'utf-8')


def test_mac_newlines(**kwargs): def test_reading_mac_newlines(**kwargs):
ds = get_csv_ds('windows_newlines.csv') ds = get_csv_ds('mac_newlines.csv')
eq_(len(ds.fields()),3) eq_(len(ds.fields()),3)
feats = ds.all_features() feats = ds.all_features()
eq_(len(feats),1) eq_(len(feats),1)
Expand All @@ -238,6 +238,42 @@ def test_mac_newlines(**kwargs):
eq_(desc['type'],mapnik.DataType.Vector) eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8') eq_(desc['encoding'],'utf-8')


def check_newlines(filename):
ds = get_csv_ds(filename)
eq_(len(ds.fields()),3)
feats = ds.all_features()
eq_(len(feats),1)
fs = ds.featureset()
feat = fs.next()
eq_(feat['x'],0)
eq_(feat['y'],0)
eq_(feat['line'],'many\n lines\n of text\n with unix newlines')
desc = ds.describe()
eq_(desc['geometry_type'],mapnik.DataGeometryType.Point)
eq_(desc['name'],'csv')
eq_(desc['type'],mapnik.DataType.Vector)
eq_(desc['encoding'],'utf-8')

def test_mixed_mac_unix_newlines(**kwargs):
check_newlines('mac_newlines_with_unix_inline.csv')

def test_mixed_mac_unix_newlines_escaped(**kwargs):
check_newlines('mac_newlines_with_unix_inline_escaped.csv')

# Too hard to support this case
#def test_mixed_unix_windows_newlines(**kwargs):
# check_newlines('unix_newlines_with_windows_inline.csv')

# Too hard to support this case
#def test_mixed_unix_windows_newlines_escaped(**kwargs):
# check_newlines('unix_newlines_with_windows_inline_escaped.csv')

def test_mixed_windows_unix_newlines(**kwargs):
check_newlines('windows_newlines_with_unix_inline.csv')

def test_mixed_windows_unix_newlines_escaped(**kwargs):
check_newlines('windows_newlines_with_unix_inline_escaped.csv')

def test_tabs(**kwargs): def test_tabs(**kwargs):
ds = get_csv_ds('tabs_in_csv.csv') ds = get_csv_ds('tabs_in_csv.csv')
eq_(len(ds.fields()),3) eq_(len(ds.fields()),3)
Expand Down

0 comments on commit fca5646

Please sign in to comment.