From 60640bb95ac255d53bee635e20af1e9a493ed94f Mon Sep 17 00:00:00 2001
From: Christopher Groskopf
Date: Fri, 13 Apr 2012 08:37:10 -0500
Subject: [PATCH] Intelligently handle currency symbols in int and float columns. Closes #514

---
 panda/tests/__init__.py     |  2 +-
 panda/tests/test_dataset.py | 23 ++++++++++++++
 panda/tests/test_utils.py   | 61 +++++++++++++++++++++++++++++++++++++
 panda/tests/utils.py        |  1 +
 panda/utils/typecoercion.py | 16 ++++++++++
 test_data/test_money.csv    |  9 ++++++
 6 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 test_data/test_money.csv

diff --git a/panda/tests/__init__.py b/panda/tests/__init__.py
index 77f528a..798d469 100644
--- a/panda/tests/__init__.py
+++ b/panda/tests/__init__.py
@@ -18,6 +18,6 @@
 from panda.tests.test_solr import TestSolrJSONEncoder
 from panda.tests.test_related_upload import TestRelatedUpload
 from panda.tests.test_user import TestUser
-from panda.tests.test_utils import TestCSV, TestXLS, TestXLSX
+from panda.tests.test_utils import TestCSV, TestXLS, TestXLSX, TestTypeCoercion
 from panda.tests.test_views import TestLogin, TestActivate
 
diff --git a/panda/tests/test_dataset.py b/panda/tests/test_dataset.py
index 0fa0296..ebf11ec 100644
--- a/panda/tests/test_dataset.py
+++ b/panda/tests/test_dataset.py
@@ -575,3 +575,26 @@ def test_generate_typed_column_names_conflict(self):
 
         self.assertEqual([c['indexed_name'] for c in self.dataset.column_schema], ['column_int_test', None, 'column_unicode_test', 'column_unicode_test2'])
 
+    def test_reindex_with_currency(self):
+        upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_MONEY)
+        self.dataset.import_data(self.user, upload)
+
+        # Refresh from database
+        dataset = Dataset.objects.get(id=self.dataset.id)
+
+        dataset.reindex_data(self.user, typed_columns=[False, True], column_types=['unicode', 'float'])
+
+        # Refresh from database
+        dataset = Dataset.objects.get(id=self.dataset.id)
+
+        self.assertEqual([c['name'] for c in dataset.column_schema], ['product', 'price'])
+        self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'float'])
+        self.assertEqual([c['indexed'] for c in dataset.column_schema], [False, True])
+        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, 'column_float_price'])
+        self.assertEqual([c['min'] for c in dataset.column_schema], [None, 39.99])
+        self.assertEqual([c['max'] for c in dataset.column_schema], [None, 2599.00])
+
+        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:39.99')['response']['numFound'], 2)
+        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:[1500 TO *]')['response']['numFound'], 2)
+        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:*')['response']['numFound'], 8)
+
diff --git a/panda/tests/test_utils.py b/panda/tests/test_utils.py
index 95e9e19..5127839 100644
--- a/panda/tests/test_utils.py
+++ b/panda/tests/test_utils.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 
+from datetime import date, time, datetime
 import os.path
 
 from django.test import TestCase
 
 from panda import utils
+from panda.exceptions import TypeCoercionError
 from panda.tests import utils as test_utils
 
 class TestCSV(TestCase):
@@ -114,3 +116,62 @@ def test_xlsx_guess_column_types(self):
 
         self.assertEqual(guessed_types, ['unicode', 'date', 'int', 'bool', 'float', 'time', 'datetime', None, 'unicode'])
 
+class TestTypeCoercion(TestCase):
+    def setUp(self):
+        self.data_typer = utils.typecoercion.DataTyper([])
+        self.coerce_type = self.data_typer.coerce_type
+
+    def test_coerce_nulls(self):
+        self.assertEqual(self.coerce_type(None, bool), None)
+        self.assertEqual(self.coerce_type('N/A', int), None)
+        self.assertEqual(self.coerce_type('n/a', datetime), None)
+
+    def test_coerce_int_from_str(self):
+        self.assertEqual(self.coerce_type('171', int), 171)
+
+    def test_coerce_int_from_str_fails(self):
+        with self.assertRaises(TypeCoercionError):
+            self.assertEqual(self.coerce_type('#171', int), 171)
+
+    def test_coerce_int_from_unicode(self):
+        self.assertEqual(self.coerce_type(u'171', int), 171)
+
+    def test_coerce_int_from_currency_str(self):
+        self.assertEqual(self.coerce_type('$171,000', int), 171000)
+
+    def test_coerce_int_from_currency_float(self):
+        self.assertEqual(self.coerce_type(u'$171,000', int), 171000)
+
+    def test_coerce_float_from_str(self):
+        self.assertEqual(self.coerce_type('171.59', float), 171.59)
+
+    def test_coerce_float_from_unicode(self):
+        self.assertEqual(self.coerce_type(u'171.59', float), 171.59)
+
+    def test_coerce_float_from_currency_str(self):
+        self.assertEqual(self.coerce_type('$171,000.59', float), 171000.59)
+
+    def test_coerce_float_from_currency_float(self):
+        self.assertEqual(self.coerce_type(u'$171,000.59', float), 171000.59)
+
+    def test_coerce_bool_from_str(self):
+        self.assertEqual(self.coerce_type('True', bool), True)
+        self.assertEqual(self.coerce_type('true', bool), True)
+        self.assertEqual(self.coerce_type('T', bool), True)
+        self.assertEqual(self.coerce_type('yes', bool), True)
+
+    def test_coerce_bool_from_unicode(self):
+        self.assertEqual(self.coerce_type(u'True', bool), True)
+        self.assertEqual(self.coerce_type(u'true', bool), True)
+        self.assertEqual(self.coerce_type(u'T', bool), True)
+        self.assertEqual(self.coerce_type(u'yes', bool), True)
+
+    def test_coerce_datetime_from_str(self):
+        self.assertEqual(self.coerce_type('2011-4-13 8:28 AM', datetime), datetime(2011, 4, 13, 8, 28, 0))
+
+    def test_coerce_date_from_str(self):
+        self.assertEqual(self.coerce_type('2011-4-13', date), datetime(2011, 4, 13, 0, 0, 0))
+
+    def test_coerce_time_from_str(self):
+        self.assertEqual(self.coerce_type('8:28 AM', time), datetime(9999, 12, 31, 8, 28, 0))
+
diff --git a/panda/tests/utils.py b/panda/tests/utils.py
index aa2fdc9..7313d15 100644
--- a/panda/tests/utils.py
+++ b/panda/tests/utils.py
@@ -19,6 +19,7 @@
 TEST_EXCEL_XLSX_FILENAME = 'contributors.excel.xlsx'
 TEST_OO_XLSX_FILENAME = 'contributors.oo.xlsx'
 TEST_LATIN1_FILENAME = 'test_not_unicode_sample.csv'
+TEST_MONEY = 'test_money.csv'
 
 def setup_test_solr():
     settings.SOLR_DATA_CORE = 'data_test'
diff --git a/panda/utils/typecoercion.py b/panda/utils/typecoercion.py
index e95c89e..a92989c 100644
--- a/panda/utils/typecoercion.py
+++ b/panda/utils/typecoercion.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
 from datetime import date, time, datetime
 
@@ -17,6 +18,9 @@
     'time': time
 }
 
+CURRENCY_SYMBOLS_ASCII = '$,'
+CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE = dict([(ord(c), None) for c in u'$,€£₱'])  # unicode literal: keys must be code points, not UTF-8 bytes
+
 class DataTyper(object):
     """
     A callable object that adds typed columns to a Solr object based on a Dataset schema.
@@ -98,6 +102,12 @@ def coerce_type(self, value, normal_type):
                 return unicode(value)
             # int
             elif normal_type is int:
+                # Filter currency symbols
+                if isinstance(value, str):
+                    value = value.translate(None, CURRENCY_SYMBOLS_ASCII)
+                elif isinstance(value, unicode):
+                    value = value.translate(CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE)
+
                 return int(value)
             # bool
             elif normal_type is bool:
@@ -114,6 +124,12 @@ def coerce_type(self, value, normal_type):
                 return bool(value)
             # float
             elif normal_type is float:
+                # Filter currency symbols
+                if isinstance(value, str):
+                    value = value.translate(None, CURRENCY_SYMBOLS_ASCII)
+                elif isinstance(value, unicode):
+                    value = value.translate(CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE)
+
                 return float(value)
             # date, time, datetime
             elif normal_type in [date, time, datetime]:
diff --git a/test_data/test_money.csv b/test_data/test_money.csv
new file mode 100644
index 0000000..4312ea2
--- /dev/null
+++ b/test_data/test_money.csv
@@ -0,0 +1,9 @@
+product,price
+MacBook Air 11-inch,$999
+MacBook Air 13-inch,$1299
+MacBook Pro 13-inch,$1199
+MacBook Pro 15-inch,$1799
+MacBook Pro 17-inch,$2599
+iPhone 4S,$199
+AT&T Service,$39.99
+Verizon Service,$39.99