Skip to content

Commit

Permalink
Intelligently handle currency symbols in int and float columns. Closes
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Apr 13, 2012
1 parent a88a5f8 commit 60640bb
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 1 deletion.
2 changes: 1 addition & 1 deletion panda/tests/__init__.py
Expand Up @@ -18,6 +18,6 @@
from panda.tests.test_solr import TestSolrJSONEncoder
from panda.tests.test_related_upload import TestRelatedUpload
from panda.tests.test_user import TestUser
from panda.tests.test_utils import TestCSV, TestXLS, TestXLSX
from panda.tests.test_utils import TestCSV, TestXLS, TestXLSX, TestTypeCoercion
from panda.tests.test_views import TestLogin, TestActivate

23 changes: 23 additions & 0 deletions panda/tests/test_dataset.py
Expand Up @@ -575,3 +575,26 @@ def test_generate_typed_column_names_conflict(self):

self.assertEqual([c['indexed_name'] for c in self.dataset.column_schema], ['column_int_test', None, 'column_unicode_test', 'column_unicode_test2'])

def test_reindex_with_currency(self):
    """
    Import the currency fixture, reindex its price column as a typed
    float, then check the stored column schema and the Solr index.
    """
    money_upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_MONEY)
    self.dataset.import_data(self.user, money_upload)

    # Reload from the database before reindexing
    imported = Dataset.objects.get(id=self.dataset.id)
    imported.reindex_data(self.user, typed_columns=[False, True], column_types=['unicode', 'float'])

    # Reload from the database to pick up the reindexed schema
    reindexed = Dataset.objects.get(id=self.dataset.id)

    # Schema key -> expected per-column values, in column order
    expected_schema = (
        ('name', ['product', 'price']),
        ('type', ['unicode', 'float']),
        ('indexed', [False, True]),
        ('indexed_name', [None, 'column_float_price']),
        ('min', [None, 39.99]),
        ('max', [None, 2599.00]),
    )

    for key, expected_values in expected_schema:
        self.assertEqual([column[key] for column in reindexed.column_schema], expected_values)

    # Solr query -> expected number of matching rows
    expected_hits = (
        ('column_float_price:39.99', 2),
        ('column_float_price:[1500 TO *]', 2),
        ('column_float_price:*', 8),
    )

    for query, hit_count in expected_hits:
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, query)['response']['numFound'], hit_count)

61 changes: 61 additions & 0 deletions panda/tests/test_utils.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python

from datetime import date, time, datetime
import os.path

from django.test import TestCase

from panda import utils
from panda.exceptions import TypeCoercionError
from panda.tests import utils as test_utils

class TestCSV(TestCase):
Expand Down Expand Up @@ -114,3 +116,62 @@ def test_xlsx_guess_column_types(self):

self.assertEqual(guessed_types, ['unicode', 'date', 'int', 'bool', 'float', 'time', 'datetime', None, 'unicode'])

class TestTypeCoercion(TestCase):
    """
    Tests for DataTyper.coerce_type: null markers, ints and floats
    (including currency-formatted input), bools and date/time values.
    """
    def setUp(self):
        # Built with an empty list; these tests only call coerce_type.
        self.data_typer = utils.typecoercion.DataTyper([])
        self.coerce_type = self.data_typer.coerce_type

    def test_coerce_nulls(self):
        """None and "N/A" markers coerce to None regardless of target type."""
        self.assertEqual(self.coerce_type(None, bool), None)
        self.assertEqual(self.coerce_type('N/A', int), None)
        self.assertEqual(self.coerce_type('n/a', datetime), None)

    def test_coerce_int_from_str(self):
        self.assertEqual(self.coerce_type('171', int), 171)

    def test_coerce_int_from_str_fails(self):
        """A non-currency symbol in an int value raises TypeCoercionError."""
        # Call coerce_type directly: wrapping it in assertEqual would hide
        # a non-raising regression behind an unrelated equality failure.
        with self.assertRaises(TypeCoercionError):
            self.coerce_type('#171', int)

    def test_coerce_int_from_unicode(self):
        self.assertEqual(self.coerce_type(u'171', int), 171)

    def test_coerce_int_from_currency_str(self):
        self.assertEqual(self.coerce_type('$171,000', int), 171000)

    def test_coerce_int_from_currency_unicode(self):
        self.assertEqual(self.coerce_type(u'$171,000', int), 171000)

    def test_coerce_float_from_str(self):
        self.assertEqual(self.coerce_type('171.59', float), 171.59)

    def test_coerce_float_from_unicode(self):
        self.assertEqual(self.coerce_type(u'171.59', float), 171.59)

    def test_coerce_float_from_currency_str(self):
        self.assertEqual(self.coerce_type('$171,000.59', float), 171000.59)

    def test_coerce_float_from_currency_unicode(self):
        self.assertEqual(self.coerce_type(u'$171,000.59', float), 171000.59)

    def test_coerce_bool_from_str(self):
        self.assertEqual(self.coerce_type('True', bool), True)
        self.assertEqual(self.coerce_type('true', bool), True)
        self.assertEqual(self.coerce_type('T', bool), True)
        self.assertEqual(self.coerce_type('yes', bool), True)

    def test_coerce_bool_from_unicode(self):
        self.assertEqual(self.coerce_type(u'True', bool), True)
        self.assertEqual(self.coerce_type(u'true', bool), True)
        self.assertEqual(self.coerce_type(u'T', bool), True)
        self.assertEqual(self.coerce_type(u'yes', bool), True)

    def test_coerce_datetime_from_str(self):
        self.assertEqual(self.coerce_type('2011-4-13 8:28 AM', datetime), datetime(2011, 4, 13, 8, 28, 0))

    def test_coerce_date_from_str(self):
        # Dates are expected back as datetimes at midnight.
        self.assertEqual(self.coerce_type('2011-4-13', date), datetime(2011, 4, 13, 0, 0, 0))

    def test_coerce_time_from_str(self):
        # Times are expected back as datetimes on the placeholder date 9999-12-31.
        self.assertEqual(self.coerce_type('8:28 AM', time), datetime(9999, 12, 31, 8, 28, 0))

1 change: 1 addition & 0 deletions panda/tests/utils.py
Expand Up @@ -19,6 +19,7 @@
TEST_EXCEL_XLSX_FILENAME = 'contributors.excel.xlsx'
TEST_OO_XLSX_FILENAME = 'contributors.oo.xlsx'
TEST_LATIN1_FILENAME = 'test_not_unicode_sample.csv'
# CSV fixture whose price column holds dollar-prefixed values
TEST_MONEY = 'test_money.csv'

def setup_test_solr():
settings.SOLR_DATA_CORE = 'data_test'
Expand Down
16 changes: 16 additions & 0 deletions panda/utils/typecoercion.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import date, time, datetime

Expand All @@ -17,6 +18,9 @@
'time': time
}

# Characters deleted from byte strings before int/float coercion.
CURRENCY_SYMBOLS_ASCII = '$,'
# Code point -> None table for unicode.translate(). The literal must be a
# unicode string: iterating a Python 2 byte string would yield the separate
# UTF-8 bytes of the multi-byte symbols instead of their code points, so
# symbols such as the euro sign would never be removed.
CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE = dict((ord(c), None) for c in u'$,€£₱')

class DataTyper(object):
"""
A callable object that adds typed columns to a Solr object based on a Dataset schema.
Expand Down Expand Up @@ -98,6 +102,12 @@ def coerce_type(self, value, normal_type):
return unicode(value)
# int
elif normal_type is int:
# Filter currency symbols
if isinstance(value, str):
value = value.translate(None, CURRENCY_SYMBOLS_ASCII)
elif isinstance(value, unicode):
value = value.translate(CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE)

return int(value)
# bool
elif normal_type is bool:
Expand All @@ -114,6 +124,12 @@ def coerce_type(self, value, normal_type):
return bool(value)
# float
elif normal_type is float:
# Filter currency symbols
if isinstance(value, str):
value = value.translate(None, CURRENCY_SYMBOLS_ASCII)
elif isinstance(value, unicode):
value = value.translate(CURRENCY_SYMBOLS_UNICODE_TRANSLATE_TABLE)

return float(value)
# date, time, datetime
elif normal_type in [date, time, datetime]:
Expand Down
9 changes: 9 additions & 0 deletions test_data/test_money.csv
@@ -0,0 +1,9 @@
product,price
MacBook Air 11-inch,$999
MacBook Air 13-inch,$1299
MacBook Pro 13-inch,$1199
MacBook Pro 15-inch,$1799
MacBook Pro 17-inch,$2599
iPhone 4S,$199
AT&T Service,$39.99
Verizon Service,$39.99

0 comments on commit 60640bb

Please sign in to comment.