Add support for datetime64 (#324)
* Add support for datetime64

* Add tests for nanoseconds

* Add tests for nanoseconds for the row method
xmnlab committed Jul 6, 2020
1 parent fcb98c7 · commit 868ebdb
Showing 4 changed files with 162 additions and 44 deletions.
6 changes: 6 additions & 0 deletions pymapd/_pandas_loaders.py
@@ -77,6 +77,8 @@ def get_mapd_type_from_object(data):
 
     if isinstance(val, str):
         return 'STR'
+    elif isinstance(val, np.datetime64):
+        return 'TIMESTAMP'
    elif isinstance(val, datetime.date):
        return 'DATE'
    elif isinstance(val, datetime.time):
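Note: the new branch hinges on np.datetime64 being tested before the datetime.date/datetime.time branches. A minimal standalone sketch of why (illustrative only, not library code):

import datetime
import numpy as np

# A numpy datetime64 scalar is not an instance of datetime.date, so the
# existing branches would not catch it; checking it first maps it to
# TIMESTAMP.
val = np.datetime64('2010-01-01 01:01:01.001001001')
assert isinstance(val, np.datetime64)
assert not isinstance(val, datetime.date)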
@@ -290,4 +292,8 @@ def build_row_desc(data, preserve_index=False):
             tct.col_type.encoding = 4
         elif tct.col_type.type in GEO_TYPE_ID:
             tct.col_type.precision = 23
+        elif tct.col_type.type == 8:
+            # force precision for timestamp with nanoseconds
+            if data[tct.col_name].dt.nanosecond.sum():
+                tct.col_type.precision = 9
     return row_desc
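The precision bump relies on pandas' .dt.nanosecond accessor, which exposes only the sub-microsecond component of a datetime64[ns] column. A small self-contained sketch of the detection idea (not library code):

import numpy as np
import pandas as pd

with_ns = pd.Series([np.datetime64('2010-01-01 01:01:01.001001001')])
without_ns = pd.Series([np.datetime64('2010-01-01 01:01:01.001001')])

# .dt.nanosecond is the 0-999 remainder beyond microseconds, so a nonzero
# sum means at least one value genuinely needs TIMESTAMP(9).
assert with_ns.dt.nanosecond.sum() == 1
assert without_ns.dt.nanosecond.sum() == 0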
30 changes: 16 additions & 14 deletions pymapd/_utils.py
@@ -24,20 +24,27 @@ def datetime_to_seconds(arr):
         if arr.dtype == 'int64':
             # The user has passed a unix timestamp already
             return arr
-        elif arr.dtype == 'object' or str(arr.dtype).startswith(
-            'datetime64[ns,'
+
+        if not (
+            arr.dtype == 'object'
+            or str(arr.dtype).startswith('datetime64[ns,')
         ):
-            # Convert to datetime64[ns] from string
-            # Or from datetime with timezone information
-            # Return timestamp in 'UTC'
-            arr = pd.to_datetime(arr, utc=True)
-        else:
             raise TypeError(
                 f"Invalid dtype '{arr.dtype}', expected one of: "
                 "datetime64[ns], int64 (UNIX epoch), "
                 "or object (string)"
             )
-    return arr.view('i8') // 10 ** 9  # ns -> s since epoch
+
+        # Convert to datetime64[ns] from string
+        # Or from datetime with timezone information
+        # Return timestamp in 'UTC'
+        arr = pd.to_datetime(arr, utc=True)
+        return arr.view('i8') // 10 ** 9  # ns -> s since epoch
+    else:
+        if arr.dt.nanosecond.sum():
+            return arr.view('i8')  # ns -> s since epoch
+        else:
+            return arr.view('i8') // 10 ** 9
 
 
 def datetime_in_precisions(epoch, precision):
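A usage sketch of the rewritten datetime_to_seconds branch for plain datetime64[ns] input: columns with a nonzero nanosecond component come back as the raw int64 nanosecond view, everything else as whole seconds. The assertions below restate the diff's behavior with hand-checked values (not library code):

import numpy as np
import pandas as pd

ns = pd.Series([np.datetime64('2010-01-01 00:00:00.000000001')])
s = pd.Series([np.datetime64('2010-01-01 00:00:00')])

# 2010-01-01 00:00:00 UTC is 1262304000 seconds after the epoch.
assert ns.view('i8')[0] == 1262304000 * 10 ** 9 + 1  # kept at ns resolution
assert s.view('i8')[0] // 10 ** 9 == 1262304000      # collapsed to seconds

Note that the inline comment on the nanosecond branch ('ns -> s since epoch') is carried over from the old return; the value actually stays in nanoseconds.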
@@ -52,12 +59,7 @@ def datetime_in_precisions(epoch, precision):
         seconds, modulus = divmod(epoch, 1000000)
         return base + datetime.timedelta(seconds=seconds, microseconds=modulus)
     elif precision == 9:
-        """ TODO(Wamsi): datetime.timedelta has support only till microseconds.
-        Need to find an alternative and fix nanoseconds
-        granularity"""
-        epoch /= 1000
-        seconds, modulus = divmod(epoch, 1000000)
-        return base + datetime.timedelta(seconds=seconds, microseconds=modulus)
+        return np.datetime64(epoch, 'ns')
     else:
         raise TypeError("Invalid timestamp precision: {}".format(precision))
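With precision 9, datetime_in_precisions now defers to numpy instead of datetime.timedelta (which bottoms out at microseconds). A quick self-contained check of that conversion, with an illustrative value:

import numpy as np

epoch_ns = 1262304000 * 10 ** 9 + 1  # 2010-01-01 00:00:00.000000001 UTC
# np.datetime64 accepts an integer count plus a unit, so the nanosecond
# remainder survives the round-trip.
assert np.datetime64(epoch_ns, 'ns') == np.datetime64(
    '2010-01-01T00:00:00.000000001'
)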

84 changes: 59 additions & 25 deletions tests/test_integration.py
@@ -800,6 +800,19 @@ def test_load_empty_table_arrow(self, con):
             'a int, b float, c text',
             id='scalar_values',
         ),
+        pytest.param(
+            pd.DataFrame(
+                {
+                    "a": [
+                        np.datetime64('2010-01-01 01:01:01.001001001'),
+                        np.datetime64('2011-01-01 01:01:01.001001001'),
+                        np.datetime64('2012-01-01 01:01:01.001001001'),
+                    ],
+                },
+            ),
+            'a TIMESTAMP(9)',
+            id='scalar_datetime_nanoseconds',
+        ),
         pytest.param(
             pd.DataFrame(
                 [
@@ -1129,10 +1142,29 @@ def test_select_null(self, con):
         'df, expected',
         [
             (
-                pd.DataFrame({"a": [1, 2], "b": [1.0, 2.0]}),
+                pd.DataFrame(
+                    {
+                        "a": [1, 2],
+                        "b": [1.0, 2.0],
+                        "c": [
+                            datetime.date(2016, 1, 1),
+                            datetime.date(2017, 1, 1),
+                        ],
+                        "d": [
+                            np.datetime64("2010-01-01T01:01:01.001001001"),
+                            np.datetime64("2011-01-01T01:01:01.001001001"),
+                        ],
+                    }
+                ),
                 {
-                    'a': {'type_code': TDatumType.BIGINT, 'is_array': False},
-                    'b': {'type_code': TDatumType.DOUBLE, 'is_array': False},
+                    'a': {'type': 'BIGINT', 'is_array': False},
+                    'b': {'type': 'DOUBLE', 'is_array': False},
+                    'c': {'type': 'DATE', 'is_array': False},
+                    'd': {
+                        'type': 'TIMESTAMP',
+                        'is_array': False,
+                        'precision': 9,
+                    },
                 },
             ),
             (
@@ -1158,10 +1190,10 @@ def test_select_null(self, con):
                     }
                 ),
                 {
-                    'a': {'type_code': TDatumType.BIGINT, 'is_array': True},
-                    'b': {'type_code': TDatumType.STR, 'is_array': False},
-                    'c': {'type_code': TDatumType.DOUBLE, 'is_array': True},
-                    'd': {'type_code': TDatumType.BIGINT, 'is_array': True},
+                    'a': {'type': 'BIGINT', 'is_array': True},
+                    'b': {'type': 'STR', 'is_array': False},
+                    'c': {'type': 'DOUBLE', 'is_array': True},
+                    'd': {'type': 'BIGINT', 'is_array': True},
                 },
             ),
             (
@@ -1209,26 +1241,20 @@ def test_select_null(self, con):
                     }
                 ),
                 {
-                    'a': {'type_code': TDatumType.POINT, 'is_array': True},
-                    'b': {
-                        'type_code': TDatumType.LINESTRING,
-                        'is_array': True,
-                    },
-                    'c': {'type_code': TDatumType.POLYGON, 'is_array': True},
-                    'd': {
-                        'type_code': TDatumType.MULTIPOLYGON,
-                        'is_array': True,
-                    },
+                    'a': {'type': 'POINT', 'is_array': True},
+                    'b': {'type': 'LINESTRING', 'is_array': True},
+                    'c': {'type': 'POLYGON', 'is_array': True},
+                    'd': {'type': 'MULTIPOLYGON', 'is_array': True},
                 },
             ),
         ],
     )
     def test_create_table(self, con, tmp_table, df, expected):
         con.create_table(tmp_table, df)
-        cur = con.execute('SELECT * FROM {}'.format(tmp_table))
-        # import pdb; pdb.set_trace()
-        for col in cur.description:
-            assert expected[col.name]['type_code'] == col.type_code
+        for col in con.get_table_details(tmp_table):
+            assert expected[col.name]['type'] == col.type
+            if 'precision' in expected[col.name]:
+                assert expected[col.name]['precision'] == col.precision
 
     def test_load_table_creates(self, con):
 
@@ -1244,7 +1270,11 @@ def test_load_table_creates(self, con):
                 "varchar_": ["a", "b"],
                 "text_": ['a', 'b'],
                 "time_": [datetime.time(0, 11, 59), datetime.time(13)],
-                "timestamp_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
+                "timestamp1_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
+                "timestamp2_": [
+                    np.datetime64("2010-01-01T01:01:01.001001001"),
+                    np.datetime64("2011-01-01T01:01:01.001001001"),
+                ],
                 "date_": [
                     datetime.date(2016, 1, 1),
                     datetime.date(2017, 1, 1),
@@ -1260,7 +1290,8 @@ def test_load_table_creates(self, con):
                 'varchar_',
                 'text_',
                 'time_',
-                'timestamp_',
+                'timestamp1_',
+                'timestamp2_',
                 'date_',
             ],
         )
@@ -1313,17 +1344,19 @@ def test_array_in_result_set(self, con):
         con.execute("DROP TABLE IF EXISTS test_lists;")
         con.execute(
             "CREATE TABLE IF NOT EXISTS test_lists \
-            (col1 TEXT, col2 TIMESTAMP[]);"
+            (col1 TEXT, col2 TIMESTAMP[], col3 TIMESTAMP(9));"
         )
 
         row = [
             (
                 "row1",
                 "{2019-03-02 00:00:00,2019-03-02 00:00:00,2019-03-02 00:00:00}",  # noqa
+                "2010-01-01T01:01:01.001001001",
             ),
             (
                 "row2",
                 "{2019-03-02 00:00:00,2019-03-02 00:00:00,2019-03-02 00:00:00}",  # noqa
+                "2011-01-01T01:01:01.001001001",
             ),
         ]
 
@@ -1338,6 +1371,7 @@ def test_array_in_result_set(self, con):
                     datetime.datetime(2019, 3, 2, 0, 0),
                     datetime.datetime(2019, 3, 2, 0, 0),
                 ],
+                np.datetime64("2010-01-01T01:01:01.001001001"),
             ),
             (
                 'row2',
@@ -1346,9 +1380,9 @@ def test_select_null(self, con):
                     datetime.datetime(2019, 3, 2, 0, 0),
                     datetime.datetime(2019, 3, 2, 0, 0),
                 ],
+                np.datetime64("2011-01-01T01:01:01.001001001"),
             ),
         ]
 
         assert ans == expected
 
         # date
86 changes: 81 additions & 5 deletions tests/test_loaders.py
@@ -47,11 +47,17 @@ def get_col_types(col_properties: dict):
 
 def get_expected(data, col_properties):
     expected = []
-    _map_col_types = {'INT': 'int_col', 'DOUBLE': 'real_col', 'STR': 'str_col'}
+    _map_col_types = {
+        'INT': 'int_col',
+        'DOUBLE': 'real_col',
+        'STR': 'str_col',
+        'TIMESTAMP': 'int_col',
+    }
     _map_col_types.update(
         {k: 'str_col' for k in _pandas_loaders.GEO_TYPE_NAMES}
    )
    isnull = data.isnull()
 
    for prop in col_properties:
        nulls = isnull[prop['name']].tolist()
        if prop['is_array']:
@@ -75,6 +81,15 @@ def get_expected(data, col_properties):
                 nulls=nulls,
             )
         else:
+            if prop['type'] == 'TIMESTAMP':
+                # convert datetime to epoch
+                if data[prop['name']].dt.nanosecond.sum():
+                    data[prop['name']] = data[prop['name']].astype(int)
+                else:
+                    data[prop['name']] = (
+                        data[prop['name']].astype(int) // 10 ** 9
+                    )
+
             col = TColumn(
                 data=TColumnData(
                     **{_map_col_types[prop['type']]: data[prop['name']]}
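The helper mirrors the column loader's epoch conversion: casting a datetime64[ns] Series to int yields nanoseconds since the epoch, which the non-nanosecond path floor-divides down to seconds. A standalone sketch of that cast (assumed here: pandas' plain int cast resolves to int64 on 64-bit platforms):

import numpy as np
import pandas as pd

col = pd.Series([np.datetime64('2010-01-01 00:00:00.000000001')])

assert col.astype(int)[0] == 1262304000 * 10 ** 9 + 1  # ns since epoch
assert (col.astype(int) // 10 ** 9)[0] == 1262304000   # whole seconds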
@@ -87,19 +102,51 @@ def get_expected(data, col_properties):
 
 class TestLoaders:
     def test_build_input_rows(self):
-        data = [(1, 'a'), (2, 'b')]
+        dt_microsecond_format = '%Y-%m-%d %H:%M:%S.%f'
+
+        def get_dt_nanosecond(v):
+            return np.datetime64('201{}-01-01 01:01:01.001001001'.format(v))
+
+        def get_dt_microsecond(v):
+            return datetime.datetime.strptime(
+                '201{}-01-01 01:01:01.001001'.format(v), dt_microsecond_format
+            )
+
+        data = [
+            (1, 'a', get_dt_nanosecond(1), get_dt_microsecond(1)),
+            (2, 'b', get_dt_nanosecond(2), get_dt_microsecond(2)),
+        ]
         result = _build_input_rows(data)
+        # breakpoint
         expected = [
             TStringRow(
                 cols=[
                     TStringValue(str_val='1', is_null=None),
                     TStringValue(str_val='a', is_null=None),
+                    TStringValue(
+                        str_val=get_dt_nanosecond(1).astype(str), is_null=None
+                    ),
+                    TStringValue(
+                        str_val=get_dt_microsecond(1).strftime(
+                            dt_microsecond_format
+                        ),
+                        is_null=None,
+                    ),
                 ]
             ),
             TStringRow(
                 cols=[
                     TStringValue(str_val='2', is_null=None),
                     TStringValue(str_val='b', is_null=None),
+                    TStringValue(
+                        str_val=get_dt_nanosecond(2).astype(str), is_null=None
+                    ),
+                    TStringValue(
+                        str_val=get_dt_microsecond(2).strftime(
+                            dt_microsecond_format
+                        ),
+                        is_null=None,
+                    ),
                 ]
             ),
         ]
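Row-wise loading serializes every value to a string, which is why the expected TStringValues differ between the two new columns: numpy's text form keeps all nine fractional digits, while strftime's %f stops at microseconds. A standalone illustration:

import datetime
import numpy as np

dt_ns = np.datetime64('2011-01-01 01:01:01.001001001')
dt_us = datetime.datetime(2011, 1, 1, 1, 1, 1, 1001)

# astype(str) on datetime64 keeps nanosecond resolution...
assert dt_ns.astype(str) == '2011-01-01T01:01:01.001001001'
# ...while %f on datetime.datetime is capped at six digits.
assert dt_us.strftime('%Y-%m-%d %H:%M:%S.%f') == '2011-01-01 01:01:01.001001'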
@@ -141,12 +188,33 @@ def test_build_input_rows_with_array(self):
                     'a': [[1, 1], [2, 2], [3, 3]],
                     'b': [[1.1, 1.1], [2.2, 2.2], [3.3, 3.3]],
                     'c': [1, 2, 3],
+                    'd': [
+                        np.datetime64('2010-01-01 01:01:01.001001001'),
+                        np.datetime64('2011-01-01 01:01:01.001001001'),
+                        np.datetime64('2012-01-01 01:01:01.001001001'),
+                    ],
+                    'e': [
+                        datetime.datetime.strptime(
+                            '2010-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                        datetime.datetime.strptime(
+                            '2011-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                        datetime.datetime.strptime(
+                            '2012-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                    ],
                 }
             ),
             [
                 {'name': 'a', 'type': 'INT', 'is_array': True},
                 {'name': 'b', 'type': 'DOUBLE', 'is_array': True},
                 {'name': 'c', 'type': 'INT', 'is_array': False},
+                {'name': 'd', 'type': 'TIMESTAMP', 'is_array': False},
+                {'name': 'e', 'type': 'TIMESTAMP', 'is_array': False},
             ],
             id='mult-cols-mix-array-not-null',
         ),
Expand Down Expand Up @@ -478,7 +546,11 @@ def test_build_row_desc(self):
'varchar_': ['a', 'b'],
'text_': ['a', 'b'],
'time_': [datetime.time(0, 11, 59), datetime.time(13)],
'timestamp_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
'timestamp1_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
'timestamp2_': [
np.datetime64('2016-01-01 01:01:01.001001001'),
np.datetime64('2017-01-01 01:01:01.001001001'),
],
'date_': [
datetime.date(2016, 1, 1),
datetime.date(2017, 1, 1),
@@ -494,7 +566,8 @@ def test_build_row_desc(self):
                 'varchar_',
                 'text_',
                 'time_',
-                'timestamp_',
+                'timestamp1_',
+                'timestamp2_',
                 'date_',
             ],
         )
@@ -525,7 +598,10 @@ def test_build_row_desc(self):
                 col_name='text_', col_type=TTypeInfo(type=6, encoding=4)
             ),
             TColumnType(col_name='time_', col_type=TTypeInfo(type=7)),
-            TColumnType(col_name='timestamp_', col_type=TTypeInfo(type=8)),
+            TColumnType(col_name='timestamp1_', col_type=TTypeInfo(type=8)),
+            TColumnType(
+                col_name='timestamp2_', col_type=TTypeInfo(type=8, precision=9)
+            ),
             TColumnType(col_name='date_', col_type=TTypeInfo(type=9)),
 

