Add support for datetime64 (#324)
* Add support for datetime64

* Add tests for nanoseconds

* Add tests for nanoseconds for the row method
xmnlab committed Jul 6, 2020
1 parent fcb98c7 · commit 868ebdb
Showing 4 changed files with 162 additions and 44 deletions.
6 changes: 6 additions & 0 deletions pymapd/_pandas_loaders.py
@@ -77,6 +77,8 @@ def get_mapd_type_from_object(data):
 
     if isinstance(val, str):
         return 'STR'
+    elif isinstance(val, np.datetime64):
+        return 'TIMESTAMP'
    elif isinstance(val, datetime.date):
        return 'DATE'
    elif isinstance(val, datetime.time):
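Note: the new branch hinges on np.datetime64 being tested before the datetime.date/datetime.time branches. A minimal standalone sketch of why (illustrative only, not library code):

import datetime
import numpy as np

# A numpy datetime64 scalar is not an instance of datetime.date, so the
# existing branches would not catch it; checking it first maps it to
# TIMESTAMP.
val = np.datetime64('2010-01-01 01:01:01.001001001')
assert isinstance(val, np.datetime64)
assert not isinstance(val, datetime.date)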
@@ -290,4 +292,8 @@ def build_row_desc(data, preserve_index=False):
             tct.col_type.encoding = 4
         elif tct.col_type.type in GEO_TYPE_ID:
             tct.col_type.precision = 23
+        elif tct.col_type.type == 8:
+            # force precision for timestamp with nanoseconds
+            if data[tct.col_name].dt.nanosecond.sum():
+                tct.col_type.precision = 9
     return row_desc
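The precision bump relies on pandas' .dt.nanosecond accessor, which exposes only the sub-microsecond component of a datetime64[ns] column. A small self-contained sketch of the detection idea (not library code):

import numpy as np
import pandas as pd

with_ns = pd.Series([np.datetime64('2010-01-01 01:01:01.001001001')])
without_ns = pd.Series([np.datetime64('2010-01-01 01:01:01.001001')])

# .dt.nanosecond is the 0-999 remainder beyond microseconds, so a nonzero
# sum means at least one value genuinely needs TIMESTAMP(9).
assert with_ns.dt.nanosecond.sum() == 1
assert without_ns.dt.nanosecond.sum() == 0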
30 changes: 16 additions & 14 deletions pymapd/_utils.py
@@ -24,20 +24,27 @@ def datetime_to_seconds(arr):
         if arr.dtype == 'int64':
             # The user has passed a unix timestamp already
             return arr
-        elif arr.dtype == 'object' or str(arr.dtype).startswith(
-            'datetime64[ns,'
+
+        if not (
+            arr.dtype == 'object'
+            or str(arr.dtype).startswith('datetime64[ns,')
         ):
-            # Convert to datetime64[ns] from string
-            # Or from datetime with timezone information
-            # Return timestamp in 'UTC'
-            arr = pd.to_datetime(arr, utc=True)
-        else:
             raise TypeError(
                 f"Invalid dtype '{arr.dtype}', expected one of: "
                 "datetime64[ns], int64 (UNIX epoch), "
                 "or object (string)"
             )
-    return arr.view('i8') // 10 ** 9  # ns -> s since epoch
+
+        # Convert to datetime64[ns] from string
+        # Or from datetime with timezone information
+        # Return timestamp in 'UTC'
+        arr = pd.to_datetime(arr, utc=True)
+        return arr.view('i8') // 10 ** 9  # ns -> s since epoch
+    else:
+        if arr.dt.nanosecond.sum():
+            return arr.view('i8')  # ns -> s since epoch
+        else:
+            return arr.view('i8') // 10 ** 9
 
 
 def datetime_in_precisions(epoch, precision):
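A usage sketch of the rewritten datetime_to_seconds branch for plain datetime64[ns] input: columns with a nonzero nanosecond component come back as the raw int64 nanosecond view, everything else as whole seconds. The assertions below restate the diff's behavior with hand-checked values (not library code):

import numpy as np
import pandas as pd

ns = pd.Series([np.datetime64('2010-01-01 00:00:00.000000001')])
s = pd.Series([np.datetime64('2010-01-01 00:00:00')])

# 2010-01-01 00:00:00 UTC is 1262304000 seconds after the epoch.
assert ns.view('i8')[0] == 1262304000 * 10 ** 9 + 1  # kept at ns resolution
assert s.view('i8')[0] // 10 ** 9 == 1262304000      # collapsed to seconds

Note that the inline comment on the nanosecond branch ('ns -> s since epoch') is carried over from the old return; the value actually stays in nanoseconds.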
@@ -52,12 +59,7 @@ def datetime_in_precisions(epoch, precision):
         seconds, modulus = divmod(epoch, 1000000)
         return base + datetime.timedelta(seconds=seconds, microseconds=modulus)
     elif precision == 9:
-        """ TODO(Wamsi): datetime.timedelta has support only till microseconds.
-        Need to find an alternative and fix nanoseconds
-        granularity"""
-        epoch /= 1000
-        seconds, modulus = divmod(epoch, 1000000)
-        return base + datetime.timedelta(seconds=seconds, microseconds=modulus)
+        return np.datetime64(epoch, 'ns')
     else:
         raise TypeError("Invalid timestamp precision: {}".format(precision))
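With precision 9, datetime_in_precisions now defers to numpy instead of datetime.timedelta (which bottoms out at microseconds). A quick self-contained check of that conversion, with an illustrative value:

import numpy as np

epoch_ns = 1262304000 * 10 ** 9 + 1  # 2010-01-01 00:00:00.000000001 UTC
# np.datetime64 accepts an integer count plus a unit, so the nanosecond
# remainder survives the round-trip.
assert np.datetime64(epoch_ns, 'ns') == np.datetime64(
    '2010-01-01T00:00:00.000000001'
)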

84 changes: 59 additions & 25 deletions tests/test_integration.py
@@ -800,6 +800,19 @@ def test_load_empty_table_arrow(self, con):
             'a int, b float, c text',
             id='scalar_values',
         ),
+        pytest.param(
+            pd.DataFrame(
+                {
+                    "a": [
+                        np.datetime64('2010-01-01 01:01:01.001001001'),
+                        np.datetime64('2011-01-01 01:01:01.001001001'),
+                        np.datetime64('2012-01-01 01:01:01.001001001'),
+                    ],
+                },
+            ),
+            'a TIMESTAMP(9)',
+            id='scalar_datetime_nanoseconds',
+        ),
         pytest.param(
             pd.DataFrame(
                 [
@@ -1129,10 +1142,29 @@ def test_select_null(self, con):
         'df, expected',
         [
             (
-                pd.DataFrame({"a": [1, 2], "b": [1.0, 2.0]}),
+                pd.DataFrame(
+                    {
+                        "a": [1, 2],
+                        "b": [1.0, 2.0],
+                        "c": [
+                            datetime.date(2016, 1, 1),
+                            datetime.date(2017, 1, 1),
+                        ],
+                        "d": [
+                            np.datetime64("2010-01-01T01:01:01.001001001"),
+                            np.datetime64("2011-01-01T01:01:01.001001001"),
+                        ],
+                    }
+                ),
                 {
-                    'a': {'type_code': TDatumType.BIGINT, 'is_array': False},
-                    'b': {'type_code': TDatumType.DOUBLE, 'is_array': False},
+                    'a': {'type': 'BIGINT', 'is_array': False},
+                    'b': {'type': 'DOUBLE', 'is_array': False},
+                    'c': {'type': 'DATE', 'is_array': False},
+                    'd': {
+                        'type': 'TIMESTAMP',
+                        'is_array': False,
+                        'precision': 9,
+                    },
                 },
             ),
             (
@@ -1158,10 +1190,10 @@ def test_select_null(self, con):
                     }
                 ),
                 {
-                    'a': {'type_code': TDatumType.BIGINT, 'is_array': True},
-                    'b': {'type_code': TDatumType.STR, 'is_array': False},
-                    'c': {'type_code': TDatumType.DOUBLE, 'is_array': True},
-                    'd': {'type_code': TDatumType.BIGINT, 'is_array': True},
+                    'a': {'type': 'BIGINT', 'is_array': True},
+                    'b': {'type': 'STR', 'is_array': False},
+                    'c': {'type': 'DOUBLE', 'is_array': True},
+                    'd': {'type': 'BIGINT', 'is_array': True},
                 },
             ),
             (
@@ -1209,26 +1241,20 @@ def test_select_null(self, con):
                     }
                 ),
                 {
-                    'a': {'type_code': TDatumType.POINT, 'is_array': True},
-                    'b': {
-                        'type_code': TDatumType.LINESTRING,
-                        'is_array': True,
-                    },
-                    'c': {'type_code': TDatumType.POLYGON, 'is_array': True},
-                    'd': {
-                        'type_code': TDatumType.MULTIPOLYGON,
-                        'is_array': True,
-                    },
+                    'a': {'type': 'POINT', 'is_array': True},
+                    'b': {'type': 'LINESTRING', 'is_array': True},
+                    'c': {'type': 'POLYGON', 'is_array': True},
+                    'd': {'type': 'MULTIPOLYGON', 'is_array': True},
                 },
             ),
         ],
     )
     def test_create_table(self, con, tmp_table, df, expected):
         con.create_table(tmp_table, df)
-        cur = con.execute('SELECT * FROM {}'.format(tmp_table))
-        # import pdb; pdb.set_trace()
-        for col in cur.description:
-            assert expected[col.name]['type_code'] == col.type_code
+        for col in con.get_table_details(tmp_table):
+            assert expected[col.name]['type'] == col.type
+            if 'precision' in expected[col.name]:
+                assert expected[col.name]['precision'] == col.precision
 
     def test_load_table_creates(self, con):
 
@@ -1244,7 +1270,11 @@ def test_load_table_creates(self, con):
                 "varchar_": ["a", "b"],
                 "text_": ['a', 'b'],
                 "time_": [datetime.time(0, 11, 59), datetime.time(13)],
-                "timestamp_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
+                "timestamp1_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
+                "timestamp2_": [
+                    np.datetime64("2010-01-01T01:01:01.001001001"),
+                    np.datetime64("2011-01-01T01:01:01.001001001"),
+                ],
                 "date_": [
                     datetime.date(2016, 1, 1),
                     datetime.date(2017, 1, 1),
@@ -1260,7 +1290,8 @@ def test_load_table_creates(self, con):
                 'varchar_',
                 'text_',
                 'time_',
-                'timestamp_',
+                'timestamp1_',
+                'timestamp2_',
                 'date_',
             ],
         )
@@ -1313,17 +1344,19 @@ def test_array_in_result_set(self, con):
         con.execute("DROP TABLE IF EXISTS test_lists;")
         con.execute(
             "CREATE TABLE IF NOT EXISTS test_lists \
-            (col1 TEXT, col2 TIMESTAMP[]);"
+            (col1 TEXT, col2 TIMESTAMP[], col3 TIMESTAMP(9));"
         )
 
         row = [
             (
                 "row1",
                 "{2019-03-02 00:00:00,2019-03-02 00:00:00,2019-03-02 00:00:00}",  # noqa
+                "2010-01-01T01:01:01.001001001",
             ),
             (
                 "row2",
                 "{2019-03-02 00:00:00,2019-03-02 00:00:00,2019-03-02 00:00:00}",  # noqa
+                "2011-01-01T01:01:01.001001001",
             ),
         ]
 
@@ -1338,6 +1371,7 @@ def test_array_in_result_set(self, con):
                     datetime.datetime(2019, 3, 2, 0, 0),
                     datetime.datetime(2019, 3, 2, 0, 0),
                 ],
+                np.datetime64("2010-01-01T01:01:01.001001001"),
             ),
             (
                 'row2',
@@ -1346,9 +1380,9 @@ def test_select_null(self, con):
                     datetime.datetime(2019, 3, 2, 0, 0),
                     datetime.datetime(2019, 3, 2, 0, 0),
                 ],
+                np.datetime64("2011-01-01T01:01:01.001001001"),
             ),
         ]
 
         assert ans == expected
 
         # date
86 changes: 81 additions & 5 deletions tests/test_loaders.py
@@ -47,11 +47,17 @@ def get_col_types(col_properties: dict):
 
 def get_expected(data, col_properties):
     expected = []
-    _map_col_types = {'INT': 'int_col', 'DOUBLE': 'real_col', 'STR': 'str_col'}
+    _map_col_types = {
+        'INT': 'int_col',
+        'DOUBLE': 'real_col',
+        'STR': 'str_col',
+        'TIMESTAMP': 'int_col',
+    }
     _map_col_types.update(
         {k: 'str_col' for k in _pandas_loaders.GEO_TYPE_NAMES}
    )
    isnull = data.isnull()
 
    for prop in col_properties:
        nulls = isnull[prop['name']].tolist()
        if prop['is_array']:
@@ -75,6 +81,15 @@ def get_expected(data, col_properties):
                 nulls=nulls,
             )
         else:
+            if prop['type'] == 'TIMESTAMP':
+                # convert datetime to epoch
+                if data[prop['name']].dt.nanosecond.sum():
+                    data[prop['name']] = data[prop['name']].astype(int)
+                else:
+                    data[prop['name']] = (
+                        data[prop['name']].astype(int) // 10 ** 9
+                    )
+
             col = TColumn(
                 data=TColumnData(
                     **{_map_col_types[prop['type']]: data[prop['name']]}
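The helper mirrors the column loader's epoch conversion: casting a datetime64[ns] Series to int yields nanoseconds since the epoch, which the non-nanosecond path floor-divides down to seconds. A standalone sketch of that cast (assumed here: pandas' plain int cast resolves to int64 on 64-bit platforms):

import numpy as np
import pandas as pd

col = pd.Series([np.datetime64('2010-01-01 00:00:00.000000001')])

assert col.astype(int)[0] == 1262304000 * 10 ** 9 + 1  # ns since epoch
assert (col.astype(int) // 10 ** 9)[0] == 1262304000   # whole seconds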
@@ -87,19 +102,51 @@ def get_expected(data, col_properties):
 
 class TestLoaders:
     def test_build_input_rows(self):
-        data = [(1, 'a'), (2, 'b')]
+        dt_microsecond_format = '%Y-%m-%d %H:%M:%S.%f'
+
+        def get_dt_nanosecond(v):
+            return np.datetime64('201{}-01-01 01:01:01.001001001'.format(v))
+
+        def get_dt_microsecond(v):
+            return datetime.datetime.strptime(
+                '201{}-01-01 01:01:01.001001'.format(v), dt_microsecond_format
+            )
+
+        data = [
+            (1, 'a', get_dt_nanosecond(1), get_dt_microsecond(1)),
+            (2, 'b', get_dt_nanosecond(2), get_dt_microsecond(2)),
+        ]
         result = _build_input_rows(data)
+        # breakpoint
         expected = [
             TStringRow(
                 cols=[
                     TStringValue(str_val='1', is_null=None),
                     TStringValue(str_val='a', is_null=None),
+                    TStringValue(
+                        str_val=get_dt_nanosecond(1).astype(str), is_null=None
+                    ),
+                    TStringValue(
+                        str_val=get_dt_microsecond(1).strftime(
+                            dt_microsecond_format
+                        ),
+                        is_null=None,
+                    ),
                 ]
             ),
             TStringRow(
                 cols=[
                     TStringValue(str_val='2', is_null=None),
                     TStringValue(str_val='b', is_null=None),
+                    TStringValue(
+                        str_val=get_dt_nanosecond(2).astype(str), is_null=None
+                    ),
+                    TStringValue(
+                        str_val=get_dt_microsecond(2).strftime(
+                            dt_microsecond_format
+                        ),
+                        is_null=None,
+                    ),
                 ]
             ),
         ]
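Row-wise loading serializes every value to a string, which is why the expected TStringValues differ between the two new columns: numpy's text form keeps all nine fractional digits, while strftime's %f stops at microseconds. A standalone illustration:

import datetime
import numpy as np

dt_ns = np.datetime64('2011-01-01 01:01:01.001001001')
dt_us = datetime.datetime(2011, 1, 1, 1, 1, 1, 1001)

# astype(str) on datetime64 keeps nanosecond resolution...
assert dt_ns.astype(str) == '2011-01-01T01:01:01.001001001'
# ...while %f on datetime.datetime is capped at six digits.
assert dt_us.strftime('%Y-%m-%d %H:%M:%S.%f') == '2011-01-01 01:01:01.001001'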
@@ -141,12 +188,33 @@ def test_build_input_rows_with_array(self):
                     'a': [[1, 1], [2, 2], [3, 3]],
                     'b': [[1.1, 1.1], [2.2, 2.2], [3.3, 3.3]],
                     'c': [1, 2, 3],
+                    'd': [
+                        np.datetime64('2010-01-01 01:01:01.001001001'),
+                        np.datetime64('2011-01-01 01:01:01.001001001'),
+                        np.datetime64('2012-01-01 01:01:01.001001001'),
+                    ],
+                    'e': [
+                        datetime.datetime.strptime(
+                            '2010-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                        datetime.datetime.strptime(
+                            '2011-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                        datetime.datetime.strptime(
+                            '2012-01-01 01:01:01.001001',
+                            '%Y-%m-%d %H:%M:%S.%f',
+                        ),
+                    ],
                 }
             ),
             [
                 {'name': 'a', 'type': 'INT', 'is_array': True},
                 {'name': 'b', 'type': 'DOUBLE', 'is_array': True},
                 {'name': 'c', 'type': 'INT', 'is_array': False},
+                {'name': 'd', 'type': 'TIMESTAMP', 'is_array': False},
+                {'name': 'e', 'type': 'TIMESTAMP', 'is_array': False},
             ],
             id='mult-cols-mix-array-not-null',
         ),
Expand Down Expand Up @@ -478,7 +546,11 @@ def test_build_row_desc(self):
'varchar_': ['a', 'b'],
'text_': ['a', 'b'],
'time_': [datetime.time(0, 11, 59), datetime.time(13)],
'timestamp_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
'timestamp1_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
'timestamp2_': [
np.datetime64('2016-01-01 01:01:01.001001001'),
np.datetime64('2017-01-01 01:01:01.001001001'),
],
'date_': [
datetime.date(2016, 1, 1),
datetime.date(2017, 1, 1),
@@ -494,7 +566,8 @@ def test_build_row_desc(self):
                 'varchar_',
                 'text_',
                 'time_',
-                'timestamp_',
+                'timestamp1_',
+                'timestamp2_',
                 'date_',
             ],
         )
@@ -525,7 +598,10 @@ def test_build_row_desc(self):
                 col_name='text_', col_type=TTypeInfo(type=6, encoding=4)
             ),
             TColumnType(col_name='time_', col_type=TTypeInfo(type=7)),
-            TColumnType(col_name='timestamp_', col_type=TTypeInfo(type=8)),
+            TColumnType(col_name='timestamp1_', col_type=TTypeInfo(type=8)),
+            TColumnType(
+                col_name='timestamp2_', col_type=TTypeInfo(type=8, precision=9)
+            ),
             TColumnType(col_name='date_', col_type=TTypeInfo(type=9)),
 

