Fix read_csv when parse_dates and index_col are the same #548

Merged 2 commits on Apr 14, 2019

16 changes: 14 additions & 2 deletions modin/engines/ray/generic/io.py
@@ -228,6 +228,19 @@ def _read_csv_from_file_pandas_on_ray(cls, filepath, kwargs={}):
        Returns:
Collaborator:

Should we use this opportunity to move `_read_csv_from_file_pandas_on_ray` out of generic/io.py and into the pandas-on-Ray-specific io file?

Collaborator Author:

It can be renamed, but this method defines the general logic for how the arguments are handled and how the index is built. I think it should stay here, because #502 also uses it for the Arrow backend.

Collaborator:

Ah, ok. Let's rename the function in #502, then?

            DataFrame or Series constructed from CSV file.
        """
+        names = kwargs.get("names", None)
+        index_col = kwargs.get("index_col", None)
+        if names is None:
+            # For the sake of the empty df, we assume no `index_col` to get the
+            # correct column names before we build the index. Because we pass
+            # `names` in, this step has to happen without removing the
+            # `index_col`; otherwise it will not be assigned correctly.
+            kwargs["index_col"] = None
+            names = pandas.read_csv(
+                open_file(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
+            ).columns
+            kwargs["index_col"] = index_col
+
         empty_pd_df = pandas.read_csv(
             open_file(filepath, "rb"), **dict(kwargs, nrows=0, skipfooter=0)
         )
@@ -238,7 +251,7 @@ def _read_csv_from_file_pandas_on_ray(cls, filepath, kwargs={}):
         partition_kwargs = dict(
             kwargs,
             header=None,
-            names=column_names,
+            names=names,
             skipfooter=0,
             skiprows=None,
             parse_dates=parse_dates,
@@ -288,7 +301,6 @@ def _read_csv_from_file_pandas_on_ray(cls, filepath, kwargs={}):
             )
             index_ids.append(partition_id[-1])

-        index_col = kwargs.get("index_col", None)
         if index_col is None:
             new_index = pandas.RangeIndex(sum(ray.get(index_ids)))
         else:
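The inline comment in the first hunk is the crux of the change: when `index_col` is set, pandas consumes that column into the index, so a header-only read would recover one column name too few. A minimal sketch of that pandas behavior (illustrative only, not part of the patch):

import io

import pandas

CSV = "timestamp,symbol,high\n2010-04-01 00:00:00,USD/JPY,93.526\n"

# With index_col set, the index column is consumed into the index and
# does not appear in .columns, so the recovered names are incomplete.
with_index = pandas.read_csv(io.StringIO(CSV), nrows=0, index_col=0)
print(list(with_index.columns))  # ['symbol', 'high']

# Temporarily clearing index_col, as the patch does, keeps every column
# name available before the index is built.
without_index = pandas.read_csv(io.StringIO(CSV), nrows=0, index_col=None)
print(list(without_index.columns))  # ['timestamp', 'symbol', 'high']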
2 changes: 1 addition & 1 deletion modin/engines/ray/pandas_on_ray/io.py
@@ -78,7 +78,7 @@ def _read_csv_with_offset_pandas_on_ray(
     This is used to determine the total length of the DataFrame to build a
     default Index.
     """
-    index_col = kwargs.pop("index_col", None)
+    index_col = kwargs.get("index_col", None)
     bio = open_file(fname, "rb")
     bio.seek(start)
     to_read = header + bio.read(end - start)
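This one-line change swaps a destructive lookup for a non-destructive one. A small sketch of the difference, using a hypothetical kwargs dict:

# Hypothetical kwargs as a partition reader might receive them.
kwargs = {"index_col": "timestamp", "parse_dates": ["timestamp"]}

# pop() removes the key: a later pandas.read_csv(**kwargs) in the same
# function would no longer see index_col and could not set the index.
index_col = kwargs.pop("index_col", None)
assert "index_col" not in kwargs

# get() reads the value but leaves kwargs intact, so index_col and
# parse_dates can still refer to the same column downstream.
kwargs = {"index_col": "timestamp", "parse_dates": ["timestamp"]}
index_col = kwargs.get("index_col", None)
assert kwargs["index_col"] == "timestamp"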
4 changes: 4 additions & 0 deletions modin/pandas/test/data/test_time_parsing.csv
@@ -0,0 +1,4 @@
timestamp,symbol,high,low,open,close,spread,volume
2010-04-01 00:00:00,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049
2010-04-01 00:30:00,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251
2010-04-01 01:00:00,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577
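This three-row fixture is enough to exercise the case in the PR title. A sketch of the call shape that previously misbehaved (using the string form of index_col, equivalent to the positional index_col=0 used in the full test matrix of test_parse_dates_read_csv below):

import modin.pandas as pd

# index_col and parse_dates name the same column -- the combination
# this PR fixes.
df = pd.read_csv(
    "modin/pandas/test/data/test_time_parsing.csv",
    index_col="timestamp",
    parse_dates=["timestamp"],
)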
112 changes: 112 additions & 0 deletions modin/pandas/test/test_io.py
@@ -528,6 +528,118 @@ def test_from_csv(make_csv_file):
    assert modin_df_equals_pandas(modin_df, pandas_df)


def test_parse_dates_read_csv():
    pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    assert modin_df_equals_pandas(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    assert modin_df_equals_pandas(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    assert modin_df_equals_pandas(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    assert modin_df_equals_pandas(modin_df, pandas_df)
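The four blocks above differ only in their read_csv keyword arguments. An equivalent, untested compaction (illustrative only; it reuses this module's pandas, pd, and modin_df_equals_pandas names) could iterate over the argument sets:

def test_parse_dates_read_csv_compact():
    path = "modin/pandas/test/data/test_time_parsing.csv"
    names = ["timestamp", "symbol", "high", "low", "open", "close", "spread", "volume"]
    # One entry per kwargs combination exercised above.
    cases = [
        {},
        {"names": names, "header": 0, "index_col": 0, "encoding": "utf-8"},
        {"names": names, "header": 0, "index_col": 0,
         "parse_dates": ["timestamp"], "encoding": "utf-8"},
        {"names": names, "header": 0, "index_col": 2,
         "parse_dates": ["timestamp"], "encoding": "utf-8"},
    ]
    for kwargs in cases:
        pandas_df = pandas.read_csv(path, **kwargs)
        modin_df = pd.read_csv(path, **kwargs)
        assert modin_df_equals_pandas(modin_df, pandas_df)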


class FakeS3FS:
    def exists(self, path):
        return "s3://bucket/path.csv" == path