Commit 9f88ab3: resolve conflicts

natmokval committed May 11, 2024
2 parents: 6bbba80 + 4d9ffcf
Showing 26 changed files with 285 additions and 1,636 deletions.
10 changes: 0 additions & 10 deletions asv_bench/benchmarks/io/csv.py
@@ -445,16 +445,6 @@ def setup(self, engine):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)

-    def time_multiple_date(self, engine):
-        read_csv(
-            self.data(self.StringIO_input),
-            engine=engine,
-            sep=",",
-            header=None,
-            names=list(string.digits[:9]),
-            parse_dates=[[1, 2], [1, 3]],
-        )
-
     def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
25 changes: 1 addition & 24 deletions asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
-import numpy as np
-
 try:
-    from pandas._libs.tslibs.parsing import (
-        _does_string_look_like_datetime,
-        concat_date_cols,
-    )
+    from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
 except ImportError:
     # Avoid whole benchmark suite import failure on asv (currently 0.4)
     pass
@@ -20,21 +15,3 @@ def setup(self, value):
     def time_check_datetimes(self, value):
         for obj in self.objects:
             _does_string_look_like_datetime(obj)
-
-
-class ConcatDateCols:
-    params = ([1234567890, "AAAA"], [1, 2])
-    param_names = ["value", "dim"]
-
-    def setup(self, value, dim):
-        count_elem = 10000
-        if dim == 1:
-            self.object = (np.array([value] * count_elem),)
-        if dim == 2:
-            self.object = (
-                np.array([value] * count_elem),
-                np.array([value] * count_elem),
-            )
-
-    def time_check_concat(self, value, dim):
-        concat_date_cols(self.object)
16 changes: 10 additions & 6 deletions asv_bench/benchmarks/series_methods.py
@@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):


 class Map:
-    params = (["dict", "Series", "lambda"], ["object", "category", "int"])
-    param_names = "mapper"
-
-    def setup(self, mapper, dtype):
+    params = (
+        ["dict", "Series", "lambda"],
+        ["object", "category", "int"],
+        [None, "ignore"],
+    )
+    param_names = ["mapper", "dtype", "na_action"]
+
+    def setup(self, mapper, dtype, na_action):
         map_size = 1000
         map_data = Series(map_size - np.arange(map_size), dtype=dtype)

@@ -168,8 +172,8 @@ def setup(self, mapper, dtype):

         self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)

-    def time_map(self, mapper, *args, **kwargs):
-        self.s.map(self.map_data)
+    def time_map(self, mapper, dtype, na_action):
+        self.s.map(self.map_data, na_action=na_action)


 class Clip:
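For context, ``na_action`` controls whether missing values are passed to the mapper or propagated untouched; a minimal sketch of the behaviour this benchmark now covers (not part of the diff):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 3.0])
    s.map(lambda x: x * 10)                      # the mapper also receives NaN
    s.map(lambda x: x * 10, na_action="ignore")  # NaN is propagated, mapper skipped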
81 changes: 1 addition & 80 deletions doc/source/user_guide/io.rst
@@ -262,15 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
   * If ``True`` -> try parsing the index.
   * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
     column.
-  * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
-    column.
-  * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.

   .. note::
      A fast-path exists for iso8601-formatted dates.
-keep_date_col : boolean, default ``False``
-  If ``True`` and parse_dates specifies combining multiple columns then keep the
-  original columns.
 date_format : str or dict of column -> format, default ``None``
   If used in conjunction with ``parse_dates``, will parse dates according to this
   format. For anything more complex,
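For reference, the ``parse_dates`` forms that remain supported look like this (a sketch; ``data.csv`` is a hypothetical file with datetime-like values in columns 1 and 2):

    import pandas as pd

    # parse the index as dates
    pd.read_csv("data.csv", index_col=0, parse_dates=True)

    # parse columns 1 and 2, each as its own datetime column
    pd.read_csv("data.csv", parse_dates=[1, 2])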
@@ -802,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``:

 It is often the case that we may want to store date and time data separately,
 or store various date fields separately. the ``parse_dates`` keyword can be
-used to specify a combination of columns to parse the dates and/or times from.
-
-You can specify a list of column lists to ``parse_dates``, the resulting date
-columns will be prepended to the output (so as to not affect the existing column
-order) and the new column names will be the concatenation of the component
-column names:
-
-.. ipython:: python
-   :okwarning:
-
-   data = (
-       "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
-       "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
-       "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
-       "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
-       "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
-       "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
-   )
-
-   with open("tmp.csv", "w") as fh:
-       fh.write(data)
-
-   df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
-   df
-
-By default the parser removes the component date columns, but you can choose
-to retain them via the ``keep_date_col`` keyword:
-
-.. ipython:: python
-   :okwarning:
-
-   df = pd.read_csv(
-       "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
-   )
-   df
+used to specify columns to parse the dates and/or times.

-Note that if you wish to combine multiple columns into a single date column, a
-nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
-the second and third columns should each be parsed as separate date columns
-while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
-single column.
-
-You can also use a dict to specify custom name columns:
-
-.. ipython:: python
-   :okwarning:
-
-   date_spec = {"nominal": [1, 2], "actual": [1, 3]}
-   df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
-   df
-
-It is important to remember that if multiple text columns are to be parsed into
-a single date column, then a new column is prepended to the data. The ``index_col``
-specification is based off of this new set of columns rather than the original
-data columns:
-
-.. ipython:: python
-   :okwarning:
-
-   date_spec = {"nominal": [1, 2], "actual": [1, 3]}
-   df = pd.read_csv(
-       "tmp.csv", header=None, parse_dates=date_spec, index_col=0
-   )  # index is the nominal column
-   df

 .. note::
    If a column or index contains an unparsable date, the entire column or
@@ -880,10 +811,6 @@ data columns:
    for your data to store datetimes in this format, load times will be
    significantly faster, ~20x has been observed.

-.. deprecated:: 2.2.0
-   Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
-   on the relevant result columns instead.
-

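The replacement the removed deprecation note points to looks roughly like this (a sketch reusing the ``tmp.csv`` layout from the deleted example above; column 1 holds the date, column 2 the time):

    import pandas as pd

    # read the component columns as-is, then combine them explicitly
    df = pd.read_csv("tmp.csv", header=None, skipinitialspace=True)
    df["nominal"] = pd.to_datetime(
        df[1].astype(str) + " " + df[2], format="%Y%m%d %H:%M:%S"
    )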
Date parsing functions
++++++++++++++++++++++
@@ -899,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order:
    then use ``to_datetime``.


-.. ipython:: python
-   :suppress:
-
-   os.remove("tmp.csv")
-
 .. _io.csv.mixed_timezones:

Parsing a CSV with mixed timezones
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -260,8 +260,10 @@ Removal of prior version deprecations/changes
 - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
 - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
 - Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
+- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
 - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
 - Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
+- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` via the ``parse_dates`` keyword (:issue:`55569`)
 - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
 - Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
 - Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
@@ -382,6 +384,8 @@ Datetimelike
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

 Timedelta
@@ -419,7 +423,6 @@ Interval
 Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
-- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
 -

 Missing
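A rough illustration of the ``is_year_start`` fix described above (a sketch with hypothetical data; the exact reproducer in :issue:`58523` may differ):

    import pandas as pd

    # the second point lands on January 1st; before the fix, is_year_start
    # could report False for it on a multi-digit frequency such as "10D"
    dr = pd.date_range("2016-12-22", periods=3, freq="10D")
    dr.is_year_start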
5 changes: 0 additions & 5 deletions pandas/_libs/dtypes.pxd
@@ -34,8 +34,3 @@ ctypedef fused numeric_t:
 ctypedef fused numeric_object_t:
     numeric_t
     object
-
-ctypedef fused uint8_int64_object_t:
-    uint8_t
-    int64_t
-    object
70 changes: 29 additions & 41 deletions pandas/_libs/lib.pyx
@@ -53,6 +53,7 @@ from numpy cimport (
     PyArray_ITER_DATA,
     PyArray_ITER_NEXT,
     PyArray_IterNew,
+    PyArray_SETITEM,
     complex128_t,
     flatiter,
     float64_t,
@@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h":
     PandasParser_IMPORT

 from pandas._libs cimport util
-from pandas._libs.dtypes cimport uint8_int64_object_t
 from pandas._libs.util cimport (
     INT64_MAX,
     INT64_MIN,
@@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value.
 NoDefault = Literal[_NoDefault.no_default]


+@cython.boundscheck(False)
+@cython.wraparound(False)
 def map_infer_mask(
-    ndarray[object] arr,
-    object f,
-    const uint8_t[:] mask,
-    *,
-    bint convert=True,
-    object na_value=no_default,
-    cnp.dtype dtype=np.dtype(object)
+    ndarray arr,
+    object f,
+    const uint8_t[:] mask,
+    *,
+    bint convert=True,
+    object na_value=no_default,
+    cnp.dtype dtype=np.dtype(object)
 ) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
Expand All @@ -2875,53 +2877,39 @@ def map_infer_mask(
     -------
     np.ndarray or an ExtensionArray
     """
-    cdef Py_ssize_t n = len(arr)
-    result = np.empty(n, dtype=dtype)
-
-    _map_infer_mask(
-        result,
-        arr,
-        f,
-        mask,
-        na_value,
-    )
-    if convert:
-        return maybe_convert_objects(result)
-    else:
-        return result
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def _map_infer_mask(
-    ndarray[uint8_int64_object_t] out,
-    ndarray[object] arr,
-    object f,
-    const uint8_t[:] mask,
-    object na_value=no_default,
-) -> None:
-    """
-    Helper for map_infer_mask, split off to use fused types based on the result.
-    """
     cdef:
-        Py_ssize_t i, n
+        Py_ssize_t i
+        Py_ssize_t n = len(arr)
         object val
-
-    n = len(arr)
+        ndarray result = np.empty(n, dtype=dtype)
+
+        flatiter arr_it = PyArray_IterNew(arr)
+        flatiter result_it = PyArray_IterNew(result)

     for i in range(n):
         if mask[i]:
             if na_value is no_default:
-                val = arr[i]
+                val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
             else:
                 val = na_value
         else:
-            val = f(arr[i])
+            val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
+            val = f(val)

         if cnp.PyArray_IsZeroDim(val):
             # unbox 0-dim arrays, GH#690
             val = val.item()

-        out[i] = val
+        PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val)
+
+        PyArray_ITER_NEXT(arr_it)
+        PyArray_ITER_NEXT(result_it)
+
+    if convert:
+        return maybe_convert_objects(result)
+    else:
+        return result


@cython.boundscheck(False)
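For readers skimming the Cython, a pure-Python sketch of what ``map_infer_mask`` computes (the name and simplifications here are ours, not the pandas implementation):

    import numpy as np

    def map_infer_mask_sketch(arr, f, mask, na_value=None):
        # apply f element-wise; masked positions bypass f and either keep
        # the original value or are replaced by na_value
        result = np.empty(len(arr), dtype=object)
        for i in range(len(arr)):
            if mask[i]:
                result[i] = arr[i] if na_value is None else na_value
            else:
                result[i] = f(arr[i])
        return result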
3 changes: 0 additions & 3 deletions pandas/_libs/tslibs/parsing.pyi
@@ -27,7 +27,4 @@ def guess_datetime_format(
     dt_str: str,
     dayfirst: bool | None = ...,
 ) -> str | None: ...
-def concat_date_cols(
-    date_cols: tuple,
-) -> npt.NDArray[np.object_]: ...
 def get_rule_month(source: str) -> str: ...