From d2d179770da39e67aceb7a5c0e11b5a013f5f33b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Jan 2023 11:25:08 -0800 Subject: [PATCH] PERF: Speed up Period construction (#50149) * PERF: Speed up Period construction * Try to fix CI? * Avoid hackiness * debug CI * Modify condition * revert whitespace * fix tests --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslibs/parsing.pyx | 10 ++++++---- pandas/_libs/tslibs/period.pyx | 13 ++++--------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ba3316df1fb9e..a8a7e89e789f5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -868,6 +868,7 @@ Performance improvements - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) +- Performance improvement in :class:`Period` constructor when constructing from a string or integer (:issue:`38312`) - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 1b81d53c09e7e..98667436915f3 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,7 +377,11 @@ def parse_datetime_string_with_reso( &out_tzoffset, False ) if not string_to_dts_failed: - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns or out_local: + timestamp_units = {NPY_DATETIMEUNIT.NPY_FR_ns, + NPY_DATETIMEUNIT.NPY_FR_ps, + NPY_DATETIMEUNIT.NPY_FR_fs, + NPY_DATETIMEUNIT.NPY_FR_as} + if out_bestunit in timestamp_units or out_local: # TODO: the not-out_local case we could do without Timestamp; # avoid circular import from pandas import Timestamp @@ -389,9 +393,7 @@ def parse_datetime_string_with_reso( # Match Timestamp and drop picoseconds, femtoseconds, attoseconds # The new resolution will just be nano # GH 50417 - if out_bestunit in {NPY_DATETIMEUNIT.NPY_FR_ps, - NPY_DATETIMEUNIT.NPY_FR_fs, - NPY_DATETIMEUNIT.NPY_FR_as}: + if out_bestunit in timestamp_units: out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns reso = { NPY_DATETIMEUNIT.NPY_FR_Y: "year", diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 64bd76adb0ae2..333728ad1198d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2592,18 +2592,13 @@ class Period(_Period): freqstr = freq.rule_code if freq is not None else None dt, reso = parse_datetime_string_with_reso(value, freqstr) - try: - ts = Timestamp(value) - except ValueError: - nanosecond = 0 - else: - nanosecond = ts.nanosecond - if nanosecond != 0: - reso = "nanosecond" + if reso == "nanosecond": + nanosecond = dt.nanosecond if dt is NaT: ordinal = NPY_NAT - if freq is None: + if freq is None and ordinal != NPY_NAT: + # Skip NaT, since it doesn't have a resolution try: freq = attrname_to_abbrevs[reso] except KeyError: