From a2c7f8814c1ca0c8929531bc9ad641e047aa9aa3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 9 Dec 2022 08:19:18 -0500 Subject: [PATCH 1/7] PERF: Speed up Period construction --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslibs/period.pyx | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d8609737b8c7a..4f0b46909b8a3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -636,6 +636,7 @@ Performance improvements - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) +- Performance improvement in :class:`Period` constructor when constructing from a string or integer (:issue:`38312`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cc9c2d631bcd9..b86435b7a0c31 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2574,16 +2574,15 @@ class Period(_Period): value = str(value) value = value.upper() dt, reso = parse_time_string(value, freq) - try: - ts = Timestamp(value) - except ValueError: - nanosecond = 0 - else: - nanosecond = ts.nanosecond - if nanosecond != 0: - reso = "nanosecond" + if reso == "nanosecond": + nanosecond = dt.nanosecond + if dt is NaT: ordinal = NPY_NAT + # Doesn't matter what this is, we just need to have it + # so that we don't error in block below. We get converted + # to NaT later on anyways + reso = "nanosecond" if freq is None: try: From 4c351ad7b5d764947f463874112a1c76a986dec7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 9 Dec 2022 20:36:43 -0500 Subject: [PATCH 2/7] Try to fix CI? --- pandas/_libs/tslibs/parsing.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 44d06df53e0be..88540a96f4b85 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -421,6 +421,10 @@ cdef parse_datetime_string_with_reso( parsed = datetime( dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us ) + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # No picoseconds, so no nanoseconds. + # Have to have seen microseconds, in order to have "seen" nanoseconds + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_us reso = { NPY_DATETIMEUNIT.NPY_FR_Y: "year", NPY_DATETIMEUNIT.NPY_FR_M: "month", From a2a31c6ca477676c3bff6234dffbdfff4a87c545 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 12 Dec 2022 12:08:25 -0500 Subject: [PATCH 3/7] Avoid hackiness --- pandas/_libs/tslibs/period.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index de3ead089c9fe..6bd61e657d7a7 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2579,12 +2579,9 @@ class Period(_Period): if dt is NaT: ordinal = NPY_NAT - # Doesn't matter what this is, we just need to have it - # so that we don't error in block below. We get converted - # to NaT later on anyways - reso = "nanosecond" - if freq is None: + if freq is None and ordinal != NPY_NAT: + # Skip NaT, since it doesn't have a resolution try: freq = attrname_to_abbrevs[reso] except KeyError: From 5df970cb5416a0388d66450d6cd1708fc37a3bc9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 19 Dec 2022 20:25:30 -0500 Subject: [PATCH 4/7] debug CI --- pandas/_libs/tslibs/parsing.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 88540a96f4b85..af6806d1c3cb8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -421,10 +421,10 @@ cdef parse_datetime_string_with_reso( parsed = datetime( dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us ) - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # No picoseconds, so no nanoseconds. - # Have to have seen microseconds, in order to have "seen" nanoseconds - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_us + # if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # # No picoseconds, so no nanoseconds. + # # Have to have seen microseconds, in order to have "seen" nanoseconds + # out_bestunit = NPY_DATETIMEUNIT.NPY_FR_us reso = { NPY_DATETIMEUNIT.NPY_FR_Y: "year", NPY_DATETIMEUNIT.NPY_FR_M: "month", From 29c3e80bf2046e08c2d81c5c4e300855dbb81417 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 20 Dec 2022 19:28:52 -0500 Subject: [PATCH 5/7] Modify condition --- pandas/_libs/tslibs/parsing.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 61a93d4757893..3876623c103f2 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -386,7 +386,7 @@ cdef parse_datetime_string_with_reso( &out_tzoffset, False ) if not string_to_dts_failed: - if dts.ps != 0 or out_local: + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns or out_local: # TODO: the not-out_local case we could do without Timestamp; # avoid circular import from pandas import Timestamp @@ -395,10 +395,7 @@ cdef parse_datetime_string_with_reso( parsed = datetime( dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us ) - # if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # # No picoseconds, so no nanoseconds. - # # Have to have seen microseconds, in order to have "seen" nanoseconds - # out_bestunit = NPY_DATETIMEUNIT.NPY_FR_us + reso = { NPY_DATETIMEUNIT.NPY_FR_Y: "year", NPY_DATETIMEUNIT.NPY_FR_M: "month", From f8a85a7541dca019c87ea6df4ac9c4ee438ee95c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 16 Jan 2023 12:04:01 -0800 Subject: [PATCH 6/7] revert whitespace --- pandas/_libs/tslibs/period.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 642585dccc573..333728ad1198d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2589,6 +2589,7 @@ class Period(_Period): value = str(value) value = value.upper() + freqstr = freq.rule_code if freq is not None else None dt, reso = parse_datetime_string_with_reso(value, freqstr) if reso == "nanosecond": From c18b3b38a0fcc8ab0ea0366cff8543dc663550fa Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 17 Jan 2023 15:08:58 -0800 Subject: [PATCH 7/7] fix tests --- pandas/_libs/tslibs/parsing.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 9e593ec64f7d2..71c7a0ec86b0b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,7 +377,11 @@ def parse_datetime_string_with_reso( &out_tzoffset, False ) if not string_to_dts_failed: - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns or out_local: + timestamp_units = {NPY_DATETIMEUNIT.NPY_FR_ns, + NPY_DATETIMEUNIT.NPY_FR_ps, + NPY_DATETIMEUNIT.NPY_FR_fs, + NPY_DATETIMEUNIT.NPY_FR_as} + if out_bestunit in timestamp_units or out_local: # TODO: the not-out_local case we could do without Timestamp; # avoid circular import from pandas import Timestamp @@ -389,9 +393,7 @@ def parse_datetime_string_with_reso( # Match Timestamp and drop picoseconds, femtoseconds, attoseconds # The new resolution will just be nano # GH 50417 - if out_bestunit in {NPY_DATETIMEUNIT.NPY_FR_ps, - NPY_DATETIMEUNIT.NPY_FR_fs, - NPY_DATETIMEUNIT.NPY_FR_as}: + if out_bestunit in timestamp_units: out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns reso = { NPY_DATETIMEUNIT.NPY_FR_Y: "year",