Skip to content

Commit

Permalink
Move FrequencyInferer out of libresolution (#21992)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and jreback committed Jul 25, 2018
1 parent 2d0c961 commit e0b81d4
Show file tree
Hide file tree
Showing 3 changed files with 304 additions and 291 deletions.
41 changes: 41 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ cnp.import_array()
cimport util
from util cimport numeric, get_nat

from khash cimport (khiter_t,
kh_destroy_int64, kh_put_int64,
kh_init_int64, kh_int64_t,
kh_resize_int64, kh_get_int64)

import missing

cdef float64_t FP_ERR = 1e-13
Expand Down Expand Up @@ -71,6 +76,42 @@ class NegInfinity(object):
__ge__ = lambda self, other: isinstance(other, NegInfinity)


cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
    """
    Efficiently find the unique first-differences of the given array.

    Parameters
    ----------
    arr : ndarray[int64_t]

    Returns
    -------
    result : ndarray[int64_t]
        result is sorted
    """
    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []

    table = kh_init_int64()
    try:
        kh_resize_int64(table, 10)
        for i in range(n - 1):
            val = arr[i + 1] - arr[i]
            k = kh_get_int64(table, val)
            # a lookup result equal to n_buckets means `val` has not been
            # seen yet, so record it both in the table and in the output
            if k == table.n_buckets:
                kh_put_int64(table, val, &ret)
                uniques.append(val)
    finally:
        # the table is C-allocated; free it even if the loop raised
        # (e.g. MemoryError while growing `uniques`)
        kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result


@cython.wraparound(False)
@cython.boundscheck(False)
def is_lexsorted(list list_of_arrays):
Expand Down
289 changes: 2 additions & 287 deletions pandas/_libs/tslibs/resolution.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# cython: profile=False

cimport cython
from cython cimport Py_ssize_t

import numpy as np
Expand All @@ -10,23 +11,12 @@ cnp.import_array()

from util cimport is_string_object, get_nat

from pandas._libs.khash cimport (khiter_t,
kh_destroy_int64, kh_put_int64,
kh_init_int64, kh_int64_t,
kh_resize_int64, kh_get_int64)

from np_datetime cimport npy_datetimestruct, dt64_to_dtstruct
from frequencies cimport get_freq_code
from timezones cimport (is_utc, is_tzlocal,
maybe_get_tz, get_dst_info)
from fields import build_field_sarray
from conversion import tz_convert
from conversion cimport tz_convert_utc_to_tzlocal
from ccalendar import MONTH_ALIASES, int_to_weekday
from ccalendar cimport get_days_in_month
from timestamps import Timestamp

from pandas._libs.properties import cache_readonly

# ----------------------------------------------------------------------
# Constants
Expand All @@ -41,13 +31,6 @@ cdef int RESO_MIN = 4
cdef int RESO_HR = 5
cdef int RESO_DAY = 6

_ONE_MICRO = <int64_t>1000L
_ONE_MILLI = <int64_t>(_ONE_MICRO * 1000)
_ONE_SECOND = <int64_t>(_ONE_MILLI * 1000)
_ONE_MINUTE = <int64_t>(60 * _ONE_SECOND)
_ONE_HOUR = <int64_t>(60 * _ONE_MINUTE)
_ONE_DAY = <int64_t>(24 * _ONE_HOUR)

# ----------------------------------------------------------------------

cpdef resolution(ndarray[int64_t] stamps, tz=None):
Expand Down Expand Up @@ -331,31 +314,7 @@ class Resolution(object):
# ----------------------------------------------------------------------
# Frequency Inference

cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
    """
    Efficiently find the unique first-differences of the given array.

    Parameters
    ----------
    arr : ndarray[int64_t]

    Returns
    -------
    result : ndarray[int64_t]
        result is sorted
    """
    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []

    table = kh_init_int64()
    try:
        kh_resize_int64(table, 10)
        for i in range(n - 1):
            val = arr[i + 1] - arr[i]
            k = kh_get_int64(table, val)
            # a lookup result equal to n_buckets means `val` has not been
            # seen yet, so record it both in the table and in the output
            if k == table.n_buckets:
                kh_put_int64(table, val, &ret)
                uniques.append(val)
    finally:
        # the table is C-allocated; free it even if the loop raised
        # (e.g. MemoryError while growing `uniques`)
        kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result


cdef object month_position_check(fields, weekdays):
def month_position_check(fields, weekdays):
cdef:
int32_t daysinmonth, y, m, d
bint calendar_end = True
Expand Down Expand Up @@ -397,247 +356,3 @@ cdef object month_position_check(fields, weekdays):
return 'bs'
else:
return None


cdef inline bint _is_multiple(int64_t us, int64_t mult):
    """Return True when ``us`` divides evenly by ``mult``."""
    cdef int64_t remainder = us % mult
    return remainder == 0


cdef inline str _maybe_add_count(str base, int64_t count):
    """
    Prefix ``base`` with ``count`` unless the count is exactly one.

    A count of 1 yields ``base`` unchanged; any other count yields the
    count immediately followed by ``base``.
    """
    if count == 1:
        return base
    return '{count}{base}'.format(count=count, base=base)


cdef class _FrequencyInferer(object):
    """
    Infer a frequency alias string from the spacing of an index's values.

    The index is viewed as int64 values (module constants such as
    ``_ONE_MICRO = 1000`` imply nanosecond units) and the sorted unique
    first-differences of those values drive a cascade of rule checks.
    ``get_freq`` is the entry point; it returns an alias string or None
    when no rule matches.

    Most intermediate results are exposed through ``cache_readonly``
    properties, so the order in which the rules are tried matters only for
    which alias wins, not for what gets computed.
    """
    cdef public:
        # the index object passed to the constructor
        object index
        # int64 view of the index (shifted to local time when tz-aware)
        object values
        # stored from the constructor; not read anywhere in this class
        bint warn
        # True when the index is monotonic increasing or decreasing
        bint is_monotonic
        # NOTE(review): presumably the backing store for cache_readonly --
        # confirm against pandas._libs.properties
        dict _cache

    def __init__(self, index, warn=True):
        """
        Parameters
        ----------
        index : index-like of datetime-like values
            Must expose ``is_monotonic_increasing``,
            ``is_monotonic_decreasing`` and support ``len``; a ``tz``
            attribute is honoured when present.
        warn : bool, default True

        Raises
        ------
        ValueError
            If ``index`` has fewer than 3 elements.
        """
        self.index = index
        self.values = np.asarray(index).view('i8')

        # This moves the values, which are implicitly in UTC, to
        # the timezone so they are in local time
        if hasattr(index, 'tz'):
            if index.tz is not None:
                self.values = tz_convert(self.values, 'UTC', index.tz)

        self.warn = warn

        if len(index) < 3:
            raise ValueError('Need at least 3 dates to infer frequency')

        self.is_monotonic = (self.index.is_monotonic_increasing or
                             self.index.is_monotonic_decreasing)

    @cache_readonly
    def deltas(self):
        """Sorted unique first-differences of the (localized) values."""
        return unique_deltas(self.values)

    @cache_readonly
    def deltas_asi8(self):
        """Sorted unique first-differences of the original ``index.asi8``."""
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self):
        """True when the localized values have exactly one distinct delta."""
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self):
        """True when ``index.asi8`` has exactly one distinct delta."""
        return len(self.deltas_asi8) == 1

    def get_freq(self):
        """
        Infer the frequency alias of the index.

        Returns
        -------
        str or None
            None when the index is not monotonic, has duplicate entries,
            or no rule matches.
        """
        if not self.is_monotonic or not self.index.is_unique:
            return None

        # deltas is sorted, so this is the smallest first-difference
        delta = self.deltas[0]
        if _is_multiple(delta, _ONE_DAY):
            return self._infer_daily_rule()
        else:
            # Business hourly, maybe. 17: one day / 65: one weekend
            if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
                return 'BH'
            # Possibly intraday frequency. Here we use the
            # original .asi8 values as the modified values
            # will not work around DST transitions. See #8772
            elif not self.is_unique_asi8:
                return None
            delta = self.deltas_asi8[0]
            # try the coarsest unit first so the count stays minimal
            if _is_multiple(delta, _ONE_HOUR):
                # Hours
                return _maybe_add_count('H', delta / _ONE_HOUR)
            elif _is_multiple(delta, _ONE_MINUTE):
                # Minutes
                return _maybe_add_count('T', delta / _ONE_MINUTE)
            elif _is_multiple(delta, _ONE_SECOND):
                # Seconds
                return _maybe_add_count('S', delta / _ONE_SECOND)
            elif _is_multiple(delta, _ONE_MILLI):
                # Milliseconds
                return _maybe_add_count('L', delta / _ONE_MILLI)
            elif _is_multiple(delta, _ONE_MICRO):
                # Microseconds
                return _maybe_add_count('U', delta / _ONE_MICRO)
            else:
                # Nanoseconds
                return _maybe_add_count('N', delta)

    @cache_readonly
    def day_deltas(self):
        """The unique deltas expressed in units of ``_ONE_DAY``."""
        return [x / _ONE_DAY for x in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        """The unique deltas expressed in units of ``_ONE_HOUR``."""
        return [x / _ONE_HOUR for x in self.deltas]

    @cache_readonly
    def fields(self):
        """Result of ``build_field_sarray`` on the localized values."""
        # indexed below by 'Y' and 'M'
        return build_field_sarray(self.values)

    @cache_readonly
    def rep_stamp(self):
        """Timestamp built from the first localized value."""
        return Timestamp(self.values[0])

    cdef object month_position_check(self):
        """
        Delegate to the module-level ``month_position_check``.

        The result is used by the annual/quarterly/monthly rules as a key
        among 'cs', 'bs', 'ce', 'be'; any other value (such as None) makes
        those rules return None.
        """
        return month_position_check(self.fields, self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        """Sorted unique differences of the month count ``Y * 12 + M``."""
        nmonths = self.fields['Y'] * 12 + self.fields['M']
        return unique_deltas(nmonths.astype('i8'))

    @cache_readonly
    def ydiffs(self):
        """Sorted unique differences of the year field."""
        return unique_deltas(self.fields['Y'].astype('i8'))

    cdef _infer_daily_rule(self):
        """
        Infer an alias when the smallest delta is a whole number of days.

        Rules are tried in order: annual, quarterly, monthly, a single
        uniform delta (weekly or daily), business daily, week-of-month.
        Falls through (returning None) when none of them matches.
        """
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            # map the first stamp's month modulo 3 to the month number
            # (12, 11 or 10) whose alias anchors the quarterly rule
            mod_dict = {0: 12, 2: 11, 1: 10}
            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
            alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
                                              month=month)
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                day = int_to_weekday[self.rep_stamp.weekday()]
                return _maybe_add_count('W-{day}'.format(day=day), days / 7)
            else:
                return _maybe_add_count('D', days)

        if self._is_business_daily():
            return 'B'

        wom_rule = self._get_wom_rule()
        if wom_rule:
            return wom_rule

    cdef _get_annual_rule(self):
        """
        Return 'AS', 'BAS', 'A' or 'BA' for an annual pattern, else None.

        Requires a single distinct year difference and a single distinct
        month across the index.
        """
        if len(self.ydiffs) > 1:
            return None

        # lazy import to prevent circularity
        # TODO: Avoid non-cython dependency
        from pandas.core.algorithms import unique

        if len(unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check)

    cdef _get_quarterly_rule(self):
        """
        Return 'QS', 'BQS', 'Q' or 'BQ' for a quarterly pattern, else None.

        Requires a single distinct month difference that is a multiple
        of 3.
        """
        if len(self.mdiffs) > 1:
            return None

        if not self.mdiffs[0] % 3 == 0:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'QS', 'bs': 'BQS',
                'ce': 'Q', 'be': 'BQ'}.get(pos_check)

    cdef _get_monthly_rule(self):
        """
        Return 'MS', 'BMS', 'M' or 'BM' for a monthly pattern, else None.

        Requires a single distinct month difference.
        """
        if len(self.mdiffs) > 1:
            return None
        pos_check = self.month_position_check()
        return {'cs': 'MS', 'bs': 'BMS',
                'ce': 'M', 'be': 'BM'}.get(pos_check)

    cdef bint _is_business_daily(self):
        """
        Check whether the index steps by 1 day, with 3-day steps that
        always land on weekday 0.
        """
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        shifts = np.diff(self.index.asi8)
        shifts = np.floor_divide(shifts, _ONE_DAY)
        # weekday reached after each shift, starting from the first stamp
        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
        # every step must either arrive on weekday 0 via a 3-day jump or
        # arrive on weekdays 1-4 via a 1-day step
        # NOTE(review): weekday 0 is presumably Monday -- confirm
        return np.all(((weekdays == 0) & (shifts == 3)) |
                      ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)))

    cdef _get_wom_rule(self):
        """
        Return a 'WOM-{week}{weekday}' alias, or None.

        Requires every stamp to fall on the same weekday and, after
        discarding week-of-month index 4 and above, on exactly one
        week-of-month.
        """
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        # return None

        # lazy import to prevent circularity
        # TODO: Avoid non-cython dependency
        from pandas.core.algorithms import unique

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        # zero-based week of the month: days 1-7 -> 0, 8-14 -> 1, ...
        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)


cdef class _TimedeltaFrequencyInferer(_FrequencyInferer):
    """
    Frequency inferer whose daily rule only recognises a single uniform
    delta, reported as a weekly ('W-...') or daily ('D') alias.
    """

    cdef _infer_daily_rule(self):
        """
        Return a weekly or daily alias when all deltas are equal.

        Returns None when the deltas are not all identical.
        """
        if not self.is_unique:
            return None

        n_days = self.deltas[0] / _ONE_DAY
        if n_days % 7 != 0:
            return _maybe_add_count('D', n_days)

        # Weekly
        weekday_name = int_to_weekday[self.rep_stamp.weekday()]
        weekly_alias = 'W-{weekday}'.format(weekday=weekday_name)
        return _maybe_add_count(weekly_alias, n_days / 7)
Loading

0 comments on commit e0b81d4

Please sign in to comment.