## <strong> 12. 시계열 데이터 생성 및 조작 </strong>

### <strong> Python 내장 모듈을 이용한 Datetime 생성 </strong>
---

#### ```datetime```, ```dateutil```: 시계열 처리를 위한 Python 모듈 (```datetime```은 표준 라이브러리, ```dateutil```은 별도 설치가 필요한 서드파티 패키지)

In [1]:
from datetime import datetime

# Create a datetime object for 7 November 2023. No time fields are passed,
# and the echoed value shows them as 0 (midnight).
t = datetime(year=2023, month=11, day=7)
t

datetime.datetime(2023, 11, 7, 0, 0)

In [2]:
# Attribute access on a datetime: read the year component of t.
t.year

2023

In [3]:
# Interpret a date by parsing a free-form English string with dateutil's
# parser; the echoed result is a datetime.datetime.
from dateutil import parser

date = parser.parse("7th of November, 2023")
date

datetime.datetime(2023, 11, 7, 0, 0)

#### ```strftime()```: datetime의 속성을 문자열 형태로 출력

In [4]:
# [+] Format the date as its full weekday name ('%A').
date.strftime('%A')

'Tuesday'

In [6]:
# [+] Format the date as its full month name ('%B').
date.strftime('%B') 

'November'

### <strong> NumPy를 이용한 Datetime 배열 생성</strong>
---
```datetime64```: ```NumPy```에서 지원하는 datetime 클래스

In [7]:
import numpy as np

# Build a two-element datetime64 array from ISO date strings. No unit is
# given, so NumPy picks day precision ([D]) from the strings themselves.
iso_days = ['2023-11-07', '2023-11-08']
dates = np.array(iso_days, dtype='datetime64')
dates

array(['2023-11-07', '2023-11-08'], dtype='datetime64[D]')

In [8]:
# Create a single datetime64 scalar by calling np.datetime64() on an ISO
# date string. This rebinds `date`, which previously held the dateutil result.
date = np.datetime64('2023-11-07')
date

numpy.datetime64('2023-11-07')

In [9]:
# Vectorized arithmetic: adding the integers 0..6 to the day-precision scalar
# produces an array of seven consecutive dates.
date + np.arange(7) # [0, 1, .. 6]

array(['2023-11-07', '2023-11-08', '2023-11-09', '2023-11-10',
       '2023-11-11', '2023-11-12', '2023-11-13'], dtype='datetime64[D]')

In [10]:
# A string carrying hours and minutes yields a minute-precision datetime64.
t = np.datetime64('2023-11-07 09:30')
t

numpy.datetime64('2023-11-07T09:30')

In [11]:
# With no unit argument the precision comes from the input string; this
# string has fractional seconds and the echoed value is shown to the
# millisecond (.990).
t = np.datetime64('2023-11-07 09:44:59.99')
t

numpy.datetime64('2023-11-07T09:44:59.990')

In [12]:
# Force nanosecond precision by passing the unit code 'ns' explicitly.
t = np.datetime64('2023-11-07 09:44:59.99', 'ns')
t

numpy.datetime64('2023-11-07T09:44:59.990000000')

In [13]:
# datetime64 unit codes: Y, M, D, h, ... Requesting 'Y' keeps only the year,
# so the echoed value is just 2022 even though the string has a full time.
t = np.datetime64('2022-10-18 11:39:10.20', 'Y')
t

numpy.datetime64('2022')

### <strong> Pandas에서의 시계열 처리</strong>
---

In [14]:
import pandas as pd

# Create a pandas Timestamp by parsing a free-form date string with
# pd.to_datetime(). This rebinds `date` again.
date = pd.to_datetime("7th of November, 2023")
date

Timestamp('2023-11-07 00:00:00')

In [15]:
# Print the weekday name; the Timestamp accepts the same strftime() call used
# on the datetime earlier.
date.strftime('%A')

'Tuesday'

In [16]:
# NumPy-style array arithmetic on a Timestamp. No unit is passed to
# to_timedelta(), and the echoed index advances by one nanosecond per step,
# not one day.
date + pd.to_timedelta(np.arange(7))

DatetimeIndex([          '2023-11-07 00:00:00',
               '2023-11-07 00:00:00.000000001',
               '2023-11-07 00:00:00.000000002',
               '2023-11-07 00:00:00.000000003',
               '2023-11-07 00:00:00.000000004',
               '2023-11-07 00:00:00.000000005',
               '2023-11-07 00:00:00.000000006'],
              dtype='datetime64[ns]', freq=None)

In [17]:
# Same arithmetic with unit='D': the integers are now day offsets, giving
# seven consecutive dates.
date + pd.to_timedelta(np.arange(7), unit='D')

DatetimeIndex(['2023-11-07', '2023-11-08', '2023-11-09', '2023-11-10',
               '2023-11-11', '2023-11-12', '2023-11-13'],
              dtype='datetime64[ns]', freq=None)

#### <strong> 시계열 생성 </strong>

In [19]:
# Build a DatetimeIndex from seven consecutive ISO date strings
# (27 Oct 2022 through 2 Nov 2022).
day_labels = [
    '2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
    '2022-10-31', '2022-11-01', '2022-11-02',
]
ind = pd.DatetimeIndex(day_labels)
ind

DatetimeIndex(['2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
               '2022-10-31', '2022-11-01', '2022-11-02'],
              dtype='datetime64[ns]', freq=None)

In [20]:
# [+] Use the DatetimeIndex as the index of a Series, pairing the values
# 1..7 with the seven dates in `ind`.
ser = pd.Series([1, 2, 3, 4, 5, 6, 7], index=ind)
ser

2022-10-27    1
2022-10-28    2
2022-10-29    3
2022-10-30    4
2022-10-31    5
2022-11-01    6
2022-11-02    7
dtype: int64

In [21]:
# Inspect the Series index: it is the DatetimeIndex passed at construction.
ser.index

DatetimeIndex(['2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
               '2022-10-31', '2022-11-01', '2022-11-02'],
              dtype='datetime64[ns]', freq=None)

In [22]:
# [+] Indexing: look up a single value using a date string as the label.
ser['2022-10-28']

2

In [23]:
# [+] Slicing with date-string labels; the echoed result contains both
# endpoints (2022-10-28 and 2022-10-30).
ser['2022-10-28':'2022-10-30']

2022-10-28    2
2022-10-29    3
2022-10-30    4
dtype: int64

In [24]:
# [+] Select data by integer position: rows 2, 3 and 4 (the end position 5
# is excluded). .iloc states the positional intent explicitly instead of
# relying on how plain [] interprets an integer slice on a Series whose
# index holds dates.
ser.iloc[2:5]

2022-10-29    3
2022-10-30    4
2022-10-31    5
dtype: int64

#### <strong> 시계열 관련 Pandas의 데이터 구조 </strong>

In [25]:
# Timestamp and DatetimeIndex: pd.to_datetime() is given a list that mixes a
# datetime object with date strings in several different formats, and the
# printed result is a single DatetimeIndex. This rebinds `dates`.
dates = pd.to_datetime([datetime(2015, 7, 3),
                      "4th of July, 2015",
                      '2015-Jul-6',
                      '07-07-2015',
                      '20150708'])
print(dates)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)


In [26]:
# to_period(): convert a DatetimeIndex into a PeriodIndex.
# NOTE(review): only the last expression of a cell is echoed, so the daily
# ('D') conversion on the first line is evaluated but never shown; the
# recorded output is the weekly ('W') result.
dates.to_period('D')
dates.to_period('W')

PeriodIndex(['2015-06-29/2015-07-05', '2015-06-29/2015-07-05',
             '2015-07-06/2015-07-12', '2015-07-06/2015-07-12',
             '2015-07-06/2015-07-12'],
            dtype='period[W-SUN]')

In [27]:
# [+] Create a TimedeltaIndex: the integers 1, 2, 3 are read as day counts
# because unit='D' is given.
time_delta = pd.to_timedelta([1, 2, 3], unit='D')
time_delta

TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq=None)

#### <strong> 정규 시퀀스 </strong>

In [28]:
# [+] Generate a regular sequence with pd.date_range(): seven timestamps at
# daily ('D') frequency starting on 2022-10-27.
pd.date_range('2022-10-27', periods=7, freq='D')

DatetimeIndex(['2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
               '2022-10-31', '2022-11-01', '2022-11-02'],
              dtype='datetime64[ns]', freq='D')

In [29]:
# [+] The Period counterpart: three consecutive monthly periods starting at
# 2022-10.
pd.period_range('2022-10', periods=3, freq='M')

PeriodIndex(['2022-10', '2022-11', '2022-12'], dtype='period[M]')

In [30]:
# Change the frequency: twelve timestamps spaced one hour apart.
# NOTE(review): recent pandas releases reportedly prefer the lowercase alias
# 'h' for hourly frequency -- confirm against the installed version before
# changing the code.
pd.date_range('2022-10-27', periods=12, freq='H')

DatetimeIndex(['2022-10-27 00:00:00', '2022-10-27 01:00:00',
               '2022-10-27 02:00:00', '2022-10-27 03:00:00',
               '2022-10-27 04:00:00', '2022-10-27 05:00:00',
               '2022-10-27 06:00:00', '2022-10-27 07:00:00',
               '2022-10-27 08:00:00', '2022-10-27 09:00:00',
               '2022-10-27 10:00:00', '2022-10-27 11:00:00'],
              dtype='datetime64[ns]', freq='H')

In [31]:
# pd.period_range(): eight consecutive monthly periods starting at 2015-07;
# the sequence rolls over into 2016.
pd.period_range('2015-07', periods=8, freq='M')

PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
             '2016-01', '2016-02'],
            dtype='period[M]')

In [32]:
# Timestamp vs. Period: the Period built from a date string spans that
# whole day, while the Timestamp is a single instant within it.
period = pd.Period('2022-10-27')
timestamp = pd.Timestamp('2022-10-27 09:30')

# Check that the instant lies strictly inside the span, then show the span's
# first and last representable moments.
span_start, span_end = period.start_time, period.end_time
print(span_start < timestamp < span_end)
print(span_start)
print(span_end)

True
2022-10-27 00:00:00
2022-10-27 23:59:59.999999999


In [33]:
# pd.timedelta_range(): ten durations starting at 0 and growing by one hour
# per step.
pd.timedelta_range(0, periods=10, freq='H')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
                '0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
                '0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
                '0 days 09:00:00'],
               dtype='timedelta64[ns]', freq='H')

| Code   | Description         | Code   | Description          |
|--------|---------------------|--------|----------------------|
| ``D``  | Calendar day        | ``B``  | Business day         |
| ``W``  | Weekly              |   -    |                      |
| ``M``  | Month end           | ``BM`` | Business month end   |
| ``Q``  | Quarter end         | ``BQ`` | Business quarter end |
| ``A``  | Year end            | ``BA`` | Business year end    |
| ``H``  | Hours               | ``BH`` | Business hours       |
| ``T``  | Minutes             |   -    |                      |
| ``S``  | Seconds             |   -    |                      |
| ``L``  | Milliseconds        |   -    |                      |
| ``U``  | Microseconds        |   -    |                      |
| ``N``  | Nanoseconds         |   -    |                      |

#### <strong> 빈도 및 오프셋 </strong>

In [34]:
# Frequency codes: 'H' yields ten timestamps one hour apart, starting at
# midnight on 2022-11-01.
pd.date_range('2022-11-01', periods=10, freq='H')

DatetimeIndex(['2022-11-01 00:00:00', '2022-11-01 01:00:00',
               '2022-11-01 02:00:00', '2022-11-01 03:00:00',
               '2022-11-01 04:00:00', '2022-11-01 05:00:00',
               '2022-11-01 06:00:00', '2022-11-01 07:00:00',
               '2022-11-01 08:00:00', '2022-11-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [35]:
# Frequency 'M' = month end: although the start is the 1st, every generated
# timestamp falls on the last calendar day of its month.
(pd.date_range('2022-11-01', periods=10, freq="M"))

DatetimeIndex(['2022-11-30', '2022-12-31', '2023-01-31', '2023-02-28',
               '2023-03-31', '2023-04-30', '2023-05-31', '2023-06-30',
               '2023-07-31', '2023-08-31'],
              dtype='datetime64[ns]', freq='M')

In [36]:
# Frequency 'BM' = business month end. Compared with the 'M' output, some
# dates move earlier (e.g. 2022-12-30 instead of 2022-12-31).
pd.date_range('2022-11-01', periods=10, freq="BM")

DatetimeIndex(['2022-11-30', '2022-12-30', '2023-01-31', '2023-02-28',
               '2023-03-31', '2023-04-28', '2023-05-31', '2023-06-30',
               '2023-07-31', '2023-08-31'],
              dtype='datetime64[ns]', freq='BM')

In [37]:
# Suffix 'S': anchor the sequence at the start of each period; 'MS' yields
# the first day of every month.
pd.date_range('2022-11-01', periods=10, freq='MS')

DatetimeIndex(['2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01',
               '2023-03-01', '2023-04-01', '2023-05-01', '2023-06-01',
               '2023-07-01', '2023-08-01'],
              dtype='datetime64[ns]', freq='MS')

In [38]:
# Combining frequency codes with multipliers: '2H30T' means 2 hours plus
# 30 minutes, and the resulting index reports its step as 150 minutes.
pd.date_range('2022-11-01', periods=9, freq='2H30T')

DatetimeIndex(['2022-11-01 00:00:00', '2022-11-01 02:30:00',
               '2022-11-01 05:00:00', '2022-11-01 07:30:00',
               '2022-11-01 10:00:00', '2022-11-01 12:30:00',
               '2022-11-01 15:00:00', '2022-11-01 17:30:00',
               '2022-11-01 20:00:00'],
              dtype='datetime64[ns]', freq='150T')

In [39]:
# Offsets: add a DateOffset of 40 hours to a Timestamp. The echoed result
# moves from 2023-11-07 09:30 to 2023-11-09 01:30.
from pandas.tseries.offsets import DateOffset
ts = pd.Timestamp('2023-11-07 09:30:00')
ts + DateOffset(hours=40)

Timestamp('2023-11-09 01:30:00')

In [40]:
# Offsets (minus): subtract a DateOffset made of several components (years,
# months, days, hours, minutes, seconds) from ts in a single operation.
# The seconds value is written as a plain 0 rather than the unusual
# leading-zero literal 00; the value is unchanged.
ts - DateOffset(years=73, months=4, days=12, hours=5, minutes=10, seconds=0)

Timestamp('1950-06-25 04:20:00')