In [None]:
import pandas as pd

data = pd.read_csv("/content/temperatures.csv")

In [None]:
df = data.copy()

In [None]:
df.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [None]:
df.info() #DataFrame형태임

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    3650 non-null   object 
 1   Temp    3650 non-null   float64
dtypes: float64(1), object(1)
memory usage: 57.2+ KB


## **문자형을 날짜형으로 변경**
- 날짜가 문자형으로 되어있다면 날짜형으로 변경해야 여러가지 날짜 계산을 할 수 있음
- 다양한 형식의 날짜를 한꺼번에 처리할 수 있음("2024-10-13", "2024/10/14", "13-10-2024")
- `pd.to_datetime(컬럼, format='날짜 형식')`

|형식|설명|
|:--|:--|
|%Y|0을 채운 4자리 연도|
|%y|0을 채운 2자리 연도|
|%m|0을 채운 월|
|%d|0을 채운 일|
|%H|0을 채운 시간|
|%M|0을 채운 분|
|%S|0을 채운 초|

In [None]:
pd.to_datetime(df['Date'], format='%Y-%m-%d')

Unnamed: 0,Date
0,1981-01-01
1,1981-01-02
2,1981-01-03
3,1981-01-04
4,1981-01-05
...,...
3645,1990-12-27
3646,1990-12-28
3647,1990-12-29
3648,1990-12-30


In [None]:
df['Date1'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.info() #Dtype이 datetime이 됨

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3650 non-null   object        
 1   Temp    3650 non-null   float64       
 2   Date1   3650 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 85.7+ KB


In [None]:
df.head()

Unnamed: 0,Date,Temp,Date1
0,1981-01-01,20.7,1981-01-01
1,1981-01-02,17.9,1981-01-02
2,1981-01-03,18.8,1981-01-03
3,1981-01-04,14.6,1981-01-04
4,1981-01-05,15.8,1981-01-05


## **날짜를 원하는 형식으로 변경**
- 변환할 날짜의 형식을 내가 명확히 지정해야 함. 주로 하나의 문자열을 변환할 때 사용\
`데이터컬럼.dt.strftime(날짜형식)`


In [None]:
df['Date1'].dt.strftime('%Y-%m')

Unnamed: 0,Date1
0,1981-01
1,1981-01
2,1981-01
3,1981-01
4,1981-01
...,...
3645,1990-12
3646,1990-12
3647,1990-12
3648,1990-12


In [None]:
df['Date1'].dt.strftime('%m-%d %H:%M')

Unnamed: 0,Date1
0,01-01 00:00
1,01-02 00:00
2,01-03 00:00
3,01-04 00:00
4,01-05 00:00
...,...
3645,12-27 00:00
3646,12-28 00:00
3647,12-29 00:00
3648,12-30 00:00


## **dt 연산자**
|연산자|설명|
|:--|:--|
|year|연도|
|month|월|
|day|일|
|dayofweek|요일(0-월요일, 6-일요일)|
|day_name()|요일을 문자열로|

In [None]:
df['year'] = df['Date1'].dt.year
df['month'] = df['Date1'].dt.month
df['day'] = df['Date1'].dt.day
df['dayofweek'] = df['Date1'].dt.dayofweek
df['dayname'] = df['Date1'].dt.day_name()

In [None]:
#하나하나씩 추출
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday


## **날짜 계산**
- day 연산: `pd.Timedelta(day=숫자)`
- month 연산: `DateOffset(months=숫자)`
- year 연산: `DateOffset(years=숫자)`

In [None]:
#day 계산
df['plus day1'] = df['Date1'] + pd.Timedelta(days=1) #days숫자 뒷 값만큼 더함
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06


In [None]:
df['plus day7'] = df['Date1'] + pd.Timedelta(days=7)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12


In [None]:
df['minus day7'] = df['Date1'] - pd.Timedelta(days=7)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7,minus day7
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08,1980-12-25
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09,1980-12-26
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10,1980-12-27
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11,1980-12-28
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12,1980-12-29


In [None]:
# month계산
from pandas.tseries.offsets import DateOffset
# 시간 오프셋 기능을 제공하는 객체를 불러오는 코드
# DateOffset은 특정 시간 단위를 추가하거나 빼는 데 사용

df['plus month1'] = df['Date1'] + DateOffset(months=1)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7,minus day7,plus month1
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08,1980-12-25,1981-02-01
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09,1980-12-26,1981-02-02
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10,1980-12-27,1981-02-03
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11,1980-12-28,1981-02-04
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12,1980-12-29,1981-02-05


In [None]:
df['minus month3'] = df['Date1'] - DateOffset(months=3)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7,minus day7,plus month1,minus month3
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08,1980-12-25,1981-02-01,1980-10-01
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09,1980-12-26,1981-02-02,1980-10-02
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10,1980-12-27,1981-02-03,1980-10-03
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11,1980-12-28,1981-02-04,1980-10-04
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12,1980-12-29,1981-02-05,1980-10-05


In [None]:
df['plus year1'] = df['Date1'] + DateOffset(years=1)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7,minus day7,plus month1,minus month3,plus year1
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08,1980-12-25,1981-02-01,1980-10-01,1982-01-01
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09,1980-12-26,1981-02-02,1980-10-02,1982-01-02
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10,1980-12-27,1981-02-03,1980-10-03,1982-01-03
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11,1980-12-28,1981-02-04,1980-10-04,1982-01-04
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12,1980-12-29,1981-02-05,1980-10-05,1982-01-05


In [None]:
df['minus year3'] = df['Date1'] - DateOffset(years=3)
df.head()

Unnamed: 0,Date,Temp,Date1,year,month,day,dayofweek,dayname,plus day1,plus day7,minus day7,plus month1,minus month3,plus year1,minus year3
0,1981-01-01,20.7,1981-01-01,1981,1,1,3,Thursday,1981-01-02,1981-01-08,1980-12-25,1981-02-01,1980-10-01,1982-01-01,1978-01-01
1,1981-01-02,17.9,1981-01-02,1981,1,2,4,Friday,1981-01-03,1981-01-09,1980-12-26,1981-02-02,1980-10-02,1982-01-02,1978-01-02
2,1981-01-03,18.8,1981-01-03,1981,1,3,5,Saturday,1981-01-04,1981-01-10,1980-12-27,1981-02-03,1980-10-03,1982-01-03,1978-01-03
3,1981-01-04,14.6,1981-01-04,1981,1,4,6,Sunday,1981-01-05,1981-01-11,1980-12-28,1981-02-04,1980-10-04,1982-01-04,1978-01-04
4,1981-01-05,15.8,1981-01-05,1981,1,5,0,Monday,1981-01-06,1981-01-12,1980-12-29,1981-02-05,1980-10-05,1982-01-05,1978-01-05


## **날짜 구간 데이터 만들기**
`pd.date_range(start=시작일자, end=종료일자, periods=기간수, freq=주기)`

|형식|설명|
|:--|:--|
|D|일별|
|W|주별|
|M|월별 말일|
|MS|월별 시작일|
|A|연도별 말일|
|AS|연도별 시작일|

In [None]:
pd.date_range(start='2020-01-01', periods=30, freq='D')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10', '2020-01-11', '2020-01-12',
               '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16',
               '2020-01-17', '2020-01-18', '2020-01-19', '2020-01-20',
               '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24',
               '2020-01-25', '2020-01-26', '2020-01-27', '2020-01-28',
               '2020-01-29', '2020-01-30'],
              dtype='datetime64[ns]', freq='D')

In [None]:
pd.date_range(start='2020-01-01', end='2023-06-30', freq='M')

  pd.date_range(start='2020-01-01', end='2023-06-30', freq='M')


DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31',
               '2020-09-30', '2020-10-31', '2020-11-30', '2020-12-31',
               '2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30'],
              dtype='datetime64[ns]', freq='ME')

In [None]:
pd.date_range(start='2020-01-01', end='2023-06-30', freq='MS')

DatetimeIndex(['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01',
               '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01',
               '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01',
               '2021-01-01', '2021-02-01', '2021-03-01', '2021-04-01',
               '2021-05-01', '2021-06-01', '2021-07-01', '2021-08-01',
               '2021-09-01', '2021-10-01', '2021-11-01', '2021-12-01',
               '2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01',
               '2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01',
               '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01',
               '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
               '2023-05-01', '2023-06-01'],
              dtype='datetime64[ns]', freq='MS')

In [None]:
pd.date_range(start='2020-01-01', end='2023-06-30', freq='A')

  pd.date_range(start='2020-01-01', end='2023-06-30', freq='A')


DatetimeIndex(['2020-12-31', '2021-12-31', '2022-12-31'], dtype='datetime64[ns]', freq='YE-DEC')

In [None]:
pd.date_range(start='2020-01-01', end='2023-06-30', freq='AS')

  pd.date_range(start='2020-01-01', end='2023-06-30', freq='AS')


DatetimeIndex(['2020-01-01', '2021-01-01', '2022-01-01', '2023-01-01'], dtype='datetime64[ns]', freq='YS-JAN')

## **기간 이동 계산**
`컬럼.rolling().집계함수`

In [None]:
df1 = data.copy()
df1.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [None]:
#7일 이동평균
df1['ma7'] = df1['Temp'].rolling(7).mean()
df1.head(20)

# 1/7일에 1/1 ~ 1/7일까지의 평균
# 1/8일에 1/2 ~ 1/8일까지의 평균

Unnamed: 0,Date,Temp,ma7
0,1981-01-01,20.7,
1,1981-01-02,17.9,
2,1981-01-03,18.8,
3,1981-01-04,14.6,
4,1981-01-05,15.8,
5,1981-01-06,15.8,
6,1981-01-07,15.8,17.057143
7,1981-01-08,17.4,16.585714
8,1981-01-09,21.8,17.142857
9,1981-01-10,20.0,17.314286


In [None]:
df1['ma30'] = df1['Temp'].rolling(30).mean() #현재 날짜를 포함한 과거 30일 동안의 평균을 계산한 값을 df1에 ma30 컬럼으로 추가
df1.head(30)

Unnamed: 0,Date,Temp,ma7,ma30
0,1981-01-01,20.7,,
1,1981-01-02,17.9,,
2,1981-01-03,18.8,,
3,1981-01-04,14.6,,
4,1981-01-05,15.8,,
5,1981-01-06,15.8,,
6,1981-01-07,15.8,17.057143,
7,1981-01-08,17.4,16.585714,
8,1981-01-09,21.8,17.142857,
9,1981-01-10,20.0,17.314286,


- 평균 외에 합계, 최솟값, 최댓값 등 다양한 연산이 가능합니다.

In [28]:
df1['Temp'].rolling(7).sum()

Unnamed: 0,Temp
0,
1,
2,
3,
4,
...,...
3645,91.7
3646,92.2
3647,92.5
3648,94.3


In [29]:
df1['Temp'].rolling(7).min()

Unnamed: 0,Temp
0,
1,
2,
3,
4,
...,...
3645,10.0
3646,10.0
3647,10.0
3648,10.0


In [30]:
df1['Temp'].rolling(7).max()

Unnamed: 0,Temp
0,
1,
2,
3,
4,
...,...
3645,14.6
3646,14.6
3647,14.6
3648,15.7


## **행 이동**
`컬럼.shift(이동할 행의 수)`

In [31]:
df2 = data.copy()

In [32]:
df2.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [33]:
df2['Temp shift1'] = df2['Temp'].shift(1) #행을 하나씩 밑으로 밀기
df2.head()

Unnamed: 0,Date,Temp,Temp shift1
0,1981-01-01,20.7,
1,1981-01-02,17.9,20.7
2,1981-01-03,18.8,17.9
3,1981-01-04,14.6,18.8
4,1981-01-05,15.8,14.6


In [34]:
df2['pct change'] = (df2['Temp shift1'] - df2['Temp'])/df2['Temp']
df2.head()

Unnamed: 0,Date,Temp,Temp shift1,pct change
0,1981-01-01,20.7,,
1,1981-01-02,17.9,20.7,0.156425
2,1981-01-03,18.8,17.9,-0.047872
3,1981-01-04,14.6,18.8,0.287671
4,1981-01-05,15.8,14.6,-0.075949


In [35]:
df2['Temp'].shift(7).head(10) #7번 row부터 값이 존재

Unnamed: 0,Temp
0,
1,
2,
3,
4,
5,
6,
7,20.7
8,17.9
9,18.8


In [36]:
df2['Temp'].shift(-1).head(10) #-1로 한칸씩 올리는거도 가능

Unnamed: 0,Temp
0,17.9
1,18.8
2,14.6
3,15.8
4,15.8
5,15.8
6,17.4
7,21.8
8,20.0
9,16.2
