# 날짜 다루기

In [4]:
import numpy as np
import pandas as pd
from datetime import datetime

# Load the COVID case data and the NLS97 survey data.
covidcases = pd.read_csv('data/covidcases720.csv')
# Key the survey rows by person ID as part of the load expression.
nls97 = pd.read_csv('data/nls97c.csv').set_index('personid')

### 생월, 생년값 체크

In [5]:
# Count missing values in the birth month and birth year columns.
nls97[['birthmonth', 'birthyear']].isna().sum()

birthmonth    1
birthyear     0
dtype: int64

In [7]:
# Frequency of each birth month, ordered by month value.
nls97['birthmonth'].value_counts().sort_index()

1.0     815
2.0     693
3.0     760
4.0     659
5.0     689
6.0     720
7.0     762
8.0     782
9.0     839
10.0    765
11.0    763
12.0    736
Name: birthmonth, dtype: int64

In [8]:
# Frequency of each birth year, ordered by year.
nls97['birthyear'].value_counts().sort_index()

1980    1691
1981    1874
1982    1841
1983    1807
1984    1771
Name: birthyear, dtype: int64

### fillna 메서드로 누락된 생월 채우기(1건)

In [9]:
# Fill the missing birth month with the mean month truncated to an integer.
# The result is assigned back to the column: calling fillna(inplace=True)
# through attribute access is chained assignment, which raises a FutureWarning
# in pandas 2.x and no longer updates the frame once Copy-on-Write is enabled.
nls97['birthmonth'] = nls97['birthmonth'].fillna(int(nls97['birthmonth'].mean()))

In [10]:
# Re-check the birth month frequencies after filling the missing value.
nls97['birthmonth'].value_counts().sort_index()

1.0     815
2.0     693
3.0     760
4.0     659
5.0     689
6.0     721
7.0     762
8.0     782
9.0     839
10.0    765
11.0    763
12.0    736
Name: birthmonth, dtype: int64

### 생년 및 생월 값과 날짜 정숫값을 더해 datetime 열 생성
- 📍판다스 to_datetime() 함수에 딕셔너리 전달
- 딕셔너리에는 year, month, day 키가 필요함

In [11]:
# Assemble a datetime column from year/month/day components.
# Every record is given day 15.
nls97['birthdate'] = pd.to_datetime({
    'year': nls97['birthyear'],
    'month': nls97['birthmonth'],
    'day': 15,
})

In [12]:
# Preview the source columns next to the assembled birth date.
nls97.loc[:, ['birthmonth', 'birthyear', 'birthdate']].head()

Unnamed: 0_level_0,birthmonth,birthyear,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,5.0,1980,1980-05-15
100139,9.0,1983,1983-09-15
100284,11.0,1984,1984-11-15
100292,4.0,1982,1982-04-15
100583,6.0,1980,1980-06-15


In [13]:
# Confirm that none of the three birth columns contains missing values.
nls97[['birthmonth', 'birthyear', 'birthdate']].isna().sum()

birthmonth    0
birthyear     0
birthdate     0
dtype: int64

### datetime 열을 사용해 연령값을 계산
- 시작 날짜와 끝날짜를 받아서 나이를 계산하는 함수 정의

In [14]:
def calcage(startdate, enddate):
    """Return the number of whole years elapsed from startdate to enddate.

    Args:
        startdate: Date-like object exposing ``year``, ``month`` and ``day``.
        enddate: Date-like object exposing ``year``, ``month`` and ``day``.

    Returns:
        The difference in calendar years, reduced by one when enddate's
        (month, day) falls before startdate's (month, day).
    """
    # Compare (month, day) pairs to see whether the anniversary is still ahead.
    anniversary_pending = (enddate.month, enddate.day) < (startdate.month, startdate.day)
    years = enddate.year - startdate.year
    return years - 1 if anniversary_pending else years

In [17]:
# Reference date against which ages are computed.
rundate = pd.to_datetime('2020-07-20', format='%Y-%m-%d')
rundate

Timestamp('2020-07-20 00:00:00')

In [39]:
# Compute each person's age as of rundate.
# calcage is applied to the birthdate column only: a row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field.
nls97['age'] = nls97['birthdate'].apply(calcage, args=(rundate,))

In [19]:
# Show age and birth date for the index labels from 100061 through 100583.
nls97[['age', 'birthdate']].loc[100061:100583]

Unnamed: 0_level_0,age,birthdate
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,40,1980-05-15
100139,36,1983-09-15
100284,35,1984-11-15
100292,38,1982-04-15
100583,40,1980-06-15


### 문자열을 날짜 열로 변환

In [20]:
# Inspect the dtypes of the first six columns.
covidcases.iloc[:, :6].dtypes

iso_code        object
continent       object
location        object
casedate        object
total_cases    float64
new_cases      float64
dtype: object

In [21]:
# Show two reproducibly sampled rows, transposed so each row reads as a column.
covidcases.iloc[:, :6].sample(n=2, random_state=1).transpose()

Unnamed: 0,13482,2445
iso_code,IMN,BRB
continent,Europe,North America
location,Isle of Man,Barbados
casedate,2020-06-20,2020-04-28
total_cases,336.0,80.0
new_cases,0.0,1.0


In [22]:
# Parse the casedate strings (YYYY-MM-DD) into a datetime column.
covidcases['casedate'] = pd.to_datetime(covidcases['casedate'], format='%Y-%m-%d')

In [23]:
# Re-inspect the dtypes of the first six columns after the conversion.
covidcases.iloc[:, :6].dtypes

iso_code               object
continent              object
location               object
casedate       datetime64[ns]
total_cases           float64
new_cases             float64
dtype: object

In [24]:
# Descriptive statistics for the datetime column.
covidcases['casedate'].describe()

  covidcases.casedate.describe()


count                   29529
unique                    195
top       2020-05-17 00:00:00
freq                      209
first     2019-12-31 00:00:00
last      2020-07-12 00:00:00
Name: casedate, dtype: object

### 날짜 간격을 포착하는 timedelta 객체 생성
- 각 날짜에 대해, 각국의 최초 사례로부터 경과한 날짜 수 계산
    1. 각국의 최초 사례를 나타내는 데이터프레임 생성 -> covid와 병합
    2. 각 날짜에 대해 날짜 수 계산

In [31]:
# One row per location holding the earliest date with new_cases > 0:
# keep rows with new cases, sort by location then date, and retain the first
# row of each location.
firstcase = (
    covidcases.loc[covidcases['new_cases'] > 0, ['location', 'casedate']]
    .sort_values(by=['location', 'casedate'])
    .drop_duplicates(subset=['location'], keep='first')
    .rename(columns={'casedate': 'firstcasedate'})
)

In [32]:
# Attach each location's first-case date to every row of covidcases.
# Dropping any existing firstcasedate column first keeps this cell safe to
# re-run; a second plain merge would instead produce firstcasedate_x/_y
# columns. validate= checks that firstcase has at most one row per location,
# so the left join cannot multiply rows.
covidcases = (
    covidcases
    .drop(columns='firstcasedate', errors='ignore')
    .merge(firstcase, on='location', how='left', validate='many_to_one')
)

In [33]:
# Subtracting two datetime columns yields a timedelta column.
covidcases['dayssincefirstcase'] = covidcases['casedate'] - covidcases['firstcasedate']

In [38]:
# Summary statistics of the elapsed-days timedelta column.
covidcases['dayssincefirstcase'].describe()

count                         29529
mean     56 days 00:15:12.892410850
std      47 days 00:35:41.813685246
min              -62 days +00:00:00
25%                21 days 00:00:00
50%                57 days 00:00:00
75%                92 days 00:00:00
max               194 days 00:00:00
Name: dayssincefirstcase, dtype: object

> 경과 일수의 최솟값이 -62일이므로, 최초 확진자가 발생하기 62일 전부터 보고를 시작한 나라가 있음을 알 수 있다.