### corona 데이터에서 전처리작업을 함수화
- "Unnamed: 0" 컬럼을 제거
    - 해당 컬럼이 존재한다면 컬럼을 제거
    - 존재하지 않는다면 아무 행동도 하지 않는다.
- 컬럼의 이름을 변경
- 기준이 되는 컬럼으로 오름차순 정렬
    - 기준이 되는 컬럼의 이름은 매개변수를 이용해서 입력 값으로 받아온다.
- 인덱스를 초기화 (기존의 인덱스는 제거)
- '일일사망자', '일일확진자' 컬럼을 생성
    - 일일사망자는 shift()함수를 이용하여 생성
    - 일일확진자는 diff()함수를 이용하여 생성
- 결과를 리턴

In [None]:
import pandas as pd

In [17]:
def corona_EDA(_df, _col):
    """Preprocess the raw corona DataFrame and add daily-change columns.

    The steps are: drop the stray 'Unnamed: 0' column if present, rename
    the columns to their Korean labels, sort ascending by ``_col``, reset
    the index, and derive '일일사망자' (daily deaths) and '일일확진자'
    (daily confirmed cases) from the cumulative totals.

    Args:
        _df: Raw DataFrame. It is not modified; a copy is processed.
        _col: Name of the column to sort by, given as the *renamed*
            (Korean) label, e.g. '기준일'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the two derived
        columns appended. The first row of each derived column is 0
        because it has no previous row to compare against.

    Raises:
        KeyError: If ``_col``, '총사망자' or '총확진자' is missing after
            renaming.
        ValueError: If no known column name is present and the number of
            columns does not match the nine expected labels.
    """
    # Original column name -> Korean label. Renaming by name instead of by
    # position keeps every label attached to the right data even when the
    # column order differs or a column is missing.
    column_names = {
        'createDt': '등록일시',
        'deathCnt': '총사망자',
        'decideCnt': '총확진자',
        'seq': '게시글번호',
        'stateDt': '기준일',
        'stateTime': '기준시간',
        'updateDt': '수정일시',
        'accExamCnt': '누적의심자',
        'accDefRate': '누적확진율',
    }

    # Work on a copy so the caller's frame is left untouched.
    result = _df.copy()

    # errors='ignore' makes this a no-op when the column does not exist.
    result = result.drop(columns='Unnamed: 0', errors='ignore')

    known_names = set(column_names) | set(column_names.values())
    if any(name in known_names for name in result.columns):
        result = result.rename(columns=column_names)
    else:
        # None of the expected names are present (e.g. a headerless file):
        # fall back to the positional labelling this function used before.
        result.columns = list(column_names.values())

    # Sort ascending by the requested column and discard the old index.
    result = result.sort_values(_col).reset_index(drop=True)

    # Daily deaths: current total minus the previous row's total (shift()).
    previous_deaths = result['총사망자'].shift()
    result['일일사망자'] = (result['총사망자'] - previous_deaths).fillna(0)

    # Daily confirmed cases: the same difference, computed with diff().
    result['일일확진자'] = result['총확진자'].diff().fillna(0)

    return result

In [18]:
# Load the raw corona CSV; the path is relative to the current working directory.
df = pd.read_csv("../data/csv/corona.csv")

In [19]:
# Show the first raw row to check the original column names.
df.head(1)

Unnamed: 0.1,Unnamed: 0,createDt,deathCnt,decideCnt,seq,stateDt,stateTime,updateDt,accExamCnt,accDefRate
0,0,2022-06-08 09:09:05.982,24305,18188200,904,20220608,00:00,,,


In [36]:
# Preprocess the raw frame sorted by '등록일시'; the result is only displayed, not stored.
corona_EDA(df, '등록일시')

Unnamed: 0,등록일시,총사망자,총확진자,게시글번호,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
0,2020-03-10 00:00:00.000,54,7513,51,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,52,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6.0,242.0
2,2020-03-12 00:00:00.000,66,7869,53,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6.0,114.0
3,2020-03-13 00:00:00.000,67,7979,54,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1.0,110.0
4,2020-03-14 00:00:00.000,72,8086,55,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5.0,107.0
...,...,...,...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,900,20220604,00:00,2022-06-08 09:11:26.303,,,9.0,12039.0
816,2022-06-05 08:53:19.426,24258,18163648,901,20220605,00:00,2022-06-08 09:11:04.758,,,20.0,9834.0
817,2022-06-06 09:00:06.734,24279,18168670,902,20220606,00:00,2022-06-08 09:10:50.441,,,21.0,5022.0
818,2022-06-07 09:09:00.897,24299,18174842,903,20220607,00:00,2022-06-08 09:10:36.846,,,20.0,6172.0


In [34]:
# Preprocess the raw frame sorted by '기준일' and keep the result as df2 for the cells below.
df2 = corona_EDA(df, '기준일')

In [35]:
# Show the first preprocessed row to check the renamed and derived columns.
df2.head(1)

Unnamed: 0,등록일시,총사망자,총확진자,게시글번호,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
0,2020-03-10 00:00:00.000,54,7513,51,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0


In [38]:
# Display the full '기준일'-sorted result (recomputed here rather than reusing df2).
corona_EDA(df, '기준일')

Unnamed: 0,등록일시,총사망자,총확진자,게시글번호,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
0,2020-03-10 00:00:00.000,54,7513,51,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,52,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6.0,242.0
2,2020-03-12 00:00:00.000,66,7869,53,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6.0,114.0
3,2020-03-13 00:00:00.000,67,7979,54,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1.0,110.0
4,2020-03-14 00:00:00.000,72,8086,55,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5.0,107.0
...,...,...,...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,900,20220604,00:00,2022-06-08 09:11:26.303,,,9.0,12039.0
816,2022-06-05 08:53:19.426,24258,18163648,901,20220605,00:00,2022-06-08 09:11:04.758,,,20.0,9834.0
817,2022-06-06 09:00:06.734,24279,18168670,902,20220606,00:00,2022-06-08 09:10:50.441,,,21.0,5022.0
818,2022-06-07 09:09:00.897,24299,18174842,903,20220607,00:00,2022-06-08 09:10:36.846,,,20.0,6172.0


In [48]:
### Save the preprocessed DataFrame as a CSV file
# Written with CP949 encoding and without the index column.
df2.to_csv('corona_EDA.csv', index=False, encoding='CP949')

In [41]:
# Summary statistics of the raw frame's numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,deathCnt,decideCnt,seq,stateDt,accExamCnt,accDefRate
count,820.0,820.0,820.0,820.0,820.0,692.0,623.0
mean,409.5,3927.832927,1913097.0,472.393902,20208970.0,7545057.0,1.556435
std,236.857904,6164.310693,4857030.0,249.701095,7118.265,6172020.0,0.543014
min,0.0,54.0,7513.0,51.0,20200310.0,210144.0,0.902205
25%,204.75,415.75,23935.25,256.75,20201000.0,1934309.0,1.078089
50%,409.5,1812.5,118564.0,461.5,20210420.0,6368310.0,1.416159
75%,614.25,3120.5,397991.5,698.25,20211120.0,12168900.0,1.816009
max,819.0,24305.0,18188200.0,904.0,20220610.0,21518070.0,3.919308


In [42]:
# Summary statistics of the preprocessed frame, including the two derived daily columns.
df2.describe()

Unnamed: 0,총사망자,총확진자,게시글번호,기준일,누적의심자,누적확진율,일일사망자,일일확진자
count,820.0,820.0,820.0,820.0,692.0,623.0,820.0,820.0
mean,3927.832927,1913097.0,472.393902,20208970.0,7545057.0,1.556435,29.57439,22171.569512
std,6164.310693,4857030.0,249.701095,7118.265,6172020.0,0.543014,68.530435,71013.844067
min,54.0,7513.0,51.0,20200310.0,210144.0,0.902205,0.0,0.0
25%,415.75,23935.25,256.75,20201000.0,1934309.0,1.078089,2.0,125.0
50%,1812.5,118564.0,461.5,20210420.0,6368310.0,1.416159,5.0,634.0
75%,3120.5,397991.5,698.25,20211120.0,12168900.0,1.816009,20.0,2782.75
max,24305.0,18188200.0,904.0,20220610.0,21518070.0,3.919308,469.0,621204.0


In [46]:
# Export the preprocessed frame to an Excel workbook.
# NOTE(review): unlike the CSV export, index=False is not passed here, so the
# index is written as an extra column — confirm this is intended.
df2.to_excel('corona.xlsx')

In [45]:
# Install openpyxl, the package pandas uses to write .xlsx files via to_excel.
!pip install openpyxl

Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/6a/94/a59521de836ef0da54aaf50da6c4da8fb4072fb3053fa71f052fd9399e7a/openpyxl-3.1.2-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
   ---------------------------------------- 0.0/250.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.0 kB ? eta -:--:--
   -------------- ------------------------- 92.2/250.0 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 250.0/250.0 kB 3.1 MB/s eta 0:00:00
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
