# 최종 데이터 파일 만들기

* 분석에 필요한 최종 데이터 프레임 만들기

In [2]:
import pandas as pd
import numpy as np

## 1) 업종-지역별 배달 주문 건수

In [3]:
## 데이터 불러오기 및 컬럼 설정

raw_data = pd.read_csv('./data/업종-지역별 배달 주문건수.csv', encoding='utf-8', header=None)
raw_data.columns = ['날짜', '시간대별 시간', '업종명', '광역시도명', '시군구명', '주문건수']
raw_data

Unnamed: 0,날짜,시간대별 시간,업종명,광역시도명,시군구명,주문건수
0,2019-07-17,0,도시락,경기도,의정부시,1
1,2019-07-17,0,돈까스/일식,경기도,의정부시,3
2,2019-07-17,0,돈까스/일식,충청북도,제천시,1
3,2019-07-17,0,배달전문업체,경기도,고양시 일산동구,8
4,2019-07-17,0,배달전문업체,경기도,의정부시,3
...,...,...,...,...,...,...
1957310,2020-09-30,23,회,경기도,화성시,2
1957311,2020-09-30,23,회,서울특별시,도봉구,1
1957312,2020-09-30,23,회,서울특별시,은평구,7
1957313,2020-09-30,23,회,전라북도,군산시,1


In [4]:
## 서울특별시 / 경기도만 추출

sg_data = raw_data.loc[(raw_data['광역시도명'] == '경기도') | (raw_data['광역시도명'] == '서울특별시')].reset_index(drop=True)

In [5]:
## 종속변수 완성
grouped_sg = sg_data.groupby(['광역시도명', '날짜', '업종명']).sum()[['주문건수']].reset_index()



In [6]:
grouped_sg.loc[grouped_sg['광역시도명']=='서울특별시', '광역시도명'] = '서울'

In [7]:
grouped_sg

Unnamed: 0,광역시도명,날짜,업종명,주문건수
0,경기도,2019-07-17,도시락,96
1,경기도,2019-07-17,돈까스/일식,198
2,경기도,2019-07-17,배달전문업체,108
3,경기도,2019-07-17,분식,554
4,경기도,2019-07-17,심부름,4
...,...,...,...,...
13816,서울,2020-09-30,카페/디저트,167
13817,서울,2020-09-30,패스트푸드,916
13818,서울,2020-09-30,피자,130
13819,서울,2020-09-30,한식,282


## 2) 서울시 기상데이터

In [8]:
weather = pd.read_csv('./data/가공/기상데이터.csv', encoding='utf-8', index_col=0)
weather

Unnamed: 0,지점명,일시,기온(°C),풍속(m/s),습도(%),증기압(hPa),지면온도(°C)
0,경기도,2019-07-17,24.986,0.948,82.192,25.548,27.070
1,경기도,2019-07-18,26.353,1.113,74.533,24.797,29.028
2,경기도,2019-07-19,27.125,1.610,75.275,26.231,29.912
3,경기도,2019-07-20,27.493,2.658,74.483,27.056,27.423
4,경기도,2019-07-21,25.050,1.708,91.883,29.000,26.243
...,...,...,...,...,...,...,...
879,서울,2020-09-26,19.342,2.475,62.333,13.688,20.462
880,서울,2020-09-27,20.208,2.096,62.917,14.563,21.746
881,서울,2020-09-28,20.592,1.879,65.667,15.479,22.046
882,서울,2020-09-29,19.646,1.883,67.667,15.125,21.188


In [9]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 884 entries, 0 to 883
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   지점명       884 non-null    object 
 1   일시        884 non-null    object 
 2   기온(°C)    884 non-null    float64
 3   풍속(m/s)   884 non-null    float64
 4   습도(%)     884 non-null    float64
 5   증기압(hPa)  884 non-null    float64
 6   지면온도(°C)  884 non-null    float64
dtypes: float64(5), object(2)
memory usage: 55.2+ KB


In [10]:
sg_weather = pd.merge(grouped_sg, weather, left_on=['광역시도명', '날짜'], right_on=['지점명', '일시'], how='inner')

sg_weather.drop(['지점명', '일시'], axis=1, inplace=True)

In [11]:
sg_weather

Unnamed: 0,광역시도명,날짜,업종명,주문건수,기온(°C),풍속(m/s),습도(%),증기압(hPa),지면온도(°C)
0,경기도,2019-07-17,도시락,96,24.986,0.948,82.192,25.548,27.07
1,경기도,2019-07-17,돈까스/일식,198,24.986,0.948,82.192,25.548,27.07
2,경기도,2019-07-17,배달전문업체,108,24.986,0.948,82.192,25.548,27.07
3,경기도,2019-07-17,분식,554,24.986,0.948,82.192,25.548,27.07
4,경기도,2019-07-17,심부름,4,24.986,0.948,82.192,25.548,27.07
...,...,...,...,...,...,...,...,...,...
13816,서울,2020-09-30,카페/디저트,167,18.800,1.700,77.000,16.600,17.50
13817,서울,2020-09-30,패스트푸드,916,18.800,1.700,77.000,16.600,17.50
13818,서울,2020-09-30,피자,130,18.800,1.700,77.000,16.600,17.50
13819,서울,2020-09-30,한식,282,18.800,1.700,77.000,16.600,17.50


## 3) 서울시 미세먼지

In [12]:
pollution = pd.read_csv('./data/가공/지역별미세먼지데이터.csv', encoding='utf-8').iloc[:, 1:]
pollution


Unnamed: 0,측정날짜,행정구역,SO2,CO,O3,NO2,PM10,PM25
0,20190701,강원,0.002,0.405,0.052,0.008,40.010,26.654
1,20190701,충북,0.002,0.339,0.048,0.009,35.746,27.983
2,20190701,충남,0.004,0.552,0.055,0.009,36.788,26.268
3,20190701,제주,0.002,0.203,0.041,0.009,22.500,12.353
4,20190701,전북,0.003,0.359,0.042,0.008,34.702,22.219
...,...,...,...,...,...,...,...,...
7781,20200930,경기,0.003,0.452,0.027,0.013,34.083,20.592
7782,20200930,강원,0.002,0.395,0.026,0.005,20.206,11.818
7783,20200930,충남,0.004,0.463,0.033,0.009,33.706,21.029
7784,20200930,부산,0.004,0.341,0.033,0.011,17.695,8.480


In [13]:
## 서울, 경기 데이터 추출
sg = pollution[(pollution['행정구역']=='서울')|(pollution['행정구역']=='경기')]

In [14]:
## 측정날짜 형변환 str => datetime
sg['측정날짜'] = pd.to_datetime(sg['측정날짜'].astype('str'), format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sg['측정날짜'] = pd.to_datetime(sg['측정날짜'].astype('str'), format='%Y-%m-%d')


In [15]:
sg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 916 entries, 8 to 7781
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   측정날짜    916 non-null    datetime64[ns]
 1   행정구역    916 non-null    object        
 2   SO2     916 non-null    float64       
 3   CO      916 non-null    float64       
 4   O3      916 non-null    float64       
 5   NO2     916 non-null    float64       
 6   PM10    916 non-null    float64       
 7   PM25    916 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 64.4+ KB


In [16]:
## 행정구역 값 변경 경기 => 경기도
sg.loc[sg['행정구역']=='경기', '행정구역'] = '경기도'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [17]:
## sg_weather 형변환 str => datetime
sg_weather['날짜'] = pd.to_datetime(sg_weather['날짜'].astype('str'), format='%Y-%m-%d')

In [18]:
sg_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13821 entries, 0 to 13820
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   광역시도명     13821 non-null  object        
 1   날짜        13821 non-null  datetime64[ns]
 2   업종명       13821 non-null  object        
 3   주문건수      13821 non-null  int64         
 4   기온(°C)    13821 non-null  float64       
 5   풍속(m/s)   13821 non-null  float64       
 6   습도(%)     13821 non-null  float64       
 7   증기압(hPa)  13821 non-null  float64       
 8   지면온도(°C)  13821 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 1.1+ MB


In [19]:
sg_weather

Unnamed: 0,광역시도명,날짜,업종명,주문건수,기온(°C),풍속(m/s),습도(%),증기압(hPa),지면온도(°C)
0,경기도,2019-07-17,도시락,96,24.986,0.948,82.192,25.548,27.07
1,경기도,2019-07-17,돈까스/일식,198,24.986,0.948,82.192,25.548,27.07
2,경기도,2019-07-17,배달전문업체,108,24.986,0.948,82.192,25.548,27.07
3,경기도,2019-07-17,분식,554,24.986,0.948,82.192,25.548,27.07
4,경기도,2019-07-17,심부름,4,24.986,0.948,82.192,25.548,27.07
...,...,...,...,...,...,...,...,...,...
13816,서울,2020-09-30,카페/디저트,167,18.800,1.700,77.000,16.600,17.50
13817,서울,2020-09-30,패스트푸드,916,18.800,1.700,77.000,16.600,17.50
13818,서울,2020-09-30,피자,130,18.800,1.700,77.000,16.600,17.50
13819,서울,2020-09-30,한식,282,18.800,1.700,77.000,16.600,17.50


In [20]:
## 미세먼지데이터, 날씨데이터 병합
sg_dust = pd.merge(sg_weather, sg, left_on=['광역시도명', '날짜'], right_on=['행정구역', '측정날짜'], how='inner')

In [21]:
sg_dust.drop(['행정구역', '측정날짜'], axis=1, inplace=True)

In [23]:
sg_dust

Unnamed: 0,광역시도명,날짜,업종명,주문건수,기온(°C),풍속(m/s),습도(%),증기압(hPa),지면온도(°C),SO2,CO,O3,NO2,PM10,PM25
0,경기도,2019-07-17,도시락,96,24.986,0.948,82.192,25.548,27.07,0.003,0.568,0.038,0.022,63.998,43.787
1,경기도,2019-07-17,돈까스/일식,198,24.986,0.948,82.192,25.548,27.07,0.003,0.568,0.038,0.022,63.998,43.787
2,경기도,2019-07-17,배달전문업체,108,24.986,0.948,82.192,25.548,27.07,0.003,0.568,0.038,0.022,63.998,43.787
3,경기도,2019-07-17,분식,554,24.986,0.948,82.192,25.548,27.07,0.003,0.568,0.038,0.022,63.998,43.787
4,경기도,2019-07-17,심부름,4,24.986,0.948,82.192,25.548,27.07,0.003,0.568,0.038,0.022,63.998,43.787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13816,서울,2020-09-30,카페/디저트,167,18.800,1.700,77.000,16.600,17.50,0.003,0.407,0.025,0.016,29.213,18.445
13817,서울,2020-09-30,패스트푸드,916,18.800,1.700,77.000,16.600,17.50,0.003,0.407,0.025,0.016,29.213,18.445
13818,서울,2020-09-30,피자,130,18.800,1.700,77.000,16.600,17.50,0.003,0.407,0.025,0.016,29.213,18.445
13819,서울,2020-09-30,한식,282,18.800,1.700,77.000,16.600,17.50,0.003,0.407,0.025,0.016,29.213,18.445


In [45]:
corona = pd.read_csv('./data/보건복지부_코로나19_시도발생_현황.csv', encoding='utf-8', index_col = 0)
corona.head()
corona.info()
corona

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3621 entries, 0 to 3620
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   기준일시      3621 non-null   object 
 1   시도명       3621 non-null   object 
 2   전일대비증감수   3621 non-null   int64  
 3   사망자수      3621 non-null   int64  
 4   격리해제수     3519 non-null   float64
 5   격리중환자수    2992 non-null   float64
 6   지역발생수     2992 non-null   float64
 7   해외유입수     2992 non-null   float64
 8   10만명당발생률  3604 non-null   float64
dtypes: float64(5), int64(2), object(2)
memory usage: 282.9+ KB


Unnamed: 0,기준일시,시도명,전일대비증감수,사망자수,격리해제수,격리중환자수,지역발생수,해외유입수,10만명당발생률
0,2020-09-30,제주,0,0,55.0,4.0,0.0,0.0,8.80
1,2020-09-30,경남,0,0,273.0,18.0,0.0,0.0,8.66
2,2020-09-30,경북,6,56,1441.0,54.0,4.0,2.0,58.25
3,2020-09-30,전남,0,2,153.0,14.0,0.0,0.0,9.06
4,2020-09-30,전북,3,0,106.0,22.0,2.0,1.0,7.04
...,...,...,...,...,...,...,...,...,...
3616,2020-03-01,광주,0,0,,,,,
3617,2020-03-01,인천,0,0,,,,,
3618,2020-03-01,대구,469,9,,,,,
3619,2020-03-01,부산,3,0,,,,,


In [46]:
## corona 기준일시 형변환 str => datetime
corona['기준일시'] = pd.to_datetime(corona['기준일시'].astype('str'), format='%Y-%m-%d')

In [47]:
# 시도명 확인
corona['시도명'].unique()

array(['제주', '경남', '경북', '전남', '전북', '충남', '충북', '강원', '경기', '세종', '울산',
       '대전', '광주', '인천', '대구', '부산', '서울'], dtype=object)

In [48]:
## 행정구역 값 변경 경기 => 경기도
corona.loc[corona['시도명']=='경기', '시도명'] = '경기도'

In [49]:
## 코로나데이터, 날씨데이터 병합
data_final = pd.merge(sg_dust, corona, left_on=['광역시도명', '날짜'], right_on=['시도명', '기준일시'], how='left')

In [52]:
data_final

Unnamed: 0,광역시도명,날짜,업종명,주문건수,기온(°C),풍속(m/s),습도(%),증기압(hPa),지면온도(°C),SO2,...,NO2,PM10,PM25,전일대비증감수,사망자수,격리해제수,격리중환자수,지역발생수,해외유입수,10만명당발생률
0,경기도,2019-07-17,도시락,96,24.986,0.948,82.192,25.548,27.07,0.003,...,0.022,63.998,43.787,,,,,,,
1,경기도,2019-07-17,돈까스/일식,198,24.986,0.948,82.192,25.548,27.07,0.003,...,0.022,63.998,43.787,,,,,,,
2,경기도,2019-07-17,배달전문업체,108,24.986,0.948,82.192,25.548,27.07,0.003,...,0.022,63.998,43.787,,,,,,,
3,경기도,2019-07-17,분식,554,24.986,0.948,82.192,25.548,27.07,0.003,...,0.022,63.998,43.787,,,,,,,
4,경기도,2019-07-17,심부름,4,24.986,0.948,82.192,25.548,27.07,0.003,...,0.022,63.998,43.787,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13848,서울,2020-09-30,카페/디저트,167,18.800,1.700,77.000,16.600,17.50,0.003,...,0.016,29.213,18.445,51.0,56.0,4416.0,821.0,51.0,0.0,54.38
13849,서울,2020-09-30,패스트푸드,916,18.800,1.700,77.000,16.600,17.50,0.003,...,0.016,29.213,18.445,51.0,56.0,4416.0,821.0,51.0,0.0,54.38
13850,서울,2020-09-30,피자,130,18.800,1.700,77.000,16.600,17.50,0.003,...,0.016,29.213,18.445,51.0,56.0,4416.0,821.0,51.0,0.0,54.38
13851,서울,2020-09-30,한식,282,18.800,1.700,77.000,16.600,17.50,0.003,...,0.016,29.213,18.445,51.0,56.0,4416.0,821.0,51.0,0.0,54.38


In [51]:
data_final.drop(['기준일시','시도명'], inplace=True, axis=1)

In [55]:
data_final.to_csv('data/최종데이터/data_final.csv', encoding='utf-8')