# 데이터셋 확인

## 0. 필요 라이브러리 import

In [19]:
# Import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Needed so that graphs drawn with matplotlib are displayed inside Jupyter
%matplotlib inline 

# Workaround for Korean text rendering as broken glyphs in plots:
# register the font loaded from the Windows font file below as the default family
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Workaround for the minus sign rendering incorrectly on axes with negative values
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

# Default figure size for all plots
plt.rcParams["figure.figsize"] = (12, 12)


## 1. 데이터 불러오기

* 2019년 7월 17일부터 2020년 9월 30일까지의 서울·경기도 업종별 배달 주문건수 및 기상, 환경, 코로나 데이터

In [20]:
# Load the merged dataset and discard the first CSV column by position.
# NOTE(review): the dropped column is presumably a row index saved by an
# earlier to_csv() call — confirm against the file.
csv_path = 'data/최종데이터/변수추가최종.csv'
raw_data = pd.read_csv(csv_path, encoding='utf-8').iloc[:, 1:]
raw_data

Unnamed: 0,광역시도명,날짜,시간대별 시간,업종명,주문건수,기온,강수량,풍속,습도,일조,...,CO,O3,NO2,PM10,PM25,확진자수,계절,요일,기념일,공휴일
0,경기도,2019-07-17,0,기타,14,22.86,,0.54,91.2,,...,0.628,0.025,0.027,79.512,54.272,,여름,수,0,0
1,경기도,2019-07-17,0,돈까스/일식,3,22.86,,0.54,91.2,,...,0.628,0.025,0.027,79.512,54.272,,여름,수,0,0
2,경기도,2019-07-17,0,분식,1,22.86,,0.54,91.2,,...,0.628,0.025,0.027,79.512,54.272,,여름,수,0,0
3,경기도,2019-07-17,0,심부름,1,22.86,,0.54,91.2,,...,0.628,0.025,0.027,79.512,54.272,,여름,수,0,0
4,경기도,2019-07-17,0,야식,8,22.86,,0.54,91.2,,...,0.628,0.025,0.027,79.512,54.272,,여름,수,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193857,서울,2020-09-30,23,치킨,62,17.10,,2.20,92.0,,...,0.396,0.017,0.016,17.400,11.880,30.0,가을,수,0,1
193858,서울,2020-09-30,23,카페/디저트,1,17.10,,2.20,92.0,,...,0.396,0.017,0.016,17.400,11.880,30.0,가을,수,0,1
193859,서울,2020-09-30,23,패스트푸드,18,17.10,,2.20,92.0,,...,0.396,0.017,0.016,17.400,11.880,30.0,가을,수,0,1
193860,서울,2020-09-30,23,한식,1,17.10,,2.20,92.0,,...,0.396,0.017,0.016,17.400,11.880,30.0,가을,수,0,1


## 2. 데이터셋 확인

### 1) NULL 값 확인

In [21]:
# Count missing values per column
raw_data.isna().sum()

광역시도명           0
날짜              0
시간대별 시간         0
업종명             0
주문건수            0
기온             10
강수량        160663
풍속             87
습도              0
일조          75784
적설         192163
운량            369
SO2             0
CO              0
O3              0
NO2             0
PM10            0
PM25            0
확진자수        94634
계절              0
요일              0
기념일             0
공휴일             0
dtype: int64

### 2) 결측치 처리
#### 2-1) 확진자수

* 모두 0으로 처리(코로나 이전 및 확진자 없는 날짜이므로)

In [22]:
# Replace missing confirmed-case counts with 0
raw_data.loc[raw_data['확진자수'].isna(), '확진자수'] = 0

#### 2-2) 강수/적설 결합
* 강수/적설이 있는 날은 1, 나머지 0인 '눈비' 컬럼 생성

In [23]:
# Treat missing precipitation and snow values as 0
for col in ('강수량', '적설'):
    raw_data[col] = raw_data[col].fillna(0)

In [24]:
# Create the '눈비' column: 1 where precipitation or snow is positive, else 0
has_rain = raw_data['강수량'].gt(0)
has_snow = raw_data['적설'].gt(0)
raw_data['눈비'] = (has_rain | has_snow).astype('int64')

#### 2-3) 기온

#### 2-4) 풍속

#### 2-5) 운량

* 선형적인 흐름을 반영하여 대치

In [25]:
# Order rows by region, business category, date and hour so that neighbouring
# rows are consecutive observations within one region/category series.
sorted_data = raw_data.sort_values(by=['광역시도명', '업종명', '날짜', '시간대별 시간']).reset_index(drop=True)

# Fill the remaining gaps by linear interpolation along the sorted row order.
# Only numeric columns are passed to interpolate(): calling it on a frame that
# also holds object (string) columns is deprecated in recent pandas releases.
# Non-numeric columns are carried over unchanged, so the result matches the
# previous whole-frame call.
# NOTE(review): interpolation runs over the whole frame, so a gap at a
# region/category boundary is filled from the neighbouring series, and the same
# hour can receive different values per category — confirm this is acceptable.
numeric_cols = sorted_data.select_dtypes(include='number').columns
pred_data = sorted_data.copy()
pred_data[numeric_cols] = sorted_data[numeric_cols].interpolate(method='linear')

In [26]:
# Show the rows whose wind speed ('풍속') is missing
sorted_data[sorted_data['풍속'].isna()]

Unnamed: 0,광역시도명,날짜,시간대별 시간,업종명,주문건수,기온,강수량,풍속,습도,일조,...,O3,NO2,PM10,PM25,확진자수,계절,요일,기념일,공휴일,눈비
318,경기도,2019-08-03,1,기타,8,25.26,0.0,,95.2,,...,0.006,0.020,22.833,10.439,0.0,여름,토,0,0,0
9738,경기도,2019-08-03,1,돈까스/일식,7,25.26,0.0,,95.2,,...,0.006,0.020,22.833,10.439,0.0,여름,토,0,0,0
17779,경기도,2019-08-03,1,분식,2,25.26,0.0,,95.2,,...,0.006,0.020,22.833,10.439,0.0,여름,토,0,0,0
26497,경기도,2019-08-03,1,심부름,2,25.26,0.0,,95.2,,...,0.006,0.020,22.833,10.439,0.0,여름,토,0,0,0
30169,경기도,2019-08-03,1,야식,13,25.26,0.0,,95.2,,...,0.006,0.020,22.833,10.439,0.0,여름,토,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190340,서울,2019-12-12,21,회,8,-1.20,0.0,,47.0,,...,0.008,0.036,21.522,13.792,0.0,겨울,목,0,0,0
190341,서울,2019-12-12,22,회,12,-1.20,0.0,,52.0,,...,0.007,0.039,20.913,13.917,0.0,겨울,목,0,0,0
190371,서울,2019-12-15,15,회,6,11.00,0.0,,36.0,1.0,...,0.020,0.028,29.600,19.080,0.0,겨울,일,1,0,0
190372,서울,2019-12-15,16,회,13,10.50,0.0,,41.0,0.3,...,0.018,0.031,28.600,18.640,0.0,겨울,일,1,0,0


In [27]:
# Inspect a sample before interpolation: Seoul, chicken category, 2019-09-29
# (rows 5-14 of the selection, first 13 columns)
sorted_data.loc[
    sorted_data['업종명'].eq('치킨')
    & sorted_data['날짜'].eq('2019-09-29')
    & sorted_data['광역시도명'].eq('서울')
].iloc[5:15, :13]

Unnamed: 0,광역시도명,날짜,시간대별 시간,업종명,주문건수,기온,강수량,풍속,습도,일조,적설,운량,SO2
159141,서울,2019-09-29,5,치킨,2,18.6,0.0,1.1,83.0,,0.0,0.0,0.003
159142,서울,2019-09-29,6,치킨,3,18.1,0.0,1.1,83.0,,0.0,0.0,0.003
159143,서울,2019-09-29,7,치킨,1,17.9,0.0,,82.0,0.1,0.0,0.0,0.003
159144,서울,2019-09-29,9,치킨,4,20.8,0.0,0.6,64.0,1.0,0.0,2.0,0.004
159145,서울,2019-09-29,10,치킨,7,23.5,0.0,1.8,57.0,1.0,0.0,3.0,0.004
159146,서울,2019-09-29,11,치킨,45,24.7,0.0,,46.0,1.0,0.0,0.0,0.004
159147,서울,2019-09-29,12,치킨,81,26.3,0.0,3.2,40.0,1.0,0.0,0.0,0.004
159148,서울,2019-09-29,13,치킨,97,27.3,0.0,3.1,31.0,1.0,0.0,0.0,0.004
159149,서울,2019-09-29,14,치킨,85,27.9,0.0,3.0,31.0,1.0,0.0,0.0,0.004
159150,서울,2019-09-29,15,치킨,81,28.5,0.0,2.7,34.0,1.0,0.0,0.0,0.004


In [28]:
# Inspect the same sample after interpolation: Seoul, chicken category, 2019-09-29
# (rows 5-14 of the selection, first 13 columns)
pred_data.loc[
    pred_data['업종명'].eq('치킨')
    & pred_data['날짜'].eq('2019-09-29')
    & pred_data['광역시도명'].eq('서울')
].iloc[5:15, :13]

Unnamed: 0,광역시도명,날짜,시간대별 시간,업종명,주문건수,기온,강수량,풍속,습도,일조,적설,운량,SO2
159141,서울,2019-09-29,5,치킨,2,18.6,0.0,1.1,83.0,0.083333,0.0,0.0,0.003
159142,서울,2019-09-29,6,치킨,3,18.1,0.0,1.1,83.0,0.091667,0.0,0.0,0.003
159143,서울,2019-09-29,7,치킨,1,17.9,0.0,0.85,82.0,0.1,0.0,0.0,0.003
159144,서울,2019-09-29,9,치킨,4,20.8,0.0,0.6,64.0,1.0,0.0,2.0,0.004
159145,서울,2019-09-29,10,치킨,7,23.5,0.0,1.8,57.0,1.0,0.0,3.0,0.004
159146,서울,2019-09-29,11,치킨,45,24.7,0.0,2.5,46.0,1.0,0.0,0.0,0.004
159147,서울,2019-09-29,12,치킨,81,26.3,0.0,3.2,40.0,1.0,0.0,0.0,0.004
159148,서울,2019-09-29,13,치킨,97,27.3,0.0,3.1,31.0,1.0,0.0,0.0,0.004
159149,서울,2019-09-29,14,치킨,85,27.9,0.0,3.0,31.0,1.0,0.0,0.0,0.004
159150,서울,2019-09-29,15,치킨,81,28.5,0.0,2.7,34.0,1.0,0.0,0.0,0.004


In [29]:
# Count missing values per column after interpolation
pred_data.isna().sum()

광역시도명      0
날짜         0
시간대별 시간    0
업종명        0
주문건수       0
기온         0
강수량        0
풍속         0
습도         0
일조         2
적설         0
운량         0
SO2        0
CO         0
O3         0
NO2        0
PM10       0
PM25       0
확진자수       0
계절         0
요일         0
기념일        0
공휴일        0
눈비         0
dtype: int64

#### 2-6) 일조
* 일조시간에 대한 데이터는 결측값이 많아 지우기로 하였음

In [30]:
# Remove the sunshine ('일조') column in place
pred_data.drop(columns='일조', inplace=True)

In [31]:
# Re-check missing values per column after dropping '일조'
pred_data.isna().sum()

광역시도명      0
날짜         0
시간대별 시간    0
업종명        0
주문건수       0
기온         0
강수량        0
풍속         0
습도         0
적설         0
운량         0
SO2        0
CO         0
O3         0
NO2        0
PM10       0
PM25       0
확진자수       0
계절         0
요일         0
기념일        0
공휴일        0
눈비         0
dtype: int64


### 3) 파생변수 추가

운량에 따라   
0 ~ 2 -> 맑음(1), 3 ~ 5 -> 구름 조금(2), 6 ~ 8 -> 구름 많음(3), 9 ~ 10 -> 흐림(4)

In [32]:
final_data = pred_data.copy()

# Derive the weather class '날씨' from cloud cover ('운량'):
#   [0, 3) -> 1 (clear), [3, 6) -> 2 (partly cloudy),
#   [6, 9) -> 3 (mostly cloudy), anything else -> 4 (overcast).
# The ranges are contiguous and half-open on purpose: cloud cover was filled
# by linear interpolation upstream and may be fractional (e.g. 2.5). Inclusive
# integer ranges (0-2, 3-5, 6-8) would leave such values unmatched, so they
# would fall through to the default class 4. Integer values classify exactly
# as before.
cloud_cover = final_data['운량']
final_data['날씨'] = 4
final_data.loc[(cloud_cover >= 0) & (cloud_cover < 3), '날씨'] = 1
final_data.loc[(cloud_cover >= 3) & (cloud_cover < 6), '날씨'] = 2
final_data.loc[(cloud_cover >= 6) & (cloud_cover < 9), '날씨'] = 3

In [33]:
# List the column names of final_data
final_data.columns

Index(['광역시도명', '날짜', '시간대별 시간', '업종명', '주문건수', '기온', '강수량', '풍속', '습도', '적설',
       '운량', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', '확진자수', '계절', '요일',
       '기념일', '공휴일', '눈비', '날씨'],
      dtype='object')

In [34]:
# Reorder the columns; the order count ('주문건수') is placed last
column_order = [
    '광역시도명', '날짜', '요일', '시간대별 시간', '업종명', '계절',
    '공휴일', '기념일', '기온', '풍속', '습도', 'PM10',
    '운량', '날씨', '눈비', '강수량', '확진자수', 'SO2',
    'CO', 'O3', 'NO2', 'PM25', '적설', '주문건수',
]
final_data = final_data.loc[:, column_order]
final_data

Unnamed: 0,광역시도명,날짜,요일,시간대별 시간,업종명,계절,공휴일,기념일,기온,풍속,...,눈비,강수량,확진자수,SO2,CO,O3,NO2,PM25,적설,주문건수
0,경기도,2019-07-17,수,0,기타,여름,0,0,22.86,0.54,...,0,0.0,0.0,0.004,0.628,0.025,0.027,54.272,0.0,14
1,경기도,2019-07-17,수,1,기타,여름,0,0,22.48,0.52,...,0,0.0,0.0,0.003,0.543,0.013,0.023,39.325,0.0,2
2,경기도,2019-07-17,수,10,기타,여름,0,0,26.36,1.24,...,0,0.0,0.0,0.003,0.547,0.033,0.020,38.805,0.0,3
3,경기도,2019-07-17,수,11,기타,여름,0,0,26.80,1.34,...,0,0.0,0.0,0.003,0.544,0.046,0.018,39.076,0.0,27
4,경기도,2019-07-17,수,12,기타,여름,0,0,27.26,1.46,...,0,0.0,0.0,0.003,0.526,0.057,0.018,40.727,0.0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193857,서울,2020-09-30,수,19,회,가을,1,0,19.00,2.70,...,1,29.5,30.0,0.003,0.428,0.034,0.018,21.120,0.0,7
193858,서울,2020-09-30,수,20,회,가을,1,0,18.40,1.90,...,1,4.5,30.0,0.003,0.420,0.032,0.016,20.040,0.0,11
193859,서울,2020-09-30,수,21,회,가을,1,0,18.20,0.70,...,0,0.0,30.0,0.003,0.444,0.023,0.018,16.680,0.0,3
193860,서울,2020-09-30,수,22,회,가을,1,0,17.60,1.50,...,0,0.0,30.0,0.002,0.424,0.017,0.019,14.320,0.0,7


In [35]:
# Save the result as UTF-8 CSV; the row index is written as the first (unnamed) column
final_data.to_csv('data/최종데이터/최종_열삭제전.csv', index=True, encoding='utf-8')