# Merge All Datasets

In [2]:
import pandas as pd
import numpy as np

### 1. Load Data

- csv를 불러올 때 `parse_dates`를 적용하면 날짜 컬럼이 datetime 타입으로 파싱되어, 이후 `.dt` 접근자로 연/월/일 등을 각 컬럼으로 분리·할당할 수 있다. (아래 3번 참조)
- 날짜 데이터를 포함한 dataset(train, weather)에는 모두 적용해서 groupby와 pivot에 유용하게 사용하자.

In [1]:
# Training set, referred to as "sales" from here on; 'date' is parsed to datetime on load.
sales = pd.read_csv(
    '../data/train.csv',
    parse_dates=['date'],
)
print(sales.shape)
sales.head()

(4617600, 4)


Unnamed: 0,date,store_nbr,item_nbr,units
0,2012-01-01,1,1,0
1,2012-01-01,1,2,0
2,2012-01-01,1,3,0
3,2012-01-01,1,4,0
4,2012-01-01,1,5,0


In [2]:
# Lookup table linking each store_nbr to a station_nbr.
keys = pd.read_csv(filepath_or_buffer='../data/key.csv')
print(keys.shape)
keys.head()

(45, 2)


Unnamed: 0,store_nbr,station_nbr
0,1,1
1,2,14
2,3,7
3,4,9
4,5,12


In [3]:
# Weather log keyed by station_nbr and date; 'date' is parsed to datetime on load.
# NOTE(review): the preview shows 'M' and '-' placeholders in several columns;
# they are loaded as-is here (presumably "missing" markers - confirm before modelling).
weather = pd.read_csv(
    '../data/weather.csv',
    parse_dates=['date'],
)
print(weather.shape)
weather.head()

(20517, 20)


Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,2012-01-01,52,31,42,M,36,40,23,0,-,-,RA FZFG BR,M,0.05,29.78,29.92,3.6,20,4.6
1,2,2012-01-01,48,33,41,16,37,39,24,0,0716,1626,RA,0.0,0.07,28.82,29.91,9.1,23,11.3
2,3,2012-01-01,55,34,45,9,24,36,20,0,0735,1720,,0.0,0.0,29.77,30.47,9.9,31,10.0
3,4,2012-01-01,63,47,55,4,28,43,10,0,0728,1742,,0.0,0.0,29.79,30.48,8.0,35,8.2
4,6,2012-01-01,63,34,49,0,31,43,16,0,0727,1742,,0.0,0.0,29.95,30.47,14.0,36,13.8


### 2. Merge Data

In [4]:
# Attach the stores mapped to each weather station to that station's daily log.
# The join key is spelled out instead of relying on pandas' default of
# "all columns the two frames share": station_nbr is the only shared column
# today, and naming it keeps the join stable if either file gains a column.
df_1 = pd.merge(weather, keys, on='station_nbr', how='inner')
print(df_1.shape)
df_1.head()

(46392, 21)


Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,store_nbr
0,1,2012-01-01,52,31,42,M,36,40,23,0,...,-,RA FZFG BR,M,0.05,29.78,29.92,3.6,20,4.6,1
1,1,2012-01-02,50,31,41,M,26,35,24,0,...,-,,M,0.01,29.44,29.62,9.8,24,10.3,1
2,1,2012-01-03,32,11,22,M,4,18,43,0,...,-,,M,0.0,29.67,29.87,10.8,31,11.6,1
3,1,2012-01-04,28,9,19,M,-1,14,46,0,...,-,,M,0.0,29.86,30.03,6.3,27,8.3,1
4,1,2012-01-05,38,25,32,M,13,25,33,0,...,-,,M,0.0,29.67,29.84,6.9,25,7.8,1


In [5]:
# Bring the sales records onto the store-level weather rows. With no explicit
# key, pandas joins on every column name the two frames have in common.
df_1 = df_1.merge(sales)
print(df_1.shape)
df_1.tail()

(4617600, 23)


Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,store_nbr,item_nbr,units
4617595,5,2014-10-31,M,M,M,M,37,M,M,M,...,M,M,M,30.01,10.8,5,M,35,107,0
4617596,5,2014-10-31,M,M,M,M,37,M,M,M,...,M,M,M,30.01,10.8,5,M,35,108,0
4617597,5,2014-10-31,M,M,M,M,37,M,M,M,...,M,M,M,30.01,10.8,5,M,35,109,0
4617598,5,2014-10-31,M,M,M,M,37,M,M,M,...,M,M,M,30.01,10.8,5,M,35,110,0
4617599,5,2014-10-31,M,M,M,M,37,M,M,M,...,M,M,M,30.01,10.8,5,M,35,111,0


### 3. Parse 'Date' and add new columns

In [6]:
# Derive calendar features from the parsed 'date' column through one shared
# .dt accessor. dayofweek runs from 0 (Monday) to 6 (Sunday).
dates = df_1['date'].dt
day_of_week = dates.dayofweek
df_1['year'] = dates.year
df_1['month'] = dates.month
df_1['day'] = dates.day
df_1['dayofweek'] = day_of_week
df_1['weekday'] = day_of_week <= 4  # Monday through Friday
df_1['weekend'] = day_of_week >= 5  # Saturday and Sunday

In [8]:
# Sanity check after adding the calendar columns: frame shape, then the first rows.
print(df_1.shape)
df_1.head(n=5)

(4617600, 29)


Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,avgspeed,store_nbr,item_nbr,units,year,month,day,dayofweek,weekday,weekend
0,1,2012-01-01,52,31,42,M,36,40,23,0,...,4.6,1,1,0,2012,1,1,6,False,True
1,1,2012-01-01,52,31,42,M,36,40,23,0,...,4.6,1,2,0,2012,1,1,6,False,True
2,1,2012-01-01,52,31,42,M,36,40,23,0,...,4.6,1,3,0,2012,1,1,6,False,True
3,1,2012-01-01,52,31,42,M,36,40,23,0,...,4.6,1,4,0,2012,1,1,6,False,True
4,1,2012-01-01,52,31,42,M,36,40,23,0,...,4.6,1,5,0,2012,1,1,6,False,True


### 4. Rearrange Tables

In [17]:
# Final column layout: calendar fields first, then the weather measurements,
# then the identifiers, with the target column `units` last.
column_order = [
    'date', 'year', 'month', 'day', 'dayofweek', 'weekday', 'weekend',
    'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
    'sunrise', 'sunset', 'codesum', 'snowfall', 'preciptotal',
    'stnpressure', 'sealevel', 'resultspeed', 'resultdir', 'avgspeed',
    'station_nbr', 'store_nbr', 'item_nbr', 'units',
]

# Select by label rather than rebuilding with pd.DataFrame(df_1, columns=...):
# the constructor reindexes, so a misspelled name would silently become an
# all-NaN column, whereas label selection raises KeyError for unknown names.
df = df_1[column_order]

In [18]:
# Sanity check after reordering the columns: frame shape, then the first rows.
print(df.shape)
df.head(n=5)

(4617600, 29)


Unnamed: 0,date,year,month,day,dayofweek,weekday,weekend,tmax,tmin,tavg,...,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,station_nbr,store_nbr,item_nbr,units
0,2012-01-01,2012,1,1,6,False,True,52,31,42,...,0.05,29.78,29.92,3.6,20,4.6,1,1,1,0
1,2012-01-01,2012,1,1,6,False,True,52,31,42,...,0.05,29.78,29.92,3.6,20,4.6,1,1,2,0
2,2012-01-01,2012,1,1,6,False,True,52,31,42,...,0.05,29.78,29.92,3.6,20,4.6,1,1,3,0
3,2012-01-01,2012,1,1,6,False,True,52,31,42,...,0.05,29.78,29.92,3.6,20,4.6,1,1,4,0
4,2012-01-01,2012,1,1,6,False,True,52,31,42,...,0.05,29.78,29.92,3.6,20,4.6,1,1,5,0


### 5. Save to csv for future reference

In [20]:
# Persist the merged table to the working directory; the row index is not written.
df.to_csv(path_or_buf='full_data.csv', index=False)

### 6. Action plan

Crawl the official US holidays and apply them to the dataset.  
https://www.officeholidays.com/countries/usa/2012.php