In [4]:
import pandas as pd
import os
os.listdir('../data/final')

['carspeed.csv',
 'car_speed_36912.csv',
 'evening_test.csv',
 'evening_train.csv',
 'morning_test.csv',
 'morning_train.csv']

In [2]:
carspeed = pd.read_csv('../data/final/carspeed.csv')
carspeed.shape

(7025495, 39)

In [3]:
carspeed = carspeed[~carspeed.days.isin(['토', '일'])]
carspeed.shape

(5864850, 39)

In [5]:
carspeed2 = pd.read_csv('../data/final/car_speed_36912.csv')
carspeed2.shape

(4089876, 31)

# common link

In [6]:
common_link = set(carspeed.link_id).intersection(set(carspeed2.link_id))
len(common_link)

4527

In [7]:
carspeed = carspeed[carspeed.link_id.isin(common_link)]
carspeed2 = carspeed2[carspeed2.link_id.isin(common_link)]

print('shape 1 : ', carspeed.shape)
print('shape 2 : ', carspeed2.shape)

shape 1 :  (5864850, 39)
shape 2 :  (3997528, 31)


# intersection

In [9]:
intersection = pd.read_csv('../data/intersection.csv')
intersection = intersection.melt(id_vars = '도로명')
intersection.columns = ['road_name','direction','inter_num']
intersection.head()

Unnamed: 0,road_name,direction,inter_num
0,4.19로,상행,2.0
1,가락로,상행,4.0
2,가로공원로,상행,4.0
3,가로공원로76길,상행,1.0
4,가마산로,상행,10.0


In [12]:
carspeed2 = pd.merge(carspeed2, intersection, how = 'left', left_on = ['road_name','direction'], right_on = ['road_name','direction'])
carspeed2.shape

(3997528, 32)

# accident new

In [13]:
accident_new = pd.read_csv('../data/accident_new.csv')
accident_new.columns = ['road_name','사망자수','중상자수','경상자수','부상신고자수','victims','severity','accident_count']
accident_new.head(3)

Unnamed: 0,road_name,사망자수,중상자수,경상자수,부상신고자수,victims,severity,accident_count
0,4.19로,0,3,15,1,19,55,13
1,63로,0,0,3,1,4,10,3
2,가락로,0,20,50,15,85,225,70


In [15]:
carspeed2 = pd.merge(carspeed2, accident_new, how = 'left', left_on = ['road_name'], right_on = ['road_name'])
carspeed2.shape

(3997528, 39)

In [16]:
carspeed2 = carspeed2[~carspeed2.days.isin(['토', '일'])]
carspeed2.shape

(2725244, 39)

In [17]:
carspeed2 = carspeed2[carspeed.columns]

In [19]:
carspeed = pd.concat([carspeed, carspeed2], axis = 0)
carspeed.shape

(8590094, 39)

In [23]:
morning = carspeed[(carspeed.time == 6)|(carspeed.time == 7)|(carspeed.time == 8)|(carspeed.time == 9)]
evening = carspeed[(carspeed.time == 17)|(carspeed.time == 18)|(carspeed.time == 19)|(carspeed.time == 20)]

In [24]:
print('morning 평균 속도', morning.value.mean())
print('evening 평균 속도', evening.value.mean())
print('morning shape : ',morning.shape)
print('evening shape : ',evening.shape)

morning 평균 속도 28.751670954960986
evening 평균 속도 21.981790283716343
morning shape :  (4296579, 39)
evening shape :  (4293515, 39)


In [25]:
del carspeed, carspeed2

# tran, test split

In [21]:
from sklearn.model_selection import train_test_split

## morning

In [26]:
y = morning.value
X = morning.drop('value', axis = 1)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 333 )

In [28]:
X_train['y'] = y_train
X_test['y'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
print('train shape : ', X_train.shape)
print('test shape : ', X_test.shape)

train shape :  (3007605, 39)
test shape :  (1288974, 39)


In [30]:
X_train.columns = ['date', 'days', 'road_name', 'link_id', 'start_point', 'end_point',
       'direction', 'road_length', 'road_num', 'road_type', 'urban_suburb',
       'district', 'special', 'vacation', 'time', 'temperature', 'wind',
       'rain', 'snow', 'cctv', 'kids', 'speed_limit', 'entrance', 'signal',
       'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM2.5', 'inter_num', 'd1', 'd2',
       'd3', 'd4', 'd_sum', 'severity', 'accident_count', 'y']
X_test.columns = X_train.columns

In [31]:
X_train.to_csv('../data/final/morning_train.csv', index = False, encoding = 'utf-8')
X_test.to_csv('../data/final/morning_test.csv', index = False, encoding = 'utf-8')

## evening

In [32]:
y = evening.value
X = evening.drop('value', axis = 1)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 333 )

In [34]:
X_train['y'] = y_train
X_test['y'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
print('train shape : ', X_train.shape)
print('test shape : ', X_test.shape)

train shape :  (3005460, 39)
test shape :  (1288055, 39)


In [36]:
X_train.columns = ['date', 'days', 'road_name', 'link_id', 'start_point', 'end_point',
       'direction', 'road_length', 'road_num', 'road_type', 'urban_suburb',
       'district', 'special', 'vacation', 'time', 'temperature', 'wind',
       'rain', 'snow', 'cctv', 'kids', 'speed_limit', 'entrance', 'signal',
       'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM2.5', 'inter_num', 'd1', 'd2',
       'd3', 'd4', 'd_sum', 'severity', 'accident_count', 'y']
X_test.columns = X_train.columns

In [37]:
X_train.to_csv('../data/final/evening_train.csv', index = False, encoding = 'utf-8')
X_test.to_csv('../data/final/evening_test.csv', index = False, encoding = 'utf-8')