# ToDo
- weatherとunspecified_searchのマージはpahse（train or test)によって読み込むcsvが異なっているので、１つのファイルに統一する
- forで処理している関数が遅いので速度改善（関越の処理が致命的に遅くなる）
- メモリに乗らない場合はchunksize指定して読み込み、要らない列をdropしてから処理をする

In [2]:
import jpholiday
import numpy as np
import datetime as dt
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [16]:
# data directory
PROCESSED_DATA_DIR = '../Input_processed_data'
ORI_DATA_DIR = '../Input_original_data'

# IC, 道路情報 csv
IC_CSV = f'{PROCESSED_DATA_DIR}/road_master/ic_merged.csv'
IC_SUB_CSV = f'{PROCESSED_DATA_DIR}/road_master/tateyama_kannetsu_ic.csv'
IC_NET_SUB_CSV = f'{PROCESSED_DATA_DIR}/road_master/tateyama_kannetsu_doronet_sub.csv'

# 天気 csv
WEATHER_CSV = f'{PROCESSED_DATA_DIR}/weather_data/weather_20220401.csv'

# 検索量 csv
SEARCH_COUNT_DIR = f'{PROCESSED_DATA_DIR}/search_count'
SEARCH_COUNT_TATEYAMA_CSV = f'{SEARCH_COUNT_DIR}/search-count_tateyama.csv'
SEARCH_COUNT_KANNETSU_CSV = f'{SEARCH_COUNT_DIR}/search-count_kannetsu.csv'

In [17]:
def fix_holiday(df):
    holi1 = []
    holi2 = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        y, m, d, dow = row[['年', '月', '日', '曜日']]
        if jpholiday.is_holiday(dt.datetime(y, m, d)):
            holi1.append('休')
            holi2.append('休')
            continue
        if dow == '日':
            holi1.append('休')
            holi2.append('休')
        elif dow == '土':
            holi1.append('平')
            holi2.append('休')
        else:
            holi1.append('平')
            holi2.append('平')
    df['平休1'] = holi1
    df['平休2（休日に土曜日含む）'] = holi2
    return df


def add_date(df):
    df['hour'] = df['時刻'].apply(lambda x: int(x.split(':')[0]))
    df['minute'] = df['時刻'].apply(lambda x: int(x.split(':')[1]))
    dates1 = []
    dates2= []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        year, month, day, hour, minute = row[['年', '月', '日', 'hour', 'minute']]
        dt1 = dt.datetime(year, month, day, hour, minute)
        dt2 = dt.datetime(year, month, day)
        dates1.append(dt1)
        dates2.append(dt2)
    df['年月日'] = dates1
    df['年月日2'] = dates2
    return df


def resample(df, sampling_rate='H'):
    tmp = df.groupby(['年月日2','hour','区間名称','方向']).sum().reset_index()['全車']
    df = df.set_index('年月日').resample(sampling_rate).mean().reset_index().merge(df[['年月日','曜日','平休1','平休2（休日に土曜日含む）','方向','hour','区間名称','年月日2']], on=['年月日','hour'])
    df['全車'] = tmp
    return df


def add_ic_info(df):
    ic_df = pd.read_csv(IC_SUB_CSV)
    df['start'] = df['区間名称'].apply(lambda x: x.split()[0])
    df['goal'] = df['区間名称'].apply(lambda x: x.split()[-1])

    df = df.merge(ic_df[['dorapura_name','lat','lng','degree','ic_code']], left_on='start', right_on='dorapura_name', how='left')
    df.rename(columns={'lat':'lat_start', 'lng':'lng_start', 'degree':'degree_start', 'ic_code':'start_code'}, inplace=True)
    df = df.merge(ic_df[['dorapura_name','lat','lng','degree','ic_code']], left_on='goal', right_on='dorapura_name', how='left')
    df.rename(columns={'lat':'lat_goal', 'lng':'lng_goal', 'degree':'degree_goal', 'ic_code':'end_code'}, inplace=True)
    df.drop(['区間名称','dorapura_name_x','dorapura_name_y',], axis=1, inplace=True)
    return df


def add_weather_info(df):
    weather = pd.read_csv(WEATHER_CSV, index_col=None, usecols=[1,2,3,4,7,8,9,11])
    df['年月日2'] = df['年月日2'].astype(str)
    df = df.merge(weather, left_on=['年月日2'], right_on=['年月日'], how='left')
    return df


def add_search_count_info(df):
    search_count = pd.read_csv(SEARCH_COUNT_TATEYAMA_CSV)
    date = []
    hour = []
    for i in tqdm(range(len(search_count))):
        date.append(search_count['passing_time'][i].split()[0])
        hour.append(int(search_count['passing_time'][i].split()[1].split(':')[0]))
    search_count['date'] = date
    search_count['hour'] = hour

    search_tmp = search_count.groupby(['date','hour','start_code','end_code']).sum().reset_index()['search']
    search_count = search_count.groupby(['date','hour','start_code','end_code']).mean().reset_index()
    search_count['search'] = search_tmp

    df.rename(columns={'年月日2':'date'}, inplace=True)
    df['start_code'].fillna(0, inplace=True)
    df['end_code'].fillna(0, inplace=True)
    df['start_code'] = df['start_code'].astype(int)
    df['end_code'] = df['end_code'].astype(int)
    df = df.merge(search_count, on=['start_code','end_code','date','hour'], how='left')
    df['date'] = pd.to_datetime(df['date'])
    return df
    

def add_unspecified_search_count_info(df, phase='train'):
    if phase == 'train':
        unspecified_df = pd.read_csv('Input_processed_data/search_count/search-count_tateyama_unspecified.csv')
    else:
        unspecified_df = pd.read_csv('Input_processed_data/search_count/search-count_tateyama_unspecified2.csv')
    unspecified_df.rename(columns={'search_date':'timestamp'}, inplace=True)
    unspecified_df['timestamp'] = pd.to_datetime(unspecified_df['timestamp'])
    unspecified_df.rename(columns={'timestamp':'date'}, inplace=True)
    unspecified_df['search'] = unspecified_df.groupby(['start_code','end_code'])['search'].shift(1)
    unspecified_df.rename(columns={'search': 'unspecified_search'}, inplace=True)
    df = df.merge(unspecified_df, on=['date','start_code','end_code'], how='left')
    df.drop(['年月日_y',], axis=1, inplace=True)
    df.rename(columns={'年月日_x': '年月日'}, inplace=True)
    df.drop(['Unnamed: 0'], axis=1, inplace=True)
    return df

In [23]:
def preprocess(df):
    df = df[['年', '月', '日', '曜日', '時刻', '平休1', '平休2（休日に土曜日含む）','方向', 'KP', '区間名称','全車','速度']]
    return df

#10行分で動作確認
df = pd.read_csv('../Input_original_data/traffic/館山道（202104-202203）.CSV', encoding='shift_jis', header=1).sample(10)

  df = pd.read_csv('../Input_original_data/traffic/館山道（202104-202203）.CSV', encoding='shift_jis', header=1).sample(10)


In [25]:
df

Unnamed: 0,年月日_x,年,月,日,KP,全車,大車,OCC,速度,全車(1),大車(1),車線率(1),OCC(1),速度(1),全車(2),大車(2),車線率(2),OCC(2),速度(2),エラー数.1,全車(追),大車(追),車線率(追),OCC(追),速度(追),全車(路),大車(路),車線率(路),OCC(路),速度(路),エラー数.3,hour,minute,曜日,平休1,平休2（休日に土曜日含む）,方向,date,start,goal,lat_start,lng_start,degree_start,start_code,lat_goal,lng_goal,degree_goal,end_code,年月日_y,平均気温(℃),降水量の合計(mm),日照時間(時間),平均風速(m/s),平均湿度(％),平均現地気圧(hPa),pref_code,search
0,2021-05-12 04:00:00,2021.0,5.0,12.0,83.025,0.0,0.0,0.0,,0.0,0.0,,0.0,126.0,,,,,,,0.0,0.0,,0.0,,,,,,,,4.0,0.0,水,平,平,上り,2021-05-12,富津中央,富津竹岡,35.24851,139.89325,4.0,1130041,35.19397,139.862353,4.0,1130046,2021-05-12,15.1,0.0,10.2,1.5,,,15,
1,2021-05-12 04:00:00,2021.0,5.0,12.0,83.025,0.0,0.0,0.0,,0.0,0.0,,0.0,126.0,,,,,,,0.0,0.0,,0.0,,,,,,,,4.0,0.0,水,平,平,上り,2021-05-12,富津中央,富津竹岡,35.24851,139.89325,4.0,1130041,35.19397,139.862353,4.0,1130046,2021-05-12,12.8,0.0,7.0,1.8,,,10,
2,2021-05-12 04:00:00,2021.0,5.0,12.0,83.025,0.0,0.0,0.0,,0.0,0.0,,0.0,126.0,,,,,,,0.0,0.0,,0.0,,,,,,,,4.0,0.0,水,平,平,上り,2021-05-12,富津中央,富津竹岡,35.24851,139.89325,4.0,1130041,35.19397,139.862353,4.0,1130046,2021-05-12,17.6,0.0,2.8,1.8,,,11,
3,2021-05-12 04:00:00,2021.0,5.0,12.0,83.025,0.0,0.0,0.0,,0.0,0.0,,0.0,126.0,,,,,,,0.0,0.0,,0.0,,,,,,,,4.0,0.0,水,平,平,上り,2021-05-12,富津中央,富津竹岡,35.24851,139.89325,4.0,1130041,35.19397,139.862353,4.0,1130046,2021-05-12,17.9,0.0,3.0,3.1,56.0,1018.1,13,
4,2021-05-12 04:00:00,2021.0,5.0,12.0,83.025,0.0,0.0,0.0,,0.0,0.0,,0.0,126.0,,,,,,,0.0,0.0,,0.0,,,,,,,,4.0,0.0,水,平,平,上り,2021-05-12,富津中央,富津竹岡,35.24851,139.89325,4.0,1130041,35.19397,139.862353,4.0,1130046,2021-05-12,17.9,0.0,5.5,3.7,52.0,1020.4,12,


In [24]:
df = fix_holiday(df)
df = add_date(df)
df = resample(df)
df = add_ic_info(df)
df = add_weather_info(df)
df = add_search_count_info(df)
# df = add_unspecified_search_count_info(df)

100%|██████████| 10/10 [00:00<00:00, 1031.61it/s]
100%|██████████| 10/10 [00:00<00:00, 46.86it/s]
100%|██████████| 1373775/1373775 [00:22<00:00, 60170.49it/s]
