In [14]:
import jpholiday
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd

# Do not truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [2]:
# Data directories
PROCESSED_DATA_DIR = '../Input_processed_data'
ORI_DATA_DIR = '../Input_original_data'

# Sub-directories shared by the file paths below
_ROAD_MASTER_DIR = f'{PROCESSED_DATA_DIR}/road_master'
_ORI_TRAFFIC_DIR = f'{ORI_DATA_DIR}/traffic'
_PROCESSED_TRAFFIC_DIR = f'{PROCESSED_DATA_DIR}/traffic'

# IC and road information CSVs
IC_SUB_CSV = f'{_ROAD_MASTER_DIR}/tateyama_kannetsu_ic.csv'
IC_NET_SUB_CSV = f'{_ROAD_MASTER_DIR}/tateyama_kannetsu_doronet_sub.csv'

# Traffic volume CSVs (original input)
ORI_TRAFFIC_TATEYAMA_CSV = f'{_ORI_TRAFFIC_DIR}/館山道（202104-202203）.CSV'
ORI_TRAFFIC_TATEYAMA_CSV2 = f'{_ORI_TRAFFIC_DIR}/館山道TK5M.CSV'

ORI_TRAFFIC_KANNETSU_CSV = f'{_ORI_TRAFFIC_DIR}/関越道（202104-202203）.CSV'
ORI_TRAFFIC_KANNETSU_CSV2 = f'{_ORI_TRAFFIC_DIR}/関越道TK5M.CSV'

# Traffic volume CSVs (written by this notebook)
PROCESSED_TRAFFIC_TATEYAMA_CSV = f'{_PROCESSED_TRAFFIC_DIR}/traffic_tateyama.csv'
PROCESSED_TRAFFIC_KANNETSU_CSV = f'{_PROCESSED_TRAFFIC_DIR}/traffic_kannetsu.csv'

### 交通量データ前処理

In [3]:
# Load the IC table and the IC-to-IC network table.
# The code columns are forced to str so they are not parsed as numbers.
df_ic = pd.read_csv(IC_SUB_CSV, dtype={'ic_code': str})
df_icnet = pd.read_csv(IC_NET_SUB_CSV, dtype={'start_code': str, 'end_code': str})

# Lookup tables: IC code -> IC name, and its inverse IC name -> IC code
code2name = dict(zip(df_ic['ic_code'], df_ic['ic_name']))
name2code = {v: k for k, v in code2name.items()}

In [4]:
# Directed graph with one edge per df_icnet row (start_code -> end_code);
# the distance, road_code and direction columns are stored as edge attributes.
ic_subgraph = nx.from_pandas_edgelist(df_icnet, source='start_code', target='end_code',
                                      edge_attr=['distance', 'road_code', 'direction'],
                                      create_using=nx.DiGraph())

In [5]:
def to_datetime(df):
    """Return a copy of ``df`` with a ``datetime`` column added.

    The timestamp is assembled from the integer columns '年' (year),
    '月' (month), '日' (day) and the 'HH:MM' string column '時刻' (time).

    The conversion is vectorized instead of building Python ``date`` /
    ``time`` objects row by row, and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '年', '月', '日' and '時刻'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``datetime`` column.
    """
    df = df.copy()

    # Only the hour/minute of this parse are used; its date part is discarded
    clock = pd.to_datetime(df['時刻'], format='%H:%M')

    # pd.to_datetime assembles timestamps from a frame with these column names
    parts = pd.DataFrame({
        'year': df['年'],
        'month': df['月'],
        'day': df['日'],
        'hour': clock.dt.hour,
        'minute': clock.dt.minute,
    })
    df['datetime'] = pd.to_datetime(parts)
    return df


def fix_holiday(df):
    """Set the weekday/holiday flag to '休' on dates jpholiday reports as holidays.

    Rows whose '平休2（休日に土曜日含む）' value is '平' and whose ``datetime``
    falls on a date for which ``jpholiday.is_holiday`` is true are changed
    to '休'. All other rows are left as they are.

    ``jpholiday.is_holiday`` is called once per distinct calendar date among
    the '平' rows rather than once per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``datetime`` and '平休2（休日に土曜日含む）'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, updated in place.
    """
    flag_col = '平休2（休日に土曜日含む）'
    is_weekday = df[flag_col] == '平'

    # Truncate to midnight so rows on the same day share one lookup key
    days = pd.to_datetime(df['datetime']).dt.normalize()
    holiday_days = [
        day for day in days[is_weekday].unique()
        if jpholiday.is_holiday(pd.Timestamp(day).date())
    ]

    df.loc[is_weekday & days.isin(holiday_days), flag_col] = '休'
    return df


def fix_ic_section(df):
    """Split '区間名称' into ``start_name`` / ``end_name`` columns.

    Whitespace is removed from '区間名称' and the text is split on '〜'.
    For rows whose '方向' is '下り' the first part becomes ``start_name`` and
    the second ``end_name``; for '上り' rows the two are swapped.

    The result holds the '上り' rows first, followed by the '下り' rows,
    with a fresh integer index. Rows with any other '方向' value are not
    included. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '区間名称' and '方向'.

    Returns
    -------
    pandas.DataFrame
        New frame without '区間名称' and with ``start_name`` / ``end_name``.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence
    parts = df['区間名称'].str.replace(r'\s', '', regex=True).str.split('〜')
    first = parts.map(lambda sec: sec[0])
    second = parts.map(lambda sec: sec[1])

    base = df.drop(columns='区間名称')
    is_up = base['方向'] == '上り'
    is_down = base['方向'] == '下り'

    # '上り' sections run the other way, so start and end are swapped
    up = base[is_up].assign(end_name=first[is_up], start_name=second[is_up])
    down = base[is_down].assign(start_name=first[is_down], end_name=second[is_down])

    return pd.concat((up, down), ignore_index=True)

In [6]:
def preprocess(df):
  """Run the per-chunk cleaning steps on a raw traffic frame.

  Keeps only the columns that are needed, then applies, in order,
  ``to_datetime`` (build the datetime column), ``fix_holiday`` (correct the
  holiday flag) and ``fix_ic_section`` (split '区間名称' into start/end),
  and finally drops the date/time source columns.
  """
  keep = ['年', '月', '日', '曜日', '時刻', '平休1', '平休2（休日に土曜日含む）','方向', 'KP', '区間名称', '全車', '速度']
  no_longer_needed = ['年', '月', '日', '曜日', '時刻']

  cleaned = df.loc[:, keep]
  for step in (to_datetime, fix_holiday, fix_ic_section):
    cleaned = step(cleaned)

  # The date/time parts are redundant once the datetime column exists
  return cleaned.drop(columns=no_longer_needed)


def postprocess(df):
  """Rename columns, normalise IC names and attach IC codes.

  Uses the module-level ``df_ic`` table and ``name2code`` lookup.

  Steps:
    1. Rename the Japanese column names to ``is_holiday``, ``direction``,
       ``total`` and ``speed``.
    2. Replace any start/end name that is not a key of ``name2code`` with the
       ``ic_name`` registered for it under ``dorapura_name`` in ``df_ic``.
    3. Add ``start_code`` / ``end_code`` looked up from ``name2code``.
    4. Return only the output columns, in a fixed order.

  The input frame is not modified: all work is done on the renamed copy.

  Raises
  ------
  KeyError
      If a name is found neither in ``name2code`` nor among the
      ``dorapura_name`` values.
  """
  # rename() returns a new frame, so the caller's DataFrame stays untouched
  df = df.rename(columns={
    '平休2（休日に土曜日含む）': 'is_holiday',
    '方向': 'direction',
    '全車': 'total',
    '速度': 'speed',
  })

  # Section names that are missing from the master are translated via dorapura_name
  alias2name = dict(zip(df_ic['dorapura_name'], df_ic['ic_name']))

  def to_master_name(name):
    return name if name in name2code else alias2name[name]

  for col in ('start_name', 'end_name'):
    df[col] = df[col].map(to_master_name)

  # Add the IC codes
  df['start_code'] = df['start_name'].map(lambda n: name2code[n])
  df['end_code'] = df['end_name'].map(lambda n: name2code[n])

  # Select the output columns
  usecols = ['datetime', 'is_holiday', 'start_name', 'end_name', 'start_code', 'end_code', 'direction', 'total', 'speed', 'KP']
  return df.loc[:, usecols]

In [79]:
# Read the first Tateyama traffic CSV (Shift-JIS) in chunks of 10000 rows;
# header=1 takes the column names from the second line of the file.
reader = pd.read_csv(ORI_TRAFFIC_TATEYAMA_CSV, header=1, encoding='sjis', chunksize=10000)

# preprocess() runs per chunk; postprocess() runs once on the concatenated frame
df_tateyama = pd.concat((preprocess(r) for r in reader), ignore_index=True)
df_tateyama = postprocess(df_tateyama)

# Written without mode='a', so this (re)creates the processed file with a header row
df_tateyama.to_csv(PROCESSED_TRAFFIC_TATEYAMA_CSV, index=False)
df_tateyama.head()

Unnamed: 0,datetime,is_holiday,start_name,end_name,start_code,end_code,direction,total,speed,KP
0,2021-04-01 00:00:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,8.0,92.0,39.17
1,2021-04-01 00:05:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,4.0,88.0,39.17
2,2021-04-01 00:10:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,9.0,83.0,39.17
3,2021-04-01 00:15:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,8.0,94.0,39.17
4,2021-04-01 00:20:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,5.0,115.0,39.17


In [80]:
# Read the second Tateyama traffic CSV (Shift-JIS) in chunks of 10000 rows
reader = pd.read_csv(ORI_TRAFFIC_TATEYAMA_CSV2, header=1, encoding='sjis', chunksize=10000)

# preprocess() runs per chunk; postprocess() runs once on the concatenated frame
df_tateyama = pd.concat((preprocess(r) for r in reader), ignore_index=True)
df_tateyama = postprocess(df_tateyama)

# Appended (mode='a', no header) to the processed Tateyama file.
# NOTE(review): re-running this cell alone appends the same rows again.
df_tateyama.to_csv(PROCESSED_TRAFFIC_TATEYAMA_CSV, mode='a', header=False, index=False)
df_tateyama.head()

Unnamed: 0,datetime,is_holiday,start_name,end_name,start_code,end_code,direction,total,speed,KP
0,2022-04-01 00:00:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,4,83.0,39.17
1,2022-04-01 00:05:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,12,77.0,39.17
2,2022-04-01 00:10:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,9,92.0,39.17
3,2022-04-01 00:15:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,5,82.0,39.17
4,2022-04-01 00:20:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,9,90.0,39.17


In [81]:
# Read the first Kannetsu traffic CSV (Shift-JIS) in chunks of 10000 rows;
# header=1 takes the column names from the second line of the file.
reader = pd.read_csv(ORI_TRAFFIC_KANNETSU_CSV, header=1, encoding='sjis', chunksize=10000)

# preprocess() runs per chunk; postprocess() runs once on the concatenated frame
df_kannetsu = pd.concat((preprocess(r) for r in reader), ignore_index=True)
df_kannetsu = postprocess(df_kannetsu)

# Written without mode='a', so this (re)creates the processed file with a header row
df_kannetsu.to_csv(PROCESSED_TRAFFIC_KANNETSU_CSV, index=False)
df_kannetsu.head()

In [9]:
# Read the second Kannetsu traffic CSV (Shift-JIS) in chunks of 10000 rows
reader = pd.read_csv(ORI_TRAFFIC_KANNETSU_CSV2, header=1, encoding='sjis', chunksize=10000)

# preprocess() runs per chunk; postprocess() runs once on the concatenated frame
df_kannetsu = pd.concat((preprocess(r) for r in reader), ignore_index=True)
df_kannetsu = postprocess(df_kannetsu)

# Appended (mode='a', no header) to the processed Kannetsu file.
# NOTE(review): re-running this cell alone appends the same rows again.
df_kannetsu.to_csv(PROCESSED_TRAFFIC_KANNETSU_CSV, mode='a', header=False, index=False)
df_kannetsu.head()

Unnamed: 0,datetime,is_holiday,start_name,end_name,start_code,end_code,direction,total,speed,KP
0,2022-04-01 00:00:00,平,所沢,大泉ＪＣＴ,1800006,1110210,上り,88.0,84.0,5.0
1,2022-04-01 00:05:00,平,所沢,大泉ＪＣＴ,1800006,1110210,上り,69.0,84.0,5.0
2,2022-04-01 00:10:00,平,所沢,大泉ＪＣＴ,1800006,1110210,上り,54.0,87.0,5.0
3,2022-04-01 00:15:00,平,所沢,大泉ＪＣＴ,1800006,1110210,上り,35.0,86.0,5.0
4,2022-04-01 00:20:00,平,所沢,大泉ＪＣＴ,1800006,1110210,上り,47.0,86.0,5.0


In [10]:
!tail ../Input_processed_data/traffic/traffic_kannetsu.csv

2022-05-08 23:10:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,24.0,94.0,237.7
2022-05-08 23:15:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,17.0,97.0,237.7
2022-05-08 23:20:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,19.0,101.0,237.7
2022-05-08 23:25:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,17.0,94.0,237.7
2022-05-08 23:30:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,13.0,94.0,237.7
2022-05-08 23:35:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,8.0,90.0,237.7
2022-05-08 23:40:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,14.0,90.0,237.7
2022-05-08 23:45:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,12.0,86.0,237.7
2022-05-08 23:50:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,19.0,99.0,237.7
2022-05-08 23:55:00,休,小千谷,長岡南越路スマート,1800176,1800183,下り,18.0,82.0,237.7


### 区間情報のズレを埋める

In [7]:
# Reload the processed traffic CSVs from disk.
# 'datetime' is read as the index so parse_dates=True parses it, then
# reset_index() turns it back into an ordinary column.
# The IC code columns are kept as strings.
df_tateyama = pd.read_csv(PROCESSED_TRAFFIC_TATEYAMA_CSV, index_col='datetime', parse_dates=True, 
                          dtype={'start_code': str, 'end_code': str}).reset_index()

df_kannetsu = pd.read_csv(PROCESSED_TRAFFIC_KANNETSU_CSV, index_col='datetime', parse_dates=True, 
                          dtype={'start_code': str, 'end_code': str}).reset_index()

In [16]:
def get_diff_sections(df):
  """Return the distinct (start_code, end_code) pairs that are not edges of ``ic_subgraph``.

  Each pair is returned as a plain tuple, in order of first appearance in
  ``df``. Tuples (unlike numpy rows) are hashable, so the result can be put
  in a ``set`` or used as dictionary keys.
  """
  pairs = df[['start_code', 'end_code']].drop_duplicates().itertuples(index=False, name=None)
  return [(start, end) for start, end in pairs if not ic_subgraph.has_edge(start, end)]


def sec2path(sections):
  """Map each section to the list of consecutive edges along its path in ``ic_subgraph``.

  For every section, ``nx.shortest_path`` is called with the section's
  elements as the remaining arguments (no weight is passed here), and the
  returned node list is turned into ``(node, next_node)`` pairs. The
  dictionary is keyed by ``tuple(section)``.
  """
  def as_edges(nodes):
    # Pair every node with its successor on the path
    return list(zip(nodes[:-1], nodes[1:]))

  return {
    tuple(section): as_edges(nx.shortest_path(ic_subgraph, *section))
    for section in sections
  }


def fill_missing_sections(df):
  """Replace sections that are not graph edges by the edges along their path.

  Sections reported by ``get_diff_sections`` are expanded with ``sec2path``
  into a sequence of edges. Every traffic row on such a section is replaced
  by one row per edge: ``start_code`` / ``end_code`` become the edge's codes,
  ``start_name`` / ``end_name`` are looked up in ``code2name``, and all other
  columns are copied unchanged. Rows on other sections are kept as they are.

  The four columns are replaced by name rather than by position, and the
  expansion is done with a merge instead of a Python loop over rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``datetime``, ``start_name``, ``end_name``,
      ``start_code`` and ``end_code``.

  Returns
  -------
  pandas.DataFrame
      New frame with the same columns as ``df``, sorted by ``datetime``
      with a fresh integer index.

  Raises
  ------
  KeyError
      If an edge on a replacement path uses a code missing from ``code2name``.
  """
  key_cols = ['start_code', 'end_code']
  replaced_cols = ['start_name', 'end_name', 'start_code', 'end_code']

  # Sections may arrive as array-like pairs; tuples are hashable and comparable
  diff_sections = [tuple(sec) for sec in get_diff_sections(df)]
  if not diff_sections:
    # Nothing to expand: avoid concatenating with an empty frame
    return df.sort_values('datetime').reset_index(drop=True)

  sec2path_mapper = sec2path(diff_sections)

  # One row per (original section, replacement edge), in path order
  expansion = pd.DataFrame(
    [
      (src, dst, code2name[u], code2name[v], u, v)
      for (src, dst), path in sec2path_mapper.items()
      for u, v in path
    ],
    columns=key_cols + ['_' + col for col in replaced_cols],
  )

  is_diff_sections = pd.MultiIndex.from_frame(df[key_cols]).isin(diff_sections)

  # A left merge keeps the row order of df; each row is repeated once per edge.
  # Rows whose section has an empty path get no replacement and are dropped.
  df_new = (
    df[is_diff_sections]
    .merge(expansion, on=key_cols, how='left')
    .dropna(subset=['_start_code'])
    .drop(columns=replaced_cols)
    .rename(columns={'_' + col: col for col in replaced_cols})
    .loc[:, list(df.columns)]
  )

  df_processed = pd.concat([df[~is_diff_sections], df_new]).sort_values('datetime').reset_index(drop=True)

  return df_processed

In [13]:
# Expand the Tateyama sections that are not edges of ic_subgraph and save the result
df_tateyama_processed = fill_missing_sections(df_tateyama)
df_tateyama_processed.to_csv(f'{PROCESSED_DATA_DIR}/traffic/traffic_tateyama_filled.csv', index=False)
df_tateyama_processed

Unnamed: 0,datetime,is_holiday,start_name,end_name,start_code,end_code,direction,total,speed,KP
0,2021-04-01 00:00:00,平,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,8.0,92.0,39.170
1,2021-04-01 00:00:00,平,京葉道路・館山自動車道接続部,市原,1130001,1130006,下り,18.0,91.0,39.170
2,2021-04-01 00:00:00,平,木更津北,木更津ＪＣＴ,1130021,1130026,下り,7.0,82.0,61.520
3,2021-04-01 00:00:00,平,木更津ＪＣＴ,木更津南ＪＣＴ,1130026,1130031,下り,10.0,87.0,64.420
4,2021-04-01 00:00:00,平,富津竹岡,富津中央,1130046,1130041,上り,3.0,77.0,83.025
...,...,...,...,...,...,...,...,...,...,...
2089147,2022-05-08 23:55:00,休,市原,京葉道路・館山自動車道接続部,1130006,1130001,上り,8.0,87.0,39.170
2089148,2022-05-08 23:55:00,休,富津中央,富津竹岡,1130041,1130046,下り,4.0,88.0,83.070
2089149,2022-05-08 23:55:00,休,市原,市原ＳＡ,1130006,1130011,下り,11.0,87.0,46.270
2089150,2022-05-08 23:55:00,休,京葉道路・館山自動車道接続部,市原,1130001,1130006,下り,17.0,93.0,39.170


In [20]:
# Expand the Kannetsu sections that are not edges of ic_subgraph and save the result
df_kannetsu_processed = fill_missing_sections(df_kannetsu)
df_kannetsu_processed.to_csv(f'{PROCESSED_DATA_DIR}/traffic/traffic_kannetsu_filled.csv', index=False)
df_kannetsu_processed

Unnamed: 0,datetime,is_holiday,start_name,end_name,start_code,end_code,direction,total,speed,KP
0,2021-04-01 00:00:00,平,三芳ＰＡ,所沢,1800011,1800006,上り,78.0,85.0,10.21
1,2021-04-01 00:00:00,平,前橋,駒寄ＰＡ,1800081,1800086,下り,17.0,89.0,96.60
2,2021-04-01 00:00:00,平,高崎,前橋,1800076,1800081,下り,32.0,88.0,88.20
3,2021-04-01 00:00:00,平,新座本線,所沢,1800004,1800006,下り,57.0,87.0,3.90
4,2021-04-01 00:00:00,平,大泉ＪＣＴ,新座本線,1110210,1800004,下り,57.0,87.0,3.90
...,...,...,...,...,...,...,...,...,...,...
10097563,2022-05-08 23:55:00,休,三芳ＰＡ,川越,1800011,1800016,下り,55.0,96.0,16.38
10097564,2022-05-08 23:55:00,休,赤城,赤城高原ＳＡ,1800096,1800101,下り,16.0,92.0,113.58
10097565,2022-05-08 23:55:00,休,赤城高原ＳＡ,昭和,1800101,1800106,下り,16.0,92.0,113.58
10097566,2022-05-08 23:55:00,休,嵐山小川,東松山,1800041,1800036,上り,51.0,94.0,44.50
