## ルールベースによる予測

### 設定

In [None]:
import pandas as pd
import numpy as np

import geobleu

from IPython.display import clear_output

### 定数定義

In [None]:
# Location of the input CSV, relative to the notebook's working directory.
INPUT_PATH = "../../data/cityB_challengedata_interpolated.csv"

# Inclusive uid range to run predictions for (narrow it when debugging or
# whenever a shorter run time is needed).
UID_VALID_S, UID_VALID_E = 20000, 21999

# Length of the training period in days: rows with d below this value are
# training data, the remaining rows are the prediction target.
D_TRAIN_RANGE = 60

# Number of 30-minute slots in one day (24 hours x 2 slots per hour).
T_RANGE = 24 * 2

# Daytime is defined as 07:00-20:00, expressed as 30-minute slot indices.
MORNING_T = 7 * 2  # 07:00
NIGHT_T = 20 * 2  # 20:00

# Number of days in a week (never changes).
DOW_COUNT = 7

### データ読み込み・分割

In [None]:
# Read the whole input CSV into memory; the bare name on the last line makes
# the notebook render the frame as the cell output.
df = pd.read_csv(filepath_or_buffer=INPUT_PATH)
df

In [None]:
# Day-of-week bucket: the day index modulo the length of a week.
# NOTE(review): which calendar weekday each bucket corresponds to depends on
# what date d == 0 is, which is not visible in this notebook.
df["dow"] = df["d"] % DOW_COUNT

# Daytime/nighttime label: slots in [MORNING_T, NIGHT_T) are "daytime",
# every other slot is "nighttime". The comparison is vectorized rather than
# applied row by row, which avoids one Python call per record.
is_daytime = (df["t"] >= MORNING_T) & (df["t"] < NIGHT_T)
df["t_label"] = np.where(is_daytime, "daytime", "nighttime")
df

In [None]:
# Keep only the users whose uid lies in [UID_VALID_S, UID_VALID_E]
# (both ends inclusive); these users are the ones used for validation.
uid_in_range = (df["uid"] >= UID_VALID_S) & (df["uid"] <= UID_VALID_E)
df_valid = df[uid_in_range]
df_valid

In [None]:
# Split the validation users' records by day index:
#   d <  D_TRAIN_RANGE -> training data
#   d >= D_TRAIN_RANGE -> records to predict
day_index = df_valid["d"]
df_train = df_valid[day_index.lt(D_TRAIN_RANGE)]
df_target = df_valid[day_index.ge(D_TRAIN_RANGE)]

### 欠損値補完テーブル作成

In [None]:
# Most frequent x and most frequent y per (uid, day-of-week, time slot) in the
# training data. When several values tie, the first entry returned by mode()
# is used. Output columns: uid, dow, t, dow_t_x, dow_t_y.
# NOTE(review): x and y are aggregated column by column, so the resulting
# (x, y) pair is not guaranteed to be a cell that was observed together.
df_dow_t_mode = (
    df_train
    .groupby(by=["uid", "dow", "t"])[["x", "y"]]
    .agg(lambda coords: coords.mode().iat[0])
    .rename(columns={"x": "dow_t_x", "y": "dow_t_y"})
    .reset_index()
)
df_dow_t_mode

In [None]:
# Most frequent x and most frequent y per (uid, time slot) in the training
# data, ignoring the day of week. When several values tie, the first entry
# returned by mode() is used. Output columns: uid, t, t_x, t_y.
df_t_mode = (
    df_train
    .groupby(by=["uid", "t"])[["x", "y"]]
    .agg(lambda coords: coords.mode().iat[0])
    .rename(columns={"x": "t_x", "y": "t_y"})
    .reset_index()
)
df_t_mode

In [None]:
# Most frequent x and most frequent y per (uid, day-of-week,
# daytime/nighttime label) in the training data. When several values tie, the
# first entry returned by mode() is used.
# Output columns: uid, dow, t_label, dow_t_label_x, dow_t_label_y.
df_dow_t_label_mode = (
    df_train
    .groupby(by=["uid", "dow", "t_label"])[["x", "y"]]
    .agg(lambda coords: coords.mode().iat[0])
    .rename(columns={"x": "dow_t_label_x", "y": "dow_t_label_y"})
    .reset_index()
)
df_dow_t_label_mode

In [None]:
# Most frequent x and most frequent y per (uid, daytime/nighttime label) in
# the training data. When several values tie, the first entry returned by
# mode() is used. Output columns: uid, t_label, t_label_x, t_label_y.
df_t_label_mode = (
    df_train
    .groupby(by=["uid", "t_label"])[["x", "y"]]
    .agg(lambda coords: coords.mode().iat[0])
    .rename(columns={"x": "t_label_x", "y": "t_label_y"})
    .reset_index()
)
df_t_label_mode

In [None]:
# Most frequent x and most frequent y per uid over the whole training data;
# this is the coarsest lookup table. When several values tie, the first entry
# returned by mode() is used. Output columns: uid, uid_x, uid_y.
df_uid_mode = (
    df_train
    .groupby(by=["uid"])[["x", "y"]]
    .agg(lambda coords: coords.mode().iat[0])
    .rename(columns={"x": "uid_x", "y": "uid_y"})
    .reset_index()
)
df_uid_mode

### 予測

In [None]:
# Attach every lookup table to the target rows. All joins are left joins, so
# each target row is kept and a key combination that never occurred in the
# training data simply leaves NaN in that table's columns.
lookup_tables = (
    (df_dow_t_mode, ["uid", "dow", "t"]),
    (df_t_mode, ["uid", "t"]),
    (df_dow_t_label_mode, ["uid", "dow", "t_label"]),
    (df_t_label_mode, ["uid", "t_label"]),
    (df_uid_mode, ["uid"]),
)

df_pred = df_target
for lookup, join_keys in lookup_tables:
    df_pred = df_pred.merge(lookup, how="left", on=join_keys)
df_pred

In [None]:
# Build the final prediction by priority: start from the first candidate
# column listed and, wherever it is missing, fall back to the next one
# (uid x dow x t -> uid x t -> uid x dow x t_label -> uid x t_label -> uid).
fallback_order = {
    "pred_x": ["dow_t_x", "t_x", "dow_t_label_x", "t_label_x", "uid_x"],
    "pred_y": ["dow_t_y", "t_y", "dow_t_label_y", "t_label_y", "uid_y"],
}

for pred_column, (primary, *fallbacks) in fallback_order.items():
    estimate = df_pred[primary]
    for candidate in fallbacks:
        estimate = estimate.fillna(df_pred[candidate])
    df_pred[pred_column] = estimate

In [None]:
# Final table: record keys, ground-truth coordinates and predicted coordinates.
df_pred.loc[:, ["uid", "d", "t", "x", "y", "pred_x", "pred_y"]]

### 精度検証

In [None]:
# Per-uid evaluation: score each user's predicted trajectory against the
# ground truth with geobleu's GEO-BLEU and DTW functions, collecting one value
# of each per uid.

# Value passed as `processes` to both geobleu scoring calls.
N_PROCESSES = 3

list_geobleu_val = []
list_dtw_val = []

# Split df_pred by uid once. Filtering the full frame with a boolean mask
# inside the loop would rescan every row for every uid.
pred_by_uid = {
    key: frame for key, frame in df_pred.groupby("uid", sort=False)
}
# Empty frame with the same columns, used for a uid that has no target rows
# (exactly what the boolean-mask selection would have produced).
no_rows = df_pred.iloc[0:0]

for uid in range(UID_VALID_S, UID_VALID_E + 1):
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    print(f"処理中のuid:{uid}")

    # Rows belonging to the current uid, in their original order.
    df_pred_uid = pred_by_uid.get(uid, no_rows)

    # Predicted trajectory as a list of (d, t, pred_x, pred_y) tuples.
    df_pred_xy = df_pred_uid[["d", "t", "pred_x", "pred_y"]]
    list_pred = [tuple(row) for row in df_pred_xy.to_records(index=False)]

    # Ground-truth trajectory as a list of (d, t, x, y) tuples.
    df_true_xy = df_pred_uid[["d", "t", "x", "y"]]
    list_true = [tuple(row) for row in df_true_xy.to_records(index=False)]

    # Score this uid and store the results.
    geobleu_val = geobleu.calc_geobleu(
        list_pred, list_true, processes=N_PROCESSES
    )
    list_geobleu_val.append(geobleu_val)

    dtw_val = geobleu.calc_dtw(list_pred, list_true, processes=N_PROCESSES)
    list_dtw_val.append(dtw_val)

In [None]:
# Report each metric averaged over all evaluated uids.
for metric_name, per_uid_scores in (
    ("geobleu", list_geobleu_val),
    ("dtw", list_dtw_val),
):
    print(f"{metric_name}:{np.mean(per_uid_scores)}")