## cityB validation

### 設定

In [1]:
# BLEUのインストール
!git clone https://github.com/yahoojapan/geobleu.git
!pip install /kaggle/working/geobleu/

Cloning into 'geobleu'...
remote: Enumerating objects: 156, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (107/107), done.[K
remote: Total 156 (delta 75), reused 126 (delta 46), pack-reused 0 (from 0)[K
Receiving objects: 100% (156/156), 26.72 KiB | 5.34 MiB/s, done.
Resolving deltas: 100% (75/75), done.
Processing ./geobleu
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: geobleu
  Building wheel for geobleu (setup.py) ... [?25l- done
[?25h  Created wheel for geobleu: filename=geobleu-0.3-py3-none-any.whl size=5037 sha256=07f9a8a2639ae3d362902a2b61783338abf8b22d063a5e4e7cb0ad4d4c4d5615
  Stored in directory: /tmp/pip-ephem-wheel-cache-htpkcmov/wheels/ed/5d/64/4ba68bf198dc931381fb27d7eec45b1a2de54ca3eec437763a
Successfully built geobleu
Installing collected packages: geobleu
Successfully installed geobleu-0.3


In [2]:
import pandas as pd
import numpy as np
import geobleu

### 定数定義

In [3]:
# 7:00～20:00を日中と定義
MORNING_T = 14 # AM7時
NIGHT_T = 40 # PM8時

# 曜日の数（不変）
DOW_COUNT = 7

# 欠損補完用　連続する欠損時間
INTEROILATE_N = 10

In [4]:
# INPUT_PATH = "../../01_public/humob-challenge-2024/input/cityB_challengedata.csv.gz"
INPUT_PATH = "/kaggle/input/humob2024/cityB_challengedata.csv/hiroshima_challengedata.csv"

### データ読み込み・分割

In [5]:
df_city_b = pd.read_csv(INPUT_PATH)
df_city_b.head(3)

Unnamed: 0,uid,d,t,x,y
0,0,0,20,80,99
1,0,0,21,81,97
2,0,0,25,83,102


In [6]:
# df_city_b["dow"] = df_city_b["d"] % DOW_COUNT
# df_city_b["t_label"] = (
#     df_city_b["t"]
#     .apply(lambda x: "daytime" if MORNING_T <= x < NIGHT_T else "nighttime")  
# )
# df_city_b.head(3)

In [7]:
# uidが20000~21999のデータを検証に利用
df_city_b_valid = (
    df_city_b
    .loc[df_city_b["uid"].between(20000, 21999)]
#     .loc[df_city_b["uid"].between(20000, 20010)]

)
df_city_b_valid.head(3)

Unnamed: 0,uid,d,t,x,y
20253615,20000,0,0,79,89
20253616,20000,0,2,79,89
20253617,20000,0,5,79,89


In [8]:
# dが60前後でデータを分割
df_city_b_train = (
    df_city_b_valid
    .loc[df_city_b_valid["d"] < 60]
)

df_city_b_answer = (
    df_city_b_valid
    .loc[df_city_b_valid["d"] >= 60]
)

### trainの欠損値を線形補完

In [9]:
def get_full_df(df, d_range, t_range):
    """
    input: uid, d, t, x, yのdf
    output: 全uidでd×tを揃えたdfを作成。x, yが欠損値の場合はnull。
    """
    # 全てのd, tの範囲のデータフレームを作成
    full_range_d = pd.DataFrame({'d': range(0, d_range)})
    full_range_t = pd.DataFrame({'t': range(0, t_range)})

    # uidのユニークな値を取得し、クロス結合を行う
    full_range_uid = pd.DataFrame({'uid': df['uid'].unique()})

    # クロス結合を行う（全てのuid, d, tの組み合わせ）
    full_range = (full_range_uid.assign(key=1)
                  .merge(full_range_d.assign(key=1), on='key')
                  .merge(full_range_t.assign(key=1), on='key')
                  .drop('key', axis=1)
                 )

    # x, yの値を付与し、欠損値はnullとする
    df_train_full = (full_range
                     .merge(df, on=['uid', 'd', 't'], how='left')
                     .sort_values(by=['uid', 'd', 't'])
                    )

    return df_train_full

In [10]:
def interpolate_missing_values(df):
    """
    input: uid, d, t, x, yのdf、uidにつきd×tは60×48、x, y欠損値はnull
    output: 3時間未満の連続するx, y欠損を直近の前後のx, y値を用いて線形補完したdf
    """
    df_fill = df.fillna(-1).copy()  # 欠損値を一時的に -1 に変換
    skip_list = []
    interpolate_num = 0
    
    for i in range(len(df_fill)):
#     for i in range(20):
        prev_valid = None
        next_valid = None
        time_diff = None
        
        # skip_listのインデックスは飛ばす
        if i in skip_list:
            continue
        
        # x, yが欠損の場合
        if df_fill.loc[i, "x"] == -1 and df_fill.loc[i, "y"] == -1:
            # 前の有効なデータを探す
            if prev_valid is None and i != 0:
                if df_fill.loc[i - 1, "x"] != -1:
                    prev_valid = df_fill.loc[i - 1]  # 前のデータが有効な場合
            
            # 次の有効なデータを探す
            if next_valid is None:
                for j in range(i + 1, len(df)):
                    if df_fill.loc[j, "x"] != -1 and df_fill.loc[j, "y"] != -1:
                        next_valid = df_fill.loc[j]
                        break

            # 前後の有効データが見つかった場合
            if prev_valid is not None and next_valid is not None:
#                 print(i, prev_valid, next_valid)
                time_diff = int(next_valid["t"] - prev_valid["t"])
                
                # 3時間未満の欠損なら線形補完
                if (time_diff != 0) and (time_diff < N*2):
                    x_diff = next_valid["x"] - prev_valid["x"]
                    y_diff = next_valid["y"] - prev_valid["y"]
                    
                    for j in range(i, i + time_diff -1):
                        time_from_prev = df.loc[j, "t"] - prev_valid["t"]
                        # 欠損値を線形補完
                        df_fill.loc[j, "x"] = prev_valid["x"] + (x_diff / time_diff) * time_from_prev
                        df_fill.loc[j, "y"] = prev_valid["y"] + (y_diff / time_diff) * time_from_prev
                    
                    # 補完済みのインデックスをリストに追加
                    skip_list = [j for j in range(i, i + time_diff - 1)]
                    interpolate_num += len(skip_list)
#                     print(skip_list)
#                     print(df_fill.iloc[i-1:i+time_diff+3,:])
                    
#                 else:
#                     print(f"Skipping interpolation for index {i}.")
    print(f"補完数：{interpolate_num}")
    # 欠損値が補完されたdfを返す
    df_fill = df_fill[df_fill["x"]!=-1]
    df_fill["x"] =df_fill["x"].astype(int)
    df_fill["y"] =df_fill["y"].astype(int)

    
    return df_fill

In [11]:
df_city_b_train_full = get_full_df(df_city_b_train, d_range=60, t_range=48)
df_city_b_train_full

Unnamed: 0,uid,d,t,x,y
0,20000,0,0,79.0,89.0
1,20000,0,1,,
2,20000,0,2,79.0,89.0
3,20000,0,3,,
4,20000,0,4,,
...,...,...,...,...,...
5759995,21999,59,43,,
5759996,21999,59,44,,
5759997,21999,59,45,,
5759998,21999,59,46,,


In [12]:
df_city_b_train_ip = interpolate_missing_values(df_city_b_train_full)

補完数：1269880


In [13]:
len(df_city_b_train_full)

5760000

In [14]:
def add_day_night_time(df):
    df["dow"] = df["d"] % DOW_COUNT
    df["t_label"] = (
        df["t"]
        .apply(lambda x: "daytime" if MORNING_T <= x < NIGHT_T else "nighttime")  
    )
    return df
df_city_b_answer = add_day_night_time(df_city_b_answer)
df_city_b_train_ip = add_day_night_time(df_city_b_train_ip)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dow"] = df["d"] % DOW_COUNT
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["t_label"] = (


In [15]:
df_city_b_train_full.head(10),df_city_b_train_ip.head(10)

(     uid  d  t     x     y
 0  20000  0  0  79.0  89.0
 1  20000  0  1   NaN   NaN
 2  20000  0  2  79.0  89.0
 3  20000  0  3   NaN   NaN
 4  20000  0  4   NaN   NaN
 5  20000  0  5  79.0  89.0
 6  20000  0  6   NaN   NaN
 7  20000  0  7   NaN   NaN
 8  20000  0  8   NaN   NaN
 9  20000  0  9   NaN   NaN,
      uid  d  t   x   y  dow    t_label
 0  20000  0  0  79  89    0  nighttime
 1  20000  0  1  79  89    0  nighttime
 2  20000  0  2  79  89    0  nighttime
 3  20000  0  3  79  89    0  nighttime
 4  20000  0  4  79  89    0  nighttime
 5  20000  0  5  79  89    0  nighttime
 6  20000  0  6  79  89    0  nighttime
 7  20000  0  7  79  89    0  nighttime
 8  20000  0  8  79  89    0  nighttime
 9  20000  0  9  79  89    0  nighttime)

In [16]:
df_city_b_train_ip.head()

Unnamed: 0,uid,d,t,x,y,dow,t_label
0,20000,0,0,79,89,0,nighttime
1,20000,0,1,79,89,0,nighttime
2,20000,0,2,79,89,0,nighttime
3,20000,0,3,79,89,0,nighttime
4,20000,0,4,79,89,0,nighttime


In [17]:
df_city_b_answer.head()

Unnamed: 0,uid,d,t,x,y,dow,t_label
20254678,20000,60,11,79,89,4,nighttime
20254679,20000,60,15,79,89,4,daytime
20254680,20000,60,19,79,89,4,daytime
20254681,20000,60,20,79,89,4,daytime
20254682,20000,60,21,79,89,4,daytime


In [18]:
df_city_b_train =df_city_b_train_ip

### 欠損値補完テーブル作成

In [19]:
# uid×dow×t毎に最頻値を算出
df_dow_t_mode = (
    df_city_b_train
    .groupby(["uid", "dow", "t"])
    [["x", "y"]]
    .agg(
        lambda x: x.mode().iloc[0]
    )
    .reset_index()

    .rename(
        columns={"x": "dow_t_x", "y": "dow_t_y"}
    )
)
df_dow_t_mode.head(3)

Unnamed: 0,uid,dow,t,dow_t_x,dow_t_y
0,20000,0,0,79,88
1,20000,0,1,79,89
2,20000,0,2,79,89


In [20]:
# uid×t毎に最頻値を算出
df_t_mode = (
    df_city_b_train
    .groupby(["uid", "t"])
    [["x", "y"]]
    .agg(
        lambda x: x.mode().iloc[0]
    )

    .reset_index()

    .rename(
        columns={"x": "t_x", "y": "t_y"}
    )
)
df_t_mode.head(3)

Unnamed: 0,uid,t,t_x,t_y
0,20000,0,79,89
1,20000,1,79,89
2,20000,2,79,89


In [21]:
# uid×dow×t_label毎に最頻値を算出
df_dow_t_label_mode = (
    df_city_b_train
    .groupby(["uid", "dow", "t_label"])
    [["x", "y"]]
    .agg(
        lambda x: x.mode().iloc[0]
    )
    .reset_index()

    .rename(
        columns={"x": "dow_t_label_x", "y": "dow_t_label_y"}
    )
)
df_dow_t_label_mode.head(3)

Unnamed: 0,uid,dow,t_label,dow_t_label_x,dow_t_label_y
0,20000,0,daytime,79,89
1,20000,0,nighttime,79,89
2,20000,1,daytime,79,89


In [22]:
# uid×t_label毎に最頻値を算出
df_t_label_mode = (
    df_city_b_train
    .groupby(["uid", "t_label"])
    [["x", "y"]]
    .agg(
        lambda x: x.mode().iloc[0]
    )

    .reset_index()

    .rename(
        columns={"x": "t_label_x", "y": "t_label_y"}
    )
)
df_t_label_mode.head(3)

Unnamed: 0,uid,t_label,t_label_x,t_label_y
0,20000,daytime,79,89
1,20000,nighttime,79,89
2,20001,daytime,78,99


In [23]:
# uid毎に最頻値を算出
df_uid_mode = (
    df_city_b_train
    .groupby(["uid"])
    [["x", "y"]]
    .agg(
        lambda x: x.mode().iloc[0]
    )

    .reset_index()

    .rename(
        columns={"x": "uid_x", "y": "uid_y"}
    )
)
df_uid_mode.head(3)

Unnamed: 0,uid,uid_x,uid_y
0,20000,79,89
1,20001,78,99
2,20002,80,100


### 予測

In [24]:
# 欠損値補完テーブルを結合
df_city_b_pred = (
    df_city_b_answer

    .merge(df_dow_t_mode, on=["uid", "dow", "t"], how="left")
    .merge(df_t_mode, on=["uid", "t"], how="left")
    .merge(df_dow_t_label_mode, on=["uid", "dow", "t_label"], how="left")
    .merge(df_t_label_mode, on=["uid", "t_label"], how="left")
    .merge(df_uid_mode, on=["uid"], how="left")
)

In [25]:
# 優先順位をつけて最終的な予測値を算出
df_city_b_pred["pred_x"] = (
    df_city_b_pred["dow_t_x"]
    .fillna(df_city_b_pred["t_x"])
    .fillna(df_city_b_pred["dow_t_label_x"])
    .fillna(df_city_b_pred["t_label_x"])
    .fillna(df_city_b_pred["uid_x"])
)

df_city_b_pred["pred_y"] = (
    df_city_b_pred["dow_t_y"]
    .fillna(df_city_b_pred["t_y"])
    .fillna(df_city_b_pred["dow_t_label_y"])
    .fillna(df_city_b_pred["t_label_y"])
    .fillna(df_city_b_pred["uid_y"])
)

In [26]:
# 最終的なdf
df_city_b_pred[["uid", "d", "t", "x", "y", "pred_x", "pred_y"]]

Unnamed: 0,uid,d,t,x,y,pred_x,pred_y
0,20000,60,11,79,89,79.0,89.0
1,20000,60,15,79,89,79.0,89.0
2,20000,60,19,79,89,80.0,88.0
3,20000,60,20,79,89,79.0,88.0
4,20000,60,21,79,89,79.0,89.0
...,...,...,...,...,...,...,...
342560,21999,74,27,80,143,80.0,143.0
342561,21999,74,28,80,143,80.0,143.0
342562,21999,74,36,80,143,80.0,143.0
342563,21999,74,37,81,144,80.0,143.0


### 精度検証

In [27]:
list_geobleu_val = []
list_dtw_val = []

for i in range(20000, 22000):
# for i in range(20000, 20010):
    
    
    # 該当uidのデータを抽出
    df = (
        df_city_b_pred
        .loc[df_city_b_pred["uid"] == i]
    )

    # 予測値のリスト
    df_pred = df[["d", "t", "pred_x", "pred_y"]]
    list_pred = [tuple(row) for row in df_pred.to_records(index=False)]

    # 正解値のリスト
    df_answer = df[["d", "t", "x", "y"]]
    list_answer = [tuple(row) for row in df_answer.to_records(index=False)]

    # スコア算出
    geobleu_val = geobleu.calc_geobleu(list_pred, list_answer, processes=3)
    list_geobleu_val.append(geobleu_val)
    
    dtw_val = geobleu.calc_dtw(list_pred, list_answer, processes=3)
    list_dtw_val.append(dtw_val)


In [28]:
print(f"geobleu:{np.mean(list_geobleu_val)}")
print(f"dtw:{np.mean(list_dtw_val)}")

geobleu:0.2654042410092569
dtw:39.03152235582942
