In [None]:
import pandas as pd
import gc

In [None]:
# Load the raw training table; later cells read its uid, d, t, x and y columns.
df = pd.read_parquet(path="/workspace/resources/input/task1_dataset_raw_train.parquet")

In [None]:
# Users having at least one row with x == 999 are removed entirely
# (presumably 999 marks masked coordinates that must be predicted - TODO confirm).
test_uids = df[df["x"].eq(999)]["uid"].unique()
df = df.loc[~df["uid"].isin(test_uids)].reset_index(drop=True)

# Work on the first 10000 remaining users only; `uids` itself keeps every remaining uid.
uids = df["uid"].unique()
df = df.loc[df["uid"].isin(uids[:10000])]

In [None]:
# Split by user: the first 5000 uids form the training set, every later uid the validation set.
tr_uids, va_uids = uids[:5000], uids[5000:]

train_df = df.loc[df["uid"].isin(tr_uids)].reset_index(drop=True)
valid_df = df.loc[df["uid"].isin(va_uids)].reset_index(drop=True)

# The combined frame is no longer needed; release it before the heavier steps.
del df
gc.collect()

In [None]:
def assign_day_of_week(df):
    """Add ``dayofweek`` (``d`` modulo 7) and ``weekend`` columns to ``df``.

    ``weekend`` is True where ``dayofweek`` is 0 or 6. Which calendar days
    those residues correspond to depends on the dataset's day numbering.
    The frame is modified in place and also returned.
    """
    weekday = df["d"].mod(7).astype(int)
    df["dayofweek"] = weekday
    df["weekend"] = weekday.isin([6, 0])
    return df


def assign_t_labe(df):
    """Add a binary ``t_label`` column derived from the time slot ``t``.

    Slots 12..35 get label 0 and the remaining slots of 0..47 get label 1.
    Values of ``t`` outside 0..47 have no mapping and become NaN.
    The frame is modified in place and also returned.
    """
    slot_to_label = {slot: 0 if 12 <= slot < 36 else 1 for slot in range(48)}
    df["t_label"] = df["t"].map(slot_to_label)
    return df

def assign_detailed_t_label(df):
    """Add ``detailed_t_label``: the 48 time slots grouped into 12 equal bins.

    Slot ``t`` is mapped to ``t // 4`` (labels 0..11). Values of ``t`` outside
    0..47 have no mapping and become NaN.
    The frame is modified in place and also returned.
    """
    n_slots, n_bins = 48, 12
    slots_per_bin = n_slots // n_bins  # 48 divided by 12
    slot_to_bin = {slot: slot // slots_per_bin for slot in range(n_slots)}
    df["detailed_t_label"] = df["t"].map(slot_to_bin)
    return df
    

# Add the day-of-week and time-of-day features to both splits.
train_df = assign_detailed_t_label(assign_t_labe(assign_day_of_week(train_df)))
valid_df = assign_detailed_t_label(assign_t_labe(assign_day_of_week(valid_df)))

# Baseline: mean position per (user, weekend flag, time slot), computed from rows with d < 60 only.
keys = ["uid", "weekend", "t"]
agg_df = (
    train_df[train_df["d"] < 60]
    .groupby(keys)[["x", "y"]]
    .mean()
    .rename(columns={"x": "agg_x", "y": "agg_y"})
    .reset_index()
)

# When the cell is re-run, discard the columns from the previous merge before merging again.
if "agg_x" in train_df.columns:
    train_df = train_df.drop(columns=["agg_x", "agg_y"])

# Left join: rows whose key has no d < 60 history get NaN in agg_x / agg_y.
train_df = train_df.merge(agg_df, on=keys, how="left")
train_df.head()

In [None]:
# Rows with d >= 60 are the evaluation period; rows containing any NaN
# (e.g. no aggregated baseline for their key) are discarded.
preds_df = train_df[train_df["d"] >= 60].dropna().reset_index(drop=True)
preds_df

In [None]:
from sklearn.metrics import mean_squared_error
import geobleu
from tqdm import tqdm

In [None]:
# Ground truth and baseline prediction share the same rows and column layout (uid, d, t, x, y).
reference = preds_df.loc[:, ["uid", "d", "t", "x", "y"]]
generated = preds_df[["uid", "d", "t"]].assign(x=preds_df["agg_x"], y=preds_df["agg_y"])

In [None]:
# Score the baseline on (at most) the first 100 users: geobleu.calc_geobleu and
# geobleu.calc_dtw are computed per user and averaged over the evaluated users.
geobleu_score = 0
dtw_score = 0

eval_uids = reference["uid"].unique()[:100]
for uid in tqdm(eval_uids):
    # Each trajectory is passed as a list of [d, t, x, y] rows for one user.
    a_generated = generated.loc[generated["uid"] == uid, ["d", "t", "x", "y"]].values.tolist()
    a_reference = reference.loc[reference["uid"] == uid, ["d", "t", "x", "y"]].values.tolist()

    geobleu_score += geobleu.calc_geobleu(a_generated, a_reference, processes=3)
    dtw_score += geobleu.calc_dtw(a_generated, a_reference, processes=3)

geobleu_score = geobleu_score / len(eval_uids)
dtw_score = dtw_score / len(eval_uids)

print(f"geobleu_score={geobleu_score}, dtw_score={dtw_score}")

# RMSE over all evaluation rows (not only the 100 users above).
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# removed in 1.6. Taking the square root of the per-column MSE and averaging the
# two columns reproduces its multi-output result on every scikit-learn version.
per_axis_mse = mean_squared_error(
    y_true=reference[["x", "y"]].values,
    y_pred=generated[["x", "y"]].values,
    multioutput="raw_values",
)
rmse = (per_axis_mse ** 0.5).mean()
print(f"rmse={rmse}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_uid(df, uids):
    """
    Plot the x and y series of the selected users, one row of panels per uid.

    Each row has two panels (x on the left, y on the right). A panel shows the
    observed values over the ``time`` column, coloured by whether the row has
    ``d >= 60`` (blue: False, red: True), the matching ``agg_x`` / ``agg_y``
    series, and horizontal guides at the user's mean and mean +/- 10.

    df: DataFrame - input data; must contain the columns uid, d, time, x, y,
        agg_x and agg_y. It is not modified.
    uids: iterable - uids to plot. Nothing is drawn when it is empty.
    """
    uids = list(uids)
    n_uids = len(uids)
    if n_uids == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    # A dict pins the colour of each hue level. A list is assigned by position,
    # which is wrong when a user has only one of the two levels.
    palette = {False: "blue", True: "red"}

    # squeeze=False keeps axs two-dimensional, so axs[i, j] also works for a single uid.
    fig, axs = plt.subplots(n_uids, 2, figsize=(15, 5*n_uids), squeeze=False)

    for i, uid in enumerate(uids):
        # Flag the d >= 60 rows on the per-user copy rather than adding a column
        # to the caller's frame.
        df_uid = df[df['uid'] == uid]
        df_uid = df_uid.assign(color_condition=df_uid['d'] >= 60)

        for j, col in enumerate(['x', 'y']):
            ax = axs[i, j]
            mean_val = df_uid[col].mean()

            # Mean line
            ax.axhline(mean_val, color='green', linestyle='--', label='Mean')

            # Mean +/- 10 lines
            ax.axhline(mean_val + 10, color='purple', linestyle=':', label='Mean + 10')
            ax.axhline(mean_val - 10, color='purple', linestyle=':', label='Mean - 10')

            sns.lineplot(x='time', y=col, hue='color_condition', data=df_uid, palette=palette, ax=ax, legend=False, label=col)

            # Overlay the aggregated series that belongs to this coordinate
            if col == 'x':
                agg_col, agg_color = 'agg_x', 'orange'
            else:
                agg_col, agg_color = 'agg_y', 'cyan'
            sns.lineplot(x='time', y=agg_col, data=df_uid, color=agg_color, ax=ax, label=agg_col)

            ax.set_title(f'UID {uid} - {col} value over time')
            ax.tick_params(axis='x', rotation=45)
            ax.grid(True)
            ax.set_facecolor("#f5f5f5")
            ax.legend()

    plt.tight_layout()
    plt.show()

# Integer key "ddtt": zero-padded day followed by the zero-padded time slot; used as the plot's x axis.
train_df["time"] = (
    train_df["d"].astype(str).str.zfill(2)
    .str.cat(train_df["t"].astype(str).str.zfill(2))
    .astype(int)
)

# Plot five randomly chosen users (no seed, so the selection changes between runs).
# NOTE: this rebinds `uids`, which earlier held every uid used for the train/valid split.
uids = pd.Series(train_df["uid"].unique()).sample(n=5, random_state=None)
plot_uid(train_df, uids)
