In [1]:
%cd /workspace

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

/workspace


In [2]:
INPUT = Path("/workspace/resources/input")
task_df = pd.read_parquet(INPUT / "task2_dataset_raw_train.parquet")
task_df.shape

(26621236, 5)

In [3]:
train_df = task_df.query("d < 60").reset_index(drop=True)
test_df = task_df.query("d >= 60").reset_index(drop=True)

In [13]:
morning = {k:0 for k in list(range(12, 36))}
midnight = {k:1 for k in list(range(36, 48)) +list(range(0, 12))}
t_label_mapping = {**morning, **midnight}
task_df["t"].map(t_label_mapping)

0           0
1           0
2           0
3           0
4           0
           ..
26621231    1
26621232    1
26621233    1
26621234    1
26621235    1
Name: t, Length: 26621236, dtype: int64

In [10]:
midnight

{36: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 41: 1,
 42: 1,
 43: 1,
 44: 1,
 45: 1,
 46: 1,
 47: 1,
 0: 1,
 1: 1,
 2: 1,
 3: 1,
 4: 1,
 5: 1,
 6: 1,
 7: 1,
 8: 1,
 9: 1,
 10: 1,
 11: 1}

In [40]:
agg_pred_df = (
    train_df.groupby("uid")[["x", "y"]]
    .median()
    .reset_index()
    .rename(columns={"x": "agg_x", "y": "agg_y"})
)

In [41]:
preds_df = pd.merge(test_df, agg_pred_df, on="uid", how="left")

In [42]:
preds_df

Unnamed: 0,uid,d,t,x,y,agg_x,agg_y
0,2381,60,14,155,99,153.0,97.0
1,2381,60,15,163,95,153.0,97.0
2,2381,60,16,165,93,153.0,97.0
3,2381,60,31,165,90,153.0,97.0
4,2381,60,32,176,89,153.0,97.0
...,...,...,...,...,...,...,...
4920685,1792,74,42,75,135,75.0,135.0
4920686,1792,74,43,75,135,75.0,135.0
4920687,1792,74,45,75,135,75.0,135.0
4920688,1792,74,46,75,135,75.0,135.0


In [43]:
mean_squared_error(
    y_true=preds_df[["x", "y"]].values,
    y_pred=preds_df[["agg_x", "agg_y"]],
    squared=False,
)

18.288804057251248

In [54]:
from geobleu import geobleu

In [96]:
sample_df = preds_df.sample(3000, random_state=None).reset_index(drop=True)
uids = preds_df["uid"].unique()[:10]
sample_df = preds_df[preds_df["uid"].isin(uids)]
print(sample_df.shape)

generated = sample_df[["d", "t", "agg_x", "agg_y"]].values.tolist()
reference = sample_df[["d", "t", "x", "y"]].values.tolist()

print(
    mean_squared_error(
        y_true=np.array(reference)[:, 2:],
        y_pred=np.array(generated)[:, 2:],
        squared=False,
    )
)
print(geobleu.calc_geobleu(generated, reference, processes=3))
print(geobleu.calc_dtw(generated, reference, processes=3))

(1451, 7)
22.19261281300223
0.04400734540859542
753.8944969804397


In [97]:
generated = sample_df[["d", "t", "x", "y"]].values
N = generated.shape[0]
a = np.array(list(range(N)))
b = pd.Series(a).sample(frac=0.5).to_numpy()
c = np.array(list(set(a) - set(b)))

generated[b, 2:] = generated[b, 2:] + 22
generated[c, 2:] = generated[c, 2:] - 22

reference = sample_df[["d", "t", "x", "y"]].values

print(
    mean_squared_error(
        y_true=np.array(reference)[:, 2:],
        y_pred=np.array(generated)[:, 2:],
        squared=False,
    )
)
print(geobleu.calc_geobleu(generated.tolist(), reference.tolist(), processes=3))
print(geobleu.calc_dtw(generated.tolist(), reference.tolist(), processes=3))

22.0
3.993111019597796e-05
1494.3116679191319


In [70]:
generated[0]

array([ 68,  10, 113,  37])

In [71]:
reference[0]

array([ 68,  10, 108,  32])

### 平均値 dict


In [107]:
df = task_df.groupby("uid")[["x", "y"]].agg(["mean", "median"])
df.columns = [f"{c[0]}_{c[1]}" for c in df.columns]
df.reset_index()

Unnamed: 0,uid,x_mean,x_median,y_mean,y_median
0,0,153.197119,163.0,66.939348,61.0
1,1,87.556010,84.0,90.565004,88.0
2,2,74.795282,68.0,86.117553,84.0
3,3,109.277736,119.0,98.190189,78.0
4,4,138.720566,137.0,62.939363,60.0
...,...,...,...,...,...
22495,22495,166.824935,165.0,35.695065,38.0
22496,22496,124.817362,127.0,70.235626,76.0
22497,22497,119.206284,114.0,48.766393,35.0
22498,22498,86.933761,88.0,28.407051,30.0


In [6]:
def make_xy_agg_mapping(df):
    agg_df = df.groupby("uid")[["x", "y"]].agg(["mean", "median"])
    agg_df.columns = [f"{c[0]}_{c[1]}" for c in agg_df.columns]
    agg_df = agg_df.reset_index()
    return agg_df


mapping_df = make_xy_agg_mapping(task_df)

In [17]:
mapping_df

Unnamed: 0,uid,x_mean,x_median,y_mean,y_median
0,0,153.197119,163.0,66.939348,61.0
1,1,87.556010,84.0,90.565004,88.0
2,2,74.795282,68.0,86.117553,84.0
3,3,109.277736,119.0,98.190189,78.0
4,4,138.720566,137.0,62.939363,60.0
...,...,...,...,...,...
22495,22495,166.824935,165.0,35.695065,38.0
22496,22496,124.817362,127.0,70.235626,76.0
22497,22497,119.206284,114.0,48.766393,35.0
22498,22498,86.933761,88.0,28.407051,30.0


In [19]:
merged_df = pd.merge(
    task_df, mapping_df[["uid", "x_mean", "y_mean"]], on="uid", how="left"
)
merged_df["x_diff"] = merged_df["x"] - merged_df["x_mean"]
merged_df["y_diff"] = merged_df["y"] - merged_df["y_mean"]
merged_df

Unnamed: 0,uid,d,t,x,y,x_diff,x_mean,y_mean,y_diff
0,2381,0,15,158,99,5.373030,152.626970,94.427320,4.572680
1,2381,0,16,167,90,14.373030,152.626970,94.427320,-4.427320
2,2381,0,19,167,88,14.373030,152.626970,94.427320,-6.427320
3,2381,0,20,167,88,14.373030,152.626970,94.427320,-6.427320
4,2381,0,23,168,88,15.373030,152.626970,94.427320,-6.427320
...,...,...,...,...,...,...,...,...,...
26621231,1792,74,42,75,135,-0.556635,75.556635,131.611904,3.388096
26621232,1792,74,43,75,135,-0.556635,75.556635,131.611904,3.388096
26621233,1792,74,45,75,135,-0.556635,75.556635,131.611904,3.388096
26621234,1792,74,46,75,135,-0.556635,75.556635,131.611904,3.388096


In [11]:
def calculate_new_value(row):
    value_from_dict = d.get(row["uid"], None)
    if value_from_dict is not None:
        return row["x"] - value_from_dict
    return None

In [14]:
tqdm.pandas()
d = dict(zip(mapping_df["uid"], mapping_df["x_mean"]))
task_df["x_diff"] = task_df.swifter.apply(calculate_new_value, axis=1)

OSError: [Errno 28] No space left on device

In [10]:
task_df

Unnamed: 0,uid,d,t,x,y,x_diff
0,2381,0,15,158,99,5.37303
1,2381,0,16,167,90,14.37303
2,2381,0,19,167,88,14.37303
3,2381,0,20,167,88,14.37303
4,2381,0,23,168,88,15.37303
...,...,...,...,...,...,...
26621231,1792,74,42,75,135,
26621232,1792,74,43,75,135,
26621233,1792,74,45,75,135,
26621234,1792,74,46,75,135,
