In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Load the competition tables. In train/test the column whose header is the
# empty string is renamed to "idx" so later cells can refer to it by name.
_EMPTY_HEADER_TO_IDX = {"": "idx"}

train = pl.read_csv("data/train.csv").rename(_EMPTY_HEADER_TO_IDX)
test = pl.read_csv("data/test.csv").rename(_EMPTY_HEADER_TO_IDX)
sample_submission = pl.read_csv("data/sample_submission.csv")

In [2]:
feat = "feat00"

# Stack train (without the "health" label) on top of test so that both parts
# go through exactly the same encoding steps.
train_test = pl.concat([train.drop("health"), test])

# Column groups
cols_cat_int = ["boro_ct", "cb_num"]  # integer-typed, but treated as categorical features
# Numeric features: every integer column that is not listed in cols_cat_int.
cols_num = [
    c
    for c in train_test.select(pl.col(pl.INTEGER_DTYPES)).columns
    if c not in cols_cat_int
]
# Categorical features: every string column except "created_at", plus cols_cat_int.
cols_cat = [
    c for c in train_test.select(pl.col(pl.Utf8)).columns if c != "created_at"
] + cols_cat_int

# The integer-coded categoricals are converted from integer to string first.
train_test = train_test.with_columns(
    [pl.col(c).cast(pl.Utf8) for c in cols_cat_int]
)

# Ordinal encoding: string -> Categorical -> UInt32 for every categorical column.
train_test = train_test.with_columns(
    [pl.col(c).cast(pl.Categorical).cast(pl.UInt32) for c in cols_cat]
)

# Replace "created_at" with the number of whole days elapsed since the earliest
# date in the combined data (noted by the author as 15/5/19).
created = train_test["created_at"].str.to_datetime()
dates = (created - created.min()).dt.total_days()
train_test = train_test.with_columns(dates.alias("created_at"))

# Split back into train / test.
# NOTE(review): this assumes train rows have idx < len(train) and test rows have
# idx >= len(train) -- confirm against the raw CSVs.
n_train = len(train)

# Train features for the multiclass classifier: re-attach the "health" label by idx.
train_feat = (
    train_test
    .filter(pl.col("idx") < n_train)
    .join(train.select(["idx", "health"]), on="idx", how="left")
)
train_feat.write_csv(f"feat/feat_train_{feat}.csv")

# Test features
test_feat = train_test.filter(pl.col("idx") >= n_train)
test_feat.write_csv(f"feat/feat_test_{feat}.csv")

In [7]:
from sklearn.model_selection import StratifiedKFold

In [15]:
feat = "feat01"

# Build one binary (one-vs-rest) label column per class for target encoding:
# health_is_<h> is 1 where "health" == h and 0 otherwise, for h in 0, 1, 2.
# NOTE(review): range(3) presumes "health" takes exactly the values 0..2 -- confirm.
train = train.with_columns(
    [
        pl.when(pl.col("health") == h).then(1).otherwise(0).alias(f"health_is_{h}")
        for h in range(3)
    ]
)

# Column used for the single-column experiment below (first categorical feature).
col = cols_cat[0]

# Stratified K-fold on the original multiclass label.
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for fold, (idx_obj, idx_feat) in enumerate(skf.split(train, train["health"])):
    pass  # placeholder: per-fold encoding is not implemented in this cell

# TODO: compute the mean of the target for each category over the whole
# training data. (No statement is left half-written here, so the cell parses
# and the label columns above are created on a fresh kernel.)


curb_loc
u32
0
0
0
0
0
0
0
0
0
0


### target encoding

In [173]:
from sklearn.model_selection import KFold

# Out-of-fold target encoding of the categorical features against one binary
# target column. Produces:
#   tenc_df_train -- one "<col>_tenc_<target>" column per feature, encoded
#                    out-of-fold (each row uses means from the other folds)
#   tenc_df_test  -- the same columns for test, encoded with means computed on
#                    the whole training data
cat_df_train = train_feat[cols_cat]
target_train = train["health_is_0"]
cat_df_test = test_feat[cols_cat]

# target_train must be a pl.Series: its .name and .to_frame() are used below.
assert isinstance(target_train, pl.Series)

# "idx" here is the positional row number; it is only a join key for
# re-assembling the out-of-fold pieces and is dropped at the end.
tenc_df_train = pl.DataFrame({"idx": np.arange(len(train))})
tenc_df_test = pl.DataFrame()

# These are categorical values, so cast everything to string.
cat_df_train = cat_df_train.cast(pl.Utf8)
cat_df_test = cat_df_test.cast(pl.Utf8)

# Loop-invariant pieces, built once instead of once per column.
col_target = target_train.name
xy_train = pl.concat([cat_df_train, target_train.to_frame()], how="horizontal")
# With shuffle=True and an integer random_state the splits are reproducible, so
# materialising them once yields the same folds for every column.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
folds = list(kf.split(xy_train))


def _target_mean_mapping(frame, col, col_target):
    """Return a {category value: mean of `col_target`} dict computed on `frame`."""
    target_mean = frame.group_by(col).mean()[[col, col_target]]
    mapping = dict(target_mean.to_numpy())
    mapping[None] = np.nan  # a missing value stays missing after encoding
    return mapping


def _encode(values, mapping, name):
    """Map `values` through `mapping`; anything not among its keys becomes NaN."""
    # NOTE(review): the string codes are cast to Float64 before `replace`, so this
    # relies on polars coercing the mapping's keys to the column dtype -- confirm
    # on the installed polars version before changing this call.
    return values.cast(pl.Float64).replace(mapping, default=np.nan).rename(name)


# Process one column at a time.
for col in cat_df_train.columns:
    tenc_name = f"{col}_tenc_{col_target}"

    # Target encoding (test): per-category target mean over the whole training data.
    full_mapping = _target_mean_mapping(xy_train, col, col_target)
    tenc_df_test = tenc_df_test.with_columns(
        _encode(cat_df_test[col], full_mapping, tenc_name)
    )

    # Target encoding (train, out-of-fold): the rows in idx_enc are encoded with
    # means computed only on the remaining rows idx_obj.
    tenc_df_oof_lst = []
    for idx_obj, idx_enc in folds:
        fold_mapping = _target_mean_mapping(xy_train[idx_obj], col, col_target)
        tenc_oof = _encode(xy_train[idx_enc, col], fold_mapping, tenc_name)
        tenc_df_oof_lst.append(pl.DataFrame({"idx": idx_enc}).with_columns(tenc_oof))

    # Gather the per-fold pieces back into original row order via the idx key.
    tenc_df_train = tenc_df_train.join(pl.concat(tenc_df_oof_lst), on="idx", how="left")

tenc_df_train = tenc_df_train.drop("idx")


In [None]:
# NOTE: this whole cell is disabled (every line is commented out). It is a copy
# of the target-encoding cell with an empty HoldoutTargetEncoding class stub
# added; nothing here is executed.
# from sklearn.model_selection import KFold

# cat_df_train = train_feat[cols_cat]
# target_train = train["health_is_0"]
# cat_df_test = test_feat[cols_cat]

# class HoldoutTargetEncoding:
#     def __init__(self):
#         pass

# tenc_df_train = pl.DataFrame({"idx": np.arange(len(train))})
# tenc_df_test = pl.DataFrame()

# # These are categorical values, so cast everything to string
# cat_df_train = cat_df_train.cast(pl.Utf8)
# cat_df_test = cat_df_test.cast(pl.Utf8)

# # target_train must be a pl.Series
# assert isinstance(target_train, pl.series.series.Series)

# # Process one column at a time
# for col in cat_df_train.columns:
    
#     # Compute the mean of the target for each category over the whole training data
#     col_target = target_train.name
#     xy_train = pl.concat([cat_df_train, target_train.to_frame()], how="horizontal")
#     target_mean = xy_train.group_by(col).mean()[[col, col_target]]
#     mapping = dict(target_mean.to_numpy())
#     mapping[None] = np.nan # a missing value stays missing after encoding

#     # Target Encoding (test)
#     # Values that are not among the mapping's keys become missing
#     tenc_test = cat_df_test[col].cast(pl.Float64).replace(mapping, default=np.nan)
#     tenc_test = tenc_test.rename(f"{col}_tenc_{col_target}")
#     tenc_df_test = tenc_df_test.with_columns(tenc_test)

#     # Split the training data
#     tenc_df_oof_lst = []
#     kf = KFold(n_splits=5, shuffle=True, random_state=0)
#     for idx_obj, idx_enc in kf.split(xy_train):
        
#         # Compute the mean of the target for each category out-of-fold
#         target_mean = xy_train[idx_obj].group_by(col).mean()[[col, col_target]]
#         mapping = dict(target_mean.to_numpy())
#         mapping[None] = np.nan # a missing value stays missing after encoding

#         # Target Encoding (train, out-of-fold)
#         # Values that are not among the mapping's keys become missing
#         tenc_oof = xy_train[idx_enc, col].cast(pl.Float64).replace(mapping, default=np.nan)
#         tenc_oof = tenc_oof.rename(f"{col}_tenc_{col_target}")
#         tenc_df_oof = pl.DataFrame({"idx": idx_enc}).with_columns(tenc_oof)
#         tenc_df_oof_lst.append(tenc_df_oof)

#     # Gather the results of all folds
#     tenc_df_train = tenc_df_train.join(pl.concat(tenc_df_oof_lst), on="idx", how="left")

# tenc_df_train = tenc_df_train.drop("idx")


In [174]:

# Persist both target-encoded feature tables as CSV files in the working directory.
for _frame, _path in (
    (tenc_df_train, "tenc_df_train.csv"),
    (tenc_df_test, "tenc_df_test.csv"),
):
    _frame.write_csv(_path)

In [175]:
# Dump the current `train` frame to a scratch CSV in the working directory.
# NOTE(review): which columns it holds depends on the cells run before this one
# (e.g. the health_is_* label columns) -- confirm before relying on the file.
train.write_csv("train_tmp.csv")