In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Load the train/test tables. Their first CSV column has an empty header,
# so it is renamed to "idx" to make it addressable by name.
train, test = (
    pl.read_csv(csv_path).rename({"": "idx"})
    for csv_path in ("data/train.csv", "data/test.csv")
)
# Submission template (read as-is, no renaming).
sample_submission = pl.read_csv("data/sample_submission.csv")

In [2]:
feat = "feat00"

# Stack train (minus the target column) on top of test so that both
# splits go through exactly the same encoding steps.
train_test = pl.concat([train.drop("health"), test])

# Column groups
cols_cat_int = ["boro_ct", "cb_num"]  # integer-typed, but treated as categorical features
integer_columns = train_test.select(pl.col(pl.INTEGER_DTYPES)).columns
string_columns = train_test.select(pl.col(pl.Utf8)).columns
cols_num = [name for name in integer_columns if name not in cols_cat_int]  # numeric features
cols_cat = [name for name in string_columns if name != "created_at"] + cols_cat_int  # categorical features

# Integer-coded categoricals are turned into strings first so they can be
# ordinal-encoded the same way as the genuine string columns.
train_test = train_test.with_columns(
    [pl.col(name).cast(pl.Utf8) for name in cols_cat_int]
)

# Ordinal encoding: string -> Categorical -> its UInt32 physical code.
train_test = train_test.with_columns(
    [pl.col(name).cast(pl.Categorical).cast(pl.UInt32) for name in cols_cat]
)

# Replace "created_at" with the number of whole days elapsed since the
# earliest date in the combined data (15/5/19 according to the original note).
created_at = train_test["created_at"].str.to_datetime()
elapsed = created_at - created_at.min()
dates = pl.Series(elapsed.dt.total_days())
train_test = train_test.with_columns(dates.alias("created_at"))

# Split back into train / test.
# NOTE(review): this assumes "idx" runs 0..len(train)-1 for train rows and
# continues from len(train) for test rows -- confirm against the CSV files.
n_train = len(train)

# Train features (with the multiclass target re-attached via idx)
train_feat = (
    train_test
    .filter(pl.col("idx") < n_train)
    .join(train.select(["idx", "health"]), on="idx", how="left")
)
train_feat.write_csv(f"feat/feat_train_{feat}.csv")

# Test features
test_feat = train_test.filter(pl.col("idx") >= n_train)
test_feat.write_csv(f"feat/feat_test_{feat}.csv")

In [3]:
from target_encoder import TargetEncoder

feat = "feat01"

# Build one binary (one-vs-rest) label column per health class; each is used
# as the target for a separate target-encoding pass below.
train = train.with_columns([pl.when(pl.col("health") == h).then(1).otherwise(0).alias(f"health_is_{h}")
                            for h in range(3)])

# Categorical columns that will be target-encoded
cat_df_train = train_feat[cols_cat]
cat_df_test = test_feat[cols_cat]

# Three-class problem: encode once per target class.
# NOTE(review): the horizontal concat below requires the encoder's output
# column names to differ from the existing columns and between classes --
# confirm how TargetEncoder names its output columns.
for health in range(3):
    encoder = TargetEncoder()
    # Fit on train only, then apply the fitted mapping to test.
    tenc_df_train = encoder.fit_transform(cat_df_train, train[f"health_is_{health}"])
    tenc_df_test = encoder.transform(cat_df_test)

    # Horizontal concat pairs rows by position, so row counts must match.
    assert len(tenc_df_train) == len(train_feat), "encoded train rows do not match train_feat"
    assert len(tenc_df_test) == len(test_feat), "encoded test rows do not match test_feat"

    train_feat = pl.concat([train_feat, tenc_df_train], how="horizontal")
    # Fix: append the *test* encodings to the test features (the train
    # encodings were previously appended here by mistake).
    test_feat = pl.concat([test_feat, tenc_df_test], how="horizontal")

# save
train_feat.write_csv(f"feat/feat_train_{feat}.csv")
test_feat.write_csv(f"feat/feat_test_{feat}.csv")