In [1]:
import numpy as np
import polars as pl

def _read_with_idx(path):
    """Read a CSV file and rename its empty-named column to "idx"."""
    return pl.read_csv(path).rename({"": "idx"})


# train/test carry a column whose header is the empty string (presumably a
# saved row index -- TODO confirm); it is exposed as "idx" for later cells.
train = _read_with_idx("data/train.csv")
test = _read_with_idx("data/test.csv")

# The sample submission is read unchanged.
sample_submission = pl.read_csv("data/sample_submission.csv")

In [2]:
feat = "feat00"  # feature set for CatBoost

# Stack train (without the target) on top of test so that every encoding below
# sees both splits. The vertical concat keeps input order, so the first
# len(train) rows of train_test are the train rows.
train_test = pl.concat([train.drop("health"), test])

# Feature engineering for "problems": one 0/1 flag per known problem keyword.
problem_lst = ["WiresRope", "Stones", "MetalGrates", "RootOther", "TrunkOther",
               "BranchLights", "TrunkLights", "BranchOther", "Sneakers"]
train_test = train_test.with_columns([
    # literal=True: the keyword is a plain substring, not a regex pattern.
    # Rows where "problems" is null fall through to otherwise(0).
    pl.when(pl.col("problems").str.contains(prob, literal=True))
    .then(1)
    .otherwise(0)
    .alias(f"problems_contain_{prob}")
    for prob in problem_lst
]).drop("problems")

# Ordinal encoding.
cols_notcat = ['idx', 'created_at', 'tree_dbh']
# Everything else is treated as categorical; this list also contains the
# problems_contain_* flags created above.
cols_cat = [c for c in train_test.columns if c not in cols_notcat]
for col in cols_cat:
    if "problems_contain_" in col:  # 0/1 flags are already numeric; skip them
        continue
    # Go through Utf8 first so integer-typed columns can become Categorical,
    # then take the categorical's integer code as the ordinal value.
    # Kept one column per call on purpose.
    # NOTE(review): if the categorical mapping is shared between columns, the
    # codes may depend on evaluation order -- confirm before batching this.
    train_test = train_test.with_columns(
        pl.col(col).cast(pl.Utf8).cast(pl.Categorical).cast(pl.UInt32)
    )

# Drop columns the original author flagged as multicollinear with others.
cols_drop = ["spc_common", "nta_name", "borocode"]
cols_cat = [c for c in cols_cat if c not in cols_drop]
train_test = train_test.drop(cols_drop)

# Replace "created_at" by the number of whole days elapsed since the earliest
# date in train+test (noted by the original author as 15/5/19).
dates = train_test["created_at"].str.to_datetime()
dates = (dates - dates.min()).dt.total_days()
train_test = train_test.with_columns(dates.alias("created_at"))

# Split back into train/test by row position. This does not depend on how
# "idx" is numbered, and it avoids a join whose output row order polars does
# not guarantee; later cells pair train_feat rows with train labels by
# position, so the order must be exactly that of train.
n_train = len(train)

# Train features for the multiclass classifier; "health" is appended last.
train_feat = train_test.head(n_train).with_columns(train.get_column("health"))
train_feat.write_csv(f"feat/feat_train_{feat}.csv")

# Test features.
test_feat = train_test.slice(n_train)
test_feat.write_csv(f"feat/feat_test_{feat}.csv")

In [3]:
from target_encoder import TargetEncoder

feat = "feat01"  # feature set for LightGBM

# One-vs-rest 0/1 labels, one per health class (0, 1, 2), used as the targets
# for target encoding. Rows with a null "health" get 0.
label_exprs = [
    pl.when(pl.col("health") == h).then(1).otherwise(0).alias(f"health_is_{h}")
    for h in range(3)
]
train = train.with_columns(label_exprs)

# The encoder receives features and target as two separate objects that are
# paired by row position. Build the targets from train_feat's own "health"
# column so they are aligned with the feature rows by construction, instead of
# relying on train and train_feat happening to share the same row order.
targets = train_feat.select(label_exprs)

# Apply target encoding to the categorical columns.
cat_df_train = train_feat[cols_cat]
cat_df_test = test_feat[cols_cat]

for health in range(3):  # 3-class problem: encode once per target class
    # TargetEncoder is project-local and opaque here.
    # NOTE(review): presumably it returns polars DataFrames whose column names
    # contain the target name ("health_is_<k>") -- confirm. Also confirm that
    # fit_transform encodes train out-of-fold; otherwise the train encodings
    # leak the target.
    encoder = TargetEncoder()
    tenc_df_train = encoder.fit_transform(cat_df_train, targets[f"health_is_{health}"])
    tenc_df_test = encoder.transform(cat_df_test)

    # Append the encoded columns next to the existing features.
    train_feat = pl.concat([train_feat, tenc_df_train], how="horizontal")
    test_feat = pl.concat([test_feat, tenc_df_test], how="horizontal")

# save
train_feat.write_csv(f"feat/feat_train_{feat}.csv")
test_feat.write_csv(f"feat/feat_test_{feat}.csv")

In [4]:
feat = "feat02"  # feature set for RandomForest

# Share of train rows belonging to each health class, indexed by class label.
# Each class is counted explicitly so the position in the array always equals
# the label, even if a class is absent or "health" contains nulls.
weights_train = np.array([(train["health"] == h).sum() for h in range(3)]) / len(train)

# Fill missing target-encoding values with the class share observed in train.
for h in range(3):
    cols = [c for c in train_feat.columns if f"health_is_{h}" in c]
    prior = float(weights_train[h])
    # NaN and null are different things in polars; fill both so no missing
    # value survives, whichever one the encoder emits.
    # NOTE(review): the encoder's output is opaque here -- confirm which it uses.
    train_feat = train_feat.with_columns(
        train_feat.select(cols).fill_nan(prior).fill_null(prior)
    )
    test_feat = test_feat.with_columns(
        test_feat.select(cols).fill_nan(prior).fill_null(prior)
    )

# Fill missing categorical codes with -1 (the target model does not accept
# missing categorical values).
# NOTE(review): the ordinal-encoded columns were cast to UInt32 earlier;
# confirm that filling with -1 upcasts them to a signed type as intended.
train_feat = train_feat.with_columns(train_feat.select(cols_cat).fill_null(-1))
test_feat = test_feat.with_columns(test_feat.select(cols_cat).fill_null(-1))

# save
train_feat.write_csv(f"feat/feat_train_{feat}.csv")
test_feat.write_csv(f"feat/feat_test_{feat}.csv")

In [5]:
feat = "feat03"  # feature set for LogisticRegression and NN

# Remove every column listed in cols_cat (the ordinal-encoded categorical
# features) from both splits, leaving the remaining columns untouched.
train_feat, test_feat = (
    frame.drop(cols_cat) for frame in (train_feat, test_feat)
)

# Persist both splits under the current feature-set tag.
train_feat.write_csv(f"feat/feat_train_{feat}.csv")
test_feat.write_csv(f"feat/feat_test_{feat}.csv")