In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time

import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
# from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from optuna import integration, logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from pytorch_tabnet.tab_model import TabNetClassifier
import torch

In [2]:
feat = "feat01"
train = pl.read_csv(f"feat/feat_train_{feat}.csv")
test = pl.read_csv(f"feat/feat_test_{feat}.csv")
train_origin = pl.read_csv("data/train.csv").rename({"": "idx"})

# cols_exp = [c for c in test.columns if c != "idx"]
# Keep only the target-encoding features, because a linear model is used.
cols_exp = [c for c in test.columns if "health_is_" in c]

# Share of training rows in each health class, ordered by the class value.
weights_train = np.array(train_origin["health"].value_counts().sort("health")["count"]) / len(train_origin)

# Categorical feature columns (kept for reference; not used by the linear model).
# cols_cat_int = ["boro_ct", "cb_num"]  # integer columns treated as categorical
# cols_cat = [c for c in train_origin.select(pl.col(pl.Utf8)).columns if c != "created_at"] + cols_cat_int  # categorical features

# # Fill missing categorical values with -1 (for models that reject missing categories).
# train = train.with_columns(train[cols_cat].fill_null(-1))
# test = test.with_columns(test[cols_cat].fill_null(-1))

# Impute missing target-encoding features with the matching class share from train.
# polars distinguishes NaN from null and empty CSV fields are read as null, so both
# are filled; fill_nan alone would leave nulls in place.
# NOTE(review): indexing weights_train by h assumes the sorted health values are 0, 1, 2 - confirm.
for h in range(3):
    cols = [c for c in train.columns if f"health_is_{h}" in c]
    fill_value = float(weights_train[h])
    train = train.with_columns(train[cols].fill_nan(fill_value).fill_null(fill_value))
    test = test.with_columns(test[cols].fill_nan(fill_value).fill_null(fill_value))

### Logistic Regression

In [3]:
def train_logistic_regression(train, cols_exp, col_target, params=None):
    """Fit one logistic-regression model per fold of a 5-fold shuffled KFold split.

    Inside every fold the features are standardized with a StandardScaler that
    is fitted on that fold's training rows only. The fitted scalers are not
    returned, so callers have to scale new data themselves before passing it
    to the returned models.

    Args:
        train: Table supporting ``train[list_of_columns].to_numpy()`` and
            ``train[column].to_numpy()``.
        cols_exp: Names of the feature columns.
        col_target: Name of the target column.
        params: Optional keyword arguments forwarded to ``LogisticRegression``.
            They override the defaults ``random_state=0`` and ``max_iter=1000``.
            The passed dict is left unmodified.

    Returns:
        A tuple ``(clf_lst, oof_pred)``: the fitted classifier of each fold, and
        the out-of-fold ``predict_proba`` outputs stacked and re-ordered so that
        row ``i`` corresponds to row ``i`` of ``train``.
    """
    # Merge into a fresh dict so the caller's ``params`` is never mutated and
    # the historical defaults still apply when nothing is passed.
    model_params = {"random_state": 0, "max_iter": 1000}
    if params:
        model_params.update(params)

    x = train[cols_exp].to_numpy()
    y = train[col_target].to_numpy()

    # K-fold
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    y_valid_pred_lst = []
    idx_valid_lst = []
    clf_lst = []

    # cross validation
    for fold, (idx_train, idx_valid) in enumerate(kf.split(x)):
        print("fold", fold)
        x_train = x[idx_train, :]
        x_valid = x[idx_valid, :]
        y_train = y[idx_train]

        # Standardize with statistics from the training part only, so the
        # validation rows do not influence the scaling.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_valid = scaler.transform(x_valid)

        # modeling
        clf = LogisticRegression(**model_params).fit(x_train, y_train)

        # Out-of-fold probabilities for the rows held out in this fold.
        y_valid_pred_lst.append(clf.predict_proba(x_valid))
        idx_valid_lst.append(idx_valid)
        clf_lst.append(clf)

    # The folds visit rows in shuffled order; sorting by the original row index
    # puts the stacked predictions back in the row order of ``train``.
    idx_valid = np.hstack(idx_valid_lst)
    y_valid_pred = np.vstack(y_valid_pred_lst)
    oof_pred = y_valid_pred[np.argsort(idx_valid)]

    return clf_lst, oof_pred

In [4]:
def predict_test(x_test, clf_lst):
    """Average the ``predict_proba`` outputs of several fitted classifiers.

    Args:
        x_test: Feature matrix handed unchanged to every classifier.
        clf_lst: Iterable of fitted models exposing ``predict_proba``.

    Returns:
        Element-wise mean of the per-model probability arrays.
    """
    fold_probas = [model.predict_proba(x_test) for model in clf_lst]
    return np.mean(fold_probas, axis=0)

In [5]:

col_target = "health"
print("col_target =", col_target, "-"*50)

# Fit one logistic-regression model per CV fold and collect the out-of-fold probabilities.
clf_lst, oof_pred = train_logistic_regression(train, cols_exp, col_target)

# Standardize the test features with statistics from the full training set.
# NOTE(review): the fold models were fitted on features scaled by per-fold scalers,
# so the scaling applied here differs slightly from what each model saw in training.
scaler = StandardScaler().fit(train[cols_exp].to_numpy())
x_test = scaler.transform(test[cols_exp].to_numpy())

# Average the fold models' class probabilities on the test set.
y_test_pred = predict_test(x_test, clf_lst)

# Wrap both probability matrices in DataFrames with one column per health class.
oof_pred_df, test_pred_df = (
    pl.DataFrame(pred, schema=[f"health_is_{h}" for h in range(3)])
    for pred in (oof_pred, y_test_pred)
)

col_target = health --------------------------------------------------
fold 0
fold 1
fold 2
fold 3
fold 4


In [6]:
# Write the out-of-fold and test-set predictions to CSV, tagged with the feature-set name.
oof_pred_df.write_csv(file=f"pred/oof_pred_logisticregression_{feat}.csv")
test_pred_df.write_csv(file=f"pred/test_pred_logisticregression_{feat}.csv")