In [1]:
import polars as pl
from sklearn.linear_model import SGDClassifier
from datetime import datetime
import numpy as np
import optuna
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# データ読み込み

In [2]:
df = pl.read_csv("train_data")

In [3]:
df

id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
f64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1.0000e18,0,14102100,1005,0,"""1fbe01fe""","""f3845767""","""28905ebd""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""ddd2926e""","""44956a24""",1,2,15706,320,50,1722,0,35,-1,79
1.0016e19,0,14102100,1005,1,"""856e6d3f""","""58a89a43""","""f028772b""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""4375586d""","""5ec45883""",1,0,19772,320,50,2227,0,687,100075,48
1.0029e18,0,14102100,1005,0,"""85f751fd""","""c4e18dd6""","""50e219e0""","""1779deee""","""2347f47a""","""f95efa07""","""a99f214a""","""ab9a5222""","""2ee63ff8""",1,0,20596,320,50,2161,0,35,-1,157
1.0045e19,0,14102100,1005,0,"""85f751fd""","""c4e18dd6""","""50e219e0""","""51cedd4e""","""aefc06bd""","""0f2161f8""","""a99f214a""","""bbe53381""","""542422a7""",1,0,19743,320,50,2264,3,427,100000,61
1.0060e19,0,14102100,1005,0,"""1fbe01fe""","""f3845767""","""28905ebd""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""8a014cbb""","""04f5b394""",1,0,15702,320,50,1722,0,35,-1,79
1.0073e19,0,14102100,1005,1,"""85f751fd""","""c4e18dd6""","""50e219e0""","""13684a79""","""2347f47a""","""0f2161f8""","""0240183e""","""76f576f3""","""be6db1d7""",1,0,20596,320,50,2161,0,35,-1,157
1.0087e19,0,14102100,1005,0,"""85f751fd""","""c4e18dd6""","""50e219e0""","""e2fcccd2""","""5c5a694b""","""0f2161f8""","""4c362c9f""","""42951c8d""","""e981565c""",1,0,20633,320,50,2374,3,39,-1,23
1.0100e19,0,14102100,1002,0,"""34d1d55f""","""97df357a""","""50e219e0""","""ecad2386""","""7801e8d9""","""07d7df22""","""53c581eb""","""bc7b50e0""","""43e7b962""",0,0,20170,300,50,2312,0,167,100075,16
1.0113e19,1,14102100,1005,0,"""1fbe01fe""","""f3845767""","""28905ebd""","""ecad2386""","""7801e8d9""","""07d7df22""","""a99f214a""","""bc7f9471""","""8b1aa260""",1,0,15705,320,50,1722,0,35,-1,79
1.0129e19,0,14102100,1005,0,"""85f751fd""","""c4e18dd6""","""50e219e0""","""2f6efcf2""","""813f3323""","""0f2161f8""","""a99f214a""","""bfe46e76""","""4ea23a13""",1,0,21611,320,50,2480,3,297,100111,61


In [4]:
df.null_count()

id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 前処理

In [5]:
feature_names = [
    "hour",
    "banner_pos",
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_ip",
    "device_model",
    "device_type",
]

target_name = "click"

In [6]:
X = df[feature_names]
y= df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, shuffle=False)

In [7]:
def preprocess(df: pl.dataframe.frame.DataFrame):    
    df = df.with_columns(pl.col("hour").apply(lambda x: datetime.strptime(str(x), "%y%m%d%H"))).rename({"hour": "datetime"})
    hour = (df.get_column("datetime").apply(lambda x: x.hour)).alias("hour")
    weekday = (df.get_column("datetime").apply(lambda x: x.weekday())).alias("weekday")
    df = df.with_columns(hour, weekday)

    feature_hasher = FeatureHasher(n_features=2**18, input_type="string")
    hashed_feature = feature_hasher.fit_transform(np.array(df.select(pl.all().cast(str))))
    return hashed_feature

In [8]:
X_train_preprocessed = preprocess(X_train)
X_valid_preprocessed = preprocess(X_valid)
X_test_preprocessed = preprocess(X_test)

# ハイパラチューニング

In [9]:
def grid_search(X_train, y_train, X_valid, y_valid) -> float:
    best_score = 1e10
    best_alpha = 0.01
    for alpha in [1e-5, 1e-4, 1e-3, 1e2, 1e-1]:
        model = SGDClassifier(loss="log_loss", penalty="l2", random_state=42, alpha=alpha)
        model.fit(X_train, y_train)
        score = model.score(X_valid, y_valid)
        print(f"Grid Search| alpha: {alpha}, score: {score}")

        if score > best_score:
            best_score = score
            best_alpha = alpha
    return best_alpha


# 学習・評価

In [10]:
best_alpha = grid_search(X_train_preprocessed, y_train, X_valid_preprocessed, y_valid)

best_model = SGDClassifier(
    loss="log_loss", penalty="l2", random_state=42, alpha=best_alpha
)
best_model.fit(X_train_preprocessed, y_train)

print("test logloss: {}".format(best_model.score(X_test_preprocessed, y_test)))

Grid Search| alpha: 1e-05, score: 0.8437079176629015
Grid Search| alpha: 0.0001, score: 0.8435430236073322
Grid Search| alpha: 0.001, score: 0.8417566713386649
Grid Search| alpha: 100.0, score: 0.8416467419682854
Grid Search| alpha: 0.1, score: 0.8416467419682854
test logloss: 0.835736723638972
