Метрика — ROC-AUC (Receiver Operating Characteristic — Area Under the Curve).

Предобработка: Используйте one-hot encoding или hashing для категориальных признаков. Учитывайте высокую кардинальность некоторых столбцов.
Baseline: Логистическая регрессия или SGDClassifier (scikit-learn).
Оптимизация: Попробуйте градиентный бустинг (LightGBM) или нейронные сети для повышения ROC-AUC.
Кросс-валидация: Используйте 5-fold CV для оценки модели на обучающей выборке.

In [9]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb 
from sklearn.metrics import roc_auc_score
# import warnings
# warnings.filterwarnings('ignore')

In [10]:
# Input CSV locations; both are empty strings in this file
# (presumably filled in before the notebook is run).
train_path = test_path = ''
# Number of rows per chunk when the CSV files are streamed.
chunksize = 10 ** 6
# Names of the categorical feature columns: ID_01 ... ID_22.
categorical_features = ['ID_{:02d}'.format(n) for n in range(1, 23)]

In [11]:
# Column -> dtype mapping: the target is a small integer, while every
# categorical feature column and the 'id' column are typed as 'category'.
dtypes = {'click': 'int8'}
dtypes.update(dict.fromkeys(categorical_features + ['id'], 'category'))

In [None]:
def calculate_smooth_mean(df, by, on, m=100):
    """Return the smoothed per-group mean of column ``on``.

    Each group's mean is blended with the mean of ``on`` over the whole
    frame: ``(count * group_mean + m * overall_mean) / (count + m)``.

    Args:
        df: DataFrame holding both the grouping and the value column.
        by: Column (or columns) to group by.
        on: Column whose mean is smoothed.
        m: Weight given to the overall mean; larger values pull groups
            with few rows more strongly towards it.

    Returns:
        A Series indexed by the group keys with the smoothed means.
    """
    grouped = df.groupby(by)[on]
    n_rows = grouped.count()
    group_mean = grouped.mean()
    overall_mean = df[on].mean()
    weighted_total = n_rows * group_mean + m * overall_mean
    return weighted_total / (n_rows + m)

In [None]:
# First pass over the training file: for every categorical column,
# accumulate [click_sum, row_count] per category value across all chunks.
stats = {col: {} for col in categorical_features}

for chunk in pd.read_csv(train_path, chunksize=chunksize, dtype=dtypes):
    for col in categorical_features:
        # observed=True keeps only categories that actually occur in this
        # chunk, so no empty [0, 0] groups are produced for categorical keys.
        group = chunk.groupby(col, observed=True)['click'].agg(['sum', 'count'])
        col_stats = stats[col]
        # itertuples(name=None) yields plain (category, sum, count) tuples;
        # iterrows() would build a Series per category, which is very slow
        # for high-cardinality columns.
        for cat_val, s, c in group.itertuples(name=None):
            totals = col_stats.get(cat_val)
            if totals is None:
                col_stats[cat_val] = [s, c]
            else:
                totals[0] += s
                totals[1] += c

In [None]:
# Container for the per-column encodings; it is only created here.
encodings = {}

# Overall click rate derived from the accumulated per-category totals.
# NOTE(review): the totals are summed over every column in `stats`, so each
# row contributes once per column; the ratio presumably matches the plain
# click rate only when no column has missing values -- confirm.
total_clicks = sum(
    s for col_stats in stats.values() for s, _ in col_stats.values()
)
total_count = sum(
    c for col_stats in stats.values() for _, c in col_stats.values()
)
global_click_mean = total_clicks / total_count

In [None]:
# Smoothing weight: how many "virtual" rows at the global click rate are
# added to every category before its mean is taken.
m = 100

# Build the smoothed target encoding for every categorical column:
# (clicks + m * global_rate) / (rows + m) per category value.
prior_clicks = m * global_click_mean
for col in categorical_features:
    col_encoding = {
        cat_val: (s + prior_clicks) / (c + m)
        for cat_val, (s, c) in stats[col].items()
    }
    # Explicit entry holding the global rate under the '_unknown_' key.
    col_encoding['_unknown_'] = global_click_mean
    encodings[col] = col_encoding

In [None]:
# LightGBM settings for binary classification evaluated with AUC.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbosity': -1,
    'seed': 42,
}

model = None
# NOTE(review): the splitter is created but not used anywhere in this cell;
# no cross-validation is actually performed here.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Second pass over the training file: encode each chunk with the smoothed
# target encodings and train the model incrementally, chunk by chunk.
for chunk in pd.read_csv(train_path, chunksize=chunksize, dtype=dtypes):
    X_chunk = pd.DataFrame({
        # Convert to float *before* fillna: mapping a categorical column can
        # return a Categorical, and filling it with a value that is not one
        # of its categories raises. Values without an encoding fall back to
        # the global click rate.
        col: chunk[col]
        .map(encodings[col])
        .astype('float64')
        .fillna(global_click_mean)
        .astype('float32')
        for col in categorical_features
    })
    y_chunk = chunk['click'].astype('int8')

    if model is None:
        # First chunk: train from scratch.
        model = lgb.LGBMClassifier(**params, n_estimators=1000)
        model.fit(X_chunk, y_chunk)
    else:
        # Later chunks: continue boosting from the model fitted so far.
        # Keep a separate reference to it, because rebinding `model` first
        # would pass the new, unfitted estimator as its own init_model.
        previous_model = model
        model = lgb.LGBMClassifier(**params, n_estimators=100)
        model.fit(X_chunk, y_chunk, init_model=previous_model)

In [None]:
# Stream the test file in chunks, encode the categorical columns with the
# fitted encodings and collect the predicted click probability per id.
test_dtypes = {col: 'category' for col in categorical_features + ['id']}
test_chunks = pd.read_csv(test_path, chunksize=chunksize, dtype=test_dtypes)
predictions = []

for chunk in test_chunks:
    X_test = pd.DataFrame({
        # Convert to float *before* fillna: mapping a categorical column can
        # return a Categorical, and filling it with a value that is not one
        # of its categories raises. Categories without an encoding fall back
        # to the global click rate.
        col: chunk[col]
        .map(encodings[col])
        .astype('float64')
        .fillna(global_click_mean)
        .astype('float32')
        for col in categorical_features
    })

    # Column 1 of predict_proba is the probability of the positive class.
    chunk_preds = model.predict_proba(X_test)[:, 1]

    results = pd.DataFrame({'id': chunk['id'], 'click': chunk_preds})
    predictions.append(results)

final_predictions = pd.concat(predictions, ignore_index=True)

In [None]:
# Write the id/click prediction table to my_submission.csv without the
# DataFrame index column.
final_predictions.to_csv('my_submission.csv', index=False)