In [1]:
# !pip install scikit-learn
# !pip install matplotlib
# !pip install optuna
# !pip install imblearn
# !pip install pandas
# !pip install numpy
# !pip install seaborn
# !pip install lightgbm
# !pip install tqdm
# !pip install ipywidgets

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import optuna
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from encoding import EncodingCatFeatures, EncodingNumFeatures
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
import optuna.visualization as vis

In [3]:
# Load the training set. 'Survived' is the target; it and the 'Name' column
# are excluded from the feature matrix.
data = pd.read_csv('train.csv')
y = data['Survived']
X = data.drop(columns=['Survived', 'Name'])

In [4]:
def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of the stacking pipeline.

    For the given trial this samples hyperparameters for the two encoders,
    for the SVC used as final estimator, and for four LightGBM base
    learners, assembles everything into one pipeline and evaluates it on
    the module-level ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw every hyperparameter via its ``suggest_*`` methods.

    Returns
    -------
    The mean of the accuracy scores returned by ``cross_val_score``.
    """
    # --- preprocessing: categorical encoder followed by numerical encoder ---
    cat_threshold = trial.suggest_int('threshold', 1, 50)
    small_groups_threshold = trial.suggest_float('small_groups_threshold', 0.01, 0.5)
    num_threshold = trial.suggest_float('num_threshold', 0.01, 1.0)
    preprocessing = Pipeline([
        ('cat_encoder', EncodingCatFeatures(
            threshold=cat_threshold,
            small_groups_threshold=small_groups_threshold,
        )),
        ('num_encoder', EncodingNumFeatures(threshold=num_threshold)),
    ])

    # --- final estimator of the stack ---
    meta_learner = SVC(
        C=trial.suggest_float('svc_C', 1e-3, 1e2, log=True),
        kernel=trial.suggest_categorical('svc_kernel', ['linear', 'rbf']),
    )

    def suggest_lgbm(tag):
        """Build one LGBMClassifier whose trial parameter names start with ``tag``."""
        return LGBMClassifier(
            objective='binary',
            metric='binary_logloss',
            learning_rate=trial.suggest_float(f'{tag}learning_rate', 0.001, 0.1, log=True),
            num_leaves=trial.suggest_int(f'{tag}num_leaves', 2, 512),
            max_depth=trial.suggest_int(f'{tag}max_depth', 1, 100),
            lambda_l1=trial.suggest_float(f'{tag}lambda_l1', 1e-5, 1.0, log=True),
            min_child_samples=trial.suggest_int(f'{tag}min_child_samples', 2, 100),
            feature_fraction=trial.suggest_float(f'{tag}feature_fraction', 0.1, 1.0),
            bagging_fraction=trial.suggest_float(f'{tag}bagging_fraction', 0.1, 1.0),
            bagging_freq=trial.suggest_int(f'{tag}bagging_freq', 1, 10),
            min_split_gain=trial.suggest_float(f'{tag}min_split_gain', 0.0, 1.0),
            max_bin=trial.suggest_int(f'{tag}max_bin', 100, 500),
            verbose=-1,
        )

    # --- stack of four independently parameterised LightGBM base learners ---
    base_learners = [
        (f'lgbm{idx}', suggest_lgbm(f'lgbm{idx}_'))
        for idx in range(4)
    ]

    # --- full pipeline: data preprocessing, then the stacking classifier ---
    model = Pipeline([
        ('data_preprocess', preprocessing),
        ('stacking', StackingClassifier(
            estimators=base_learners,
            final_estimator=meta_learner,
        )),
    ])

    # --- cross validation ---
    fold_scores = cross_val_score(model, X, y, cv=3, n_jobs=-1, scoring='accuracy')
    return np.mean(fold_scores)

In [5]:
# Seed the sampler so that re-running this cell draws the same sequence of
# hyperparameter suggestions; without a seed every run of the notebook ends
# up with a different "best" configuration. TPESampler is the sampler Optuna
# uses by default for a single-objective study, so only the seeding changes.
SEED = 42
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=1000)

[I 2025-06-11 23:49:52,125] A new study created in memory with name: no-name-8d2ded8b-2621-4fe2-a6e0-34effe57f18f
[I 2025-06-11 23:49:56,153] Trial 0 finished with value: 0.6835016835016834 and parameters: {'threshold': 15, 'small_groups_threshold': 0.09121204229789245, 'num_threshold': 0.5800108747051596, 'svc_C': 0.010406298554018596, 'svc_kernel': 'linear', 'lgbm0_learning_rate': 0.0012869258515623822, 'lgbm0_num_leaves': 404, 'lgbm0_max_depth': 5, 'lgbm0_lambda_l1': 0.004846329008202361, 'lgbm0_min_child_samples': 81, 'lgbm0_feature_fraction': 0.6157237538801352, 'lgbm0_bagging_fraction': 0.48867999914729043, 'lgbm0_bagging_freq': 9, 'lgbm0_min_split_gain': 0.14302439560868574, 'lgbm0_max_bin': 354, 'lgbm1_learning_rate': 0.004521103414228234, 'lgbm1_num_leaves': 263, 'lgbm1_max_depth': 22, 'lgbm1_lambda_l1': 0.0007135409512923443, 'lgbm1_min_child_samples': 22, 'lgbm1_feature_fraction': 0.8533284602252067, 'lgbm1_bagging_fraction': 0.5233159631990248, 'lgbm1_bagging_freq': 2, 'lgb

In [6]:
# Show how the best score evolved over the trials, then which
# hyperparameters mattered most for it.
for make_figure in (plot_optimization_history, plot_param_importances):
    make_figure(study).show()

In [7]:
# Load the test set and exclude the 'Name' column, as was done for the
# training features.
X_test = pd.read_csv('test.csv').drop(columns=['Name'])

In [8]:
# Rebuild the pipeline from the best trial of the study, fit it on the full
# training data and predict the test set.
best_params = study.best_trial.params

# Derive the number of LightGBM base learners from the tuned parameter names
# instead of hard-coding it. The objective tuned and scored four learners
# (lgbm0..lgbm3); building only three here would silently drop one and
# submit a different model from the one whose CV score was optimised.
n_base_learners = sum(
    1 for name in best_params
    if name.startswith('lgbm') and name.endswith('_learning_rate')
)

# encoders
cat_encoder_params = {
    'threshold': best_params['threshold'],
    'small_groups_threshold': best_params['small_groups_threshold'],
}
num_encoder_params = {
    'threshold': best_params['num_threshold'],
}
cat_encoder = EncodingCatFeatures(**cat_encoder_params)
num_encoder = EncodingNumFeatures(**num_encoder_params)

# final estimator
svc_params = {
    'C': best_params['svc_C'],
    'kernel': best_params['svc_kernel'],
}
final_estimator = SVC(**svc_params)

# LGBM base learners: every tuned parameter is stored in the study under the
# name 'lgbm{i}_<parameter>'.
lgbm_tuned_keys = (
    'learning_rate',
    'num_leaves',
    'max_depth',
    'lambda_l1',
    'min_child_samples',
    'feature_fraction',
    'bagging_fraction',
    'bagging_freq',
    'min_split_gain',
    'max_bin',
)
estimators = []
for i in range(n_base_learners):
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
    }
    lgb_params.update(
        {key: best_params[f'lgbm{i}_{key}'] for key in lgbm_tuned_keys}
    )
    estimators.append((f'lgbm{i}', LGBMClassifier(**lgb_params)))

# stacking
stacking = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

# data preparation
data_prep = Pipeline([
    ('cat_encoder', cat_encoder),
    ('num_encoder', num_encoder),
])

# final pipeline
final_pipeline = Pipeline([
    ('data_preprocess', data_prep),
    ('stacking', stacking),
])

# fit and predict
final_pipeline.fit(X, y)
y_pred = final_pipeline.predict(X_test)

In [9]:
# Fit the preprocessing pipeline on the training data and display the
# encoded test features as the cell output.
data_prep.fit(X, y).transform(X_test)

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Ticket,Fare,Cabin,logSex,logAge,logEmbarked
0,892,3,0,0,0,7.8292,-1,0.173035,3.569533,0.329023
1,893,3,1,0,1,7.0000,-1,0.555056,3.871201,0.290396
2,894,2,0,0,2,9.6875,-1,0.173035,4.143135,0.329023
3,895,3,0,0,3,8.6625,-1,0.173035,3.332205,0.290396
4,896,3,1,1,4,12.2875,-1,0.555056,3.135494,0.290396
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,0,0,358,8.0500,-1,0.173035,3.367296,0.290396
414,1306,1,0,0,359,108.9000,75,0.555056,3.688879,0.440556
415,1307,3,0,0,360,7.2500,-1,0.173035,3.676301,0.290396
416,1308,3,0,0,361,8.0500,-1,0.173035,3.367296,0.290396


In [10]:
# Assemble the submission table: one predicted 'Survived' value per passenger.
submission_columns = {
    'PassengerId': X_test['PassengerId'],  # this column must be present in X_test
    'Survived': y_pred,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)