In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training table, keep the label aside, then strip the label and
# the non-feature columns from the feature matrix.
X = pd.read_parquet("/kaggle/input/sber-hack/train_ai_comp_final_dp.parquet")
y = X["target"]

X = X.drop(columns=["sample_ml_new", "target", "id"])

In [3]:
# Sanity check of the feature matrix: dimensions, then the last three rows.
print(X.shape)
X.tail(3)

(519615, 1076)


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
693039,1761,1759,0,168,191,1,125,290,298,176,...,0,0,0,0,0,51714,0,0,0,0
693040,1761,1759,141,74,191,1,125,290,298,176,...,0,0,0,0,0,51714,0,0,0,0
693042,1761,1759,141,28,191,0,125,290,298,176,...,0,0,0,0,50250,51714,0,0,0,0


In [4]:
# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same class balance. Fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=160_032_240,
)

In [5]:
# Names of all training columns that contain at least one missing value.
# The labels are taken from the frame itself instead of being rebuilt as
# "feature<position + 1>", so the list stays correct even if the column
# order or naming stops matching feature1..featureN.
has_nan = X_train.isna().any()
nan_columns = has_nan[has_nan].index.tolist()

In [6]:
# Bucket every training column by its number of distinct non-null values
# (DataFrame.nunique ignores NaN by default):
#   exactly 1   -> unique_features       (constant column)
#   exactly 2   -> binary_features
#   0 or 3..14  -> categorical_features
#   15 or more  -> numeric_features
# Column labels come from the frame itself rather than from
# "feature<position + 1>", so the buckets do not depend on column order.
MAX_CATEGORICAL_CARDINALITY = 15  # fewer distinct values than this => categorical

distinct_counts = X_train.nunique()
is_constant = distinct_counts == 1
is_binary = distinct_counts == 2
is_low_cardinality = (
    (distinct_counts < MAX_CATEGORICAL_CARDINALITY) & ~is_constant & ~is_binary
)
is_high_cardinality = distinct_counts >= MAX_CATEGORICAL_CARDINALITY

unique_features = distinct_counts[is_constant].index.tolist()
binary_features = distinct_counts[is_binary].index.tolist()
categorical_features = distinct_counts[is_low_cardinality].index.tolist()
numeric_features = distinct_counts[is_high_cardinality].index.tolist()

In [7]:
# Remove every column that has missing values or is constant in the training
# split, from both splits. The (misspelled) name `colums_to_drop` is kept
# because the submission section reuses it.
nan_set = set(nan_columns)
# sorted() gives a deterministic order; a bare list(set(...)) depends on the
# per-process string hash seed.
colums_to_drop = sorted(nan_set | set(unique_features))

X_train = X_train.drop(columns=colums_to_drop)
X_test = X_test.drop(columns=colums_to_drop)

# Purge the dropped NaN columns from *every* feature bucket, including
# numeric_features: a leftover name there would make the ColumnTransformer
# fail on a column that no longer exists. List comprehensions (instead of set
# difference) keep the original column order, so the preprocessed matrix has
# the same layout on every run.
unique_features = [col for col in unique_features if col not in nan_set]
binary_features = [col for col in binary_features if col not in nan_set]
categorical_features = [col for col in categorical_features if col not in nan_set]
numeric_features = [col for col in numeric_features if col not in nan_set]

In [8]:
# Dimensions of the training split after the column drop above.
X_train.shape

(415692, 986)

In [9]:
# Per-group preprocessing; each group ends with a univariate filter that
# keeps the top 40% of its columns.

# Numeric columns: standardise, then rank by ANOVA F-score.
numeric_steps = [
    ("scaler", StandardScaler()),
    ("selector", SelectPercentile(f_classif, percentile=40)),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Binary columns: chi-squared filter on the raw values.
binary_steps = [("selector", SelectPercentile(chi2, percentile=40))]
binary_transformer = Pipeline(steps=binary_steps)

# Categorical columns: one-hot encode (categories unseen at fit time are
# ignored at transform time), then chi-squared filter on the indicators.
categorical_steps = [
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=40)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline; columns not listed in any group
# are not passed through (ColumnTransformer's default remainder).
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("bin", binary_transformer, binary_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [10]:
# Fit on the training split only; the labels are passed because the
# SelectPercentile steps inside the preprocessor are supervised.
preprocessor.fit(X_train, y_train)

In [11]:
# Replace both splits with their preprocessed versions. The names are reused,
# so this cell must run exactly once per kernel session.
X_train, X_test = (preprocessor.transform(split) for split in (X_train, X_test))
X_train.shape

(415692, 648)

### Feature selection with Random Forest

In [12]:
# Random forest used both as the importance source for feature selection and,
# later, as one of the evaluated classifiers.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [13]:
# Keep the features whose forest importance is at or above the median
# importance, then reduce both splits with the fitted selector.
sfm = SelectFromModel(rf_clf, threshold='median').fit(X_train, y_train)

X_train_selected, X_test_selected = sfm.transform(X_train), sfm.transform(X_test)

In [14]:
# Dimensions of the training matrix after model-based feature selection.
print(f"Shape of X_train: {X_train_selected.shape}")

Shape of X_train: (415692, 324)


In [15]:
# Disabled experiment, not executed: PCA (n_components=0.95) fitted on the
# preprocessed training matrix and applied to the test matrix.
# from sklearn.decomposition import PCA
# pca = PCA(n_components=0.95)
# X_train_pca = pca.fit_transform(X_train)
# X_test_pca = pca.transform(X_test)
# X_train_pca.shape

### CatBoost Classifier

In [16]:
# CatBoost configuration, kept in a dict so the settings are easy to scan.
catboost_params = {
    "iterations": 1000,
    "learning_rate": 0.1,
    "depth": 6,
    "l2_leaf_reg": 3,
    "random_state": 42,
    "verbose": False,
}
CatBoost_model = CatBoostClassifier(**catboost_params)

In [17]:
# Train on the selected features, then score the hold-out split.
CatBoost_model.fit(X_train_selected, y_train)

# Positive-class probability, and a hard label at the 0.1 cut-off.
pred_catboost = CatBoost_model.predict_proba(X_test_selected)[:, 1]
pred_catboost_binary = pred_catboost >= 0.1

print("CatBoost Metrics:")
# Threshold-dependent metrics use the hard labels ...
for metric_label, metric_fn in (
    ("F1_SCORE:", f1_score),
    ("PRECISION:", precision_score),
    ("RECALL:", recall_score),
):
    print(metric_label, metric_fn(y_test, pred_catboost_binary))
# ... while ROC AUC is computed on the raw probabilities.
print("ROC_AUC:", roc_auc_score(y_test, pred_catboost))

CatBoost Metrics:
F1_SCORE: 0.19399830938292478
PRECISION: 0.15945805106826472
RECALL: 0.24763960075532776
ROC_AUC: 0.7518237596271417


### XGBoost Classifier

In [18]:
# XGBoost configuration, kept in a dict so the settings are easy to scan.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'binary:logistic',
    "random_state": 42,
}
XGB_model = XGBClassifier(**xgb_params)

In [19]:
# Train on the selected features, then score the hold-out split.
XGB_model.fit(X_train_selected, y_train)

# Positive-class probability, and a hard label at the 0.1 cut-off.
pred_xgboost = XGB_model.predict_proba(X_test_selected)[:, 1]
pred_xgboost_binary = pred_xgboost >= 0.1

print("\nXGBoost Metrics:")
# Threshold-dependent metrics use the hard labels ...
for metric_label, metric_fn in (
    ("F1_SCORE:", f1_score),
    ("PRECISION:", precision_score),
    ("RECALL:", recall_score),
):
    print(metric_label, metric_fn(y_test, pred_xgboost_binary))
# ... while ROC AUC is computed on the raw probabilities.
print("ROC_AUC:", roc_auc_score(y_test, pred_xgboost))


XGBoost Metrics:
F1_SCORE: 0.1905836496050877
PRECISION: 0.15375703409467065
RECALL: 0.25060695980577286
ROC_AUC: 0.7493914305068681


### LightGBM Classifier 

In [20]:
# LightGBM configuration, kept in a dict so the settings are easy to scan.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "num_leaves": 31,
    "objective": 'binary',
    "random_state": 42,
}
LGBM_model = LGBMClassifier(**lgbm_params)

In [21]:
# Train on the selected features, then score the hold-out split.
LGBM_model.fit(X_train_selected, y_train)

# Positive-class probability, and a hard label at the 0.1 cut-off.
pred_lightgbm = LGBM_model.predict_proba(X_test_selected)[:, 1]
pred_lightgbm_binary = pred_lightgbm >= 0.1

print("\nLightGBM Metrics:")
# Threshold-dependent metrics use the hard labels ...
for metric_label, metric_fn in (
    ("F1_SCORE:", f1_score),
    ("PRECISION:", precision_score),
    ("RECALL:", recall_score),
):
    print(metric_label, metric_fn(y_test, pred_lightgbm_binary))
# ... while ROC AUC is computed on the raw probabilities.
print("ROC_AUC:", roc_auc_score(y_test, pred_lightgbm))

[LightGBM] [Info] Number of positive: 14830, number of negative: 400862
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.586850 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49879
[LightGBM] [Info] Number of data points in the train set: 415692, number of used features: 324
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035675 -> initscore=-3.296965
[LightGBM] [Info] Start training from score -3.296965

LightGBM Metrics:
F1_SCORE: 0.19107377795769656
PRECISION: 0.15289925494007126
RECALL: 0.2546533585109253
ROC_AUC: 0.7479539716198444


### Random Forest Classifier

In [22]:
# Train the forest itself on the selected features (the selector above was
# given the same estimator object), then score the hold-out split.
rf_clf.fit(X_train_selected, y_train)

# Positive-class probability, and a hard label at the 0.1 cut-off.
pred_rf = rf_clf.predict_proba(X_test_selected)[:, 1]
pred_rf_binary = pred_rf >= 0.1

print("\nRF Metrics:")
# Threshold-dependent metrics use the hard labels ...
for metric_label, metric_fn in (
    ("F1_SCORE:", f1_score),
    ("PRECISION:", precision_score),
    ("RECALL:", recall_score),
):
    print(metric_label, metric_fn(y_test, pred_rf_binary))
# ... while ROC AUC is computed on the raw probabilities.
print("ROC_AUC:", roc_auc_score(y_test, pred_rf))


RF Metrics:
F1_SCORE: 0.15651966910670712
PRECISION: 0.10041233964569334
RECALL: 0.3547342864850283
ROC_AUC: 0.6971273207142601


## Submission

Теперь сделаем сабмит.
Для этого берём тестовые данные.

In [23]:
# Load the competition test table and strip its non-feature columns.
# NOTE(review): "id" is discarded here, so later predictions can only be
# matched to the sample submission by row position — confirm the two files
# share the same row order.
X_submit = pd.read_parquet("/kaggle/input/sber-hack/test_sber.parquet").drop(
    columns=["sample_ml_new", "id"]
)

In [24]:
# Sanity check of the test feature matrix: dimensions, then the last three rows.
print(X_submit.shape)
X_submit.tail(3)

(173433, 1076)


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
173430,1761,1759,141,74,191,1,125,290,298,176,...,0,0,0,0,0,51714,0,0,0,0
173431,1761,1759,141,107,191,1,125,290,298,176,...,0,0,0,0,0,51714,0,0,0,0
173432,1761,1759,141,23,191,0,125,290,298,176,...,0,0,0,0,50250,51714,0,0,0,0


In [25]:
# Apply the same column drop and the already-fitted preprocessor to the
# test table. The name is reused, so this cell must run exactly once.
X_submit = preprocessor.transform(X_submit.drop(columns=colums_to_drop))

In [26]:
# Dimensions of the preprocessed test matrix.
print(X_submit.shape)

(173433, 648)


In [27]:
# Reduce the test matrix with the selector fitted on the training split.
X_submit_selected = sfm.transform(X_submit)

In [28]:
# Template for all submissions: the cells below overwrite its target_prob and
# target_bin columns and write it out under a different file name each time.
# NOTE(review): predictions are assigned by row position and the test "id"
# column was dropped earlier — assumes this file lists ids in the same order
# as the test parquet; TODO confirm.
submission = pd.read_csv("/kaggle/input/sber-hack/sample_submission.csv")
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03


### Random Forest predict

In [29]:
# Positive-class probability for the competition test set. This reuses the
# pred_rf names from the hold-out evaluation above.
pred_rf = rf_clf.predict_proba(X_submit_selected)[:, 1]

# Hard 0/1 label at the same 0.1 cut-off used during evaluation.
pred_rf_binary = (pred_rf >= 0.1).astype(int)

In [30]:
# Write the random-forest predictions into the submission template.
submission = submission.assign(
    target_prob=pred_rf,
    target_bin=pred_rf_binary,
)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.07
1,4,0,0.04
2,12,0,0.04


In [31]:
# Save the random-forest submission (without the DataFrame index).
submission.to_csv("RF_submission.csv", index=False)

### CatBoost Predict

In [32]:
# Positive-class probability for the competition test set. This reuses the
# pred_catboost names from the hold-out evaluation above.
pred_catboost = CatBoost_model.predict_proba(X_submit_selected)[:, 1]

# Hard 0/1 label at the same 0.1 cut-off used during evaluation.
pred_catboost_binary = (pred_catboost >= 0.1).astype(int)

In [33]:
# Write the CatBoost predictions into the submission template.
submission = submission.assign(
    target_prob=pred_catboost,
    target_bin=pred_catboost_binary,
)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.017315
1,4,0,0.017263
2,12,0,0.032998


In [34]:
# Save the CatBoost submission (without the DataFrame index).
submission.to_csv("CatBoost_submission.csv", index=False)

### XGBoost Predict

In [35]:
# XGBoost positive-class probability for the competition test set, and its
# hard 0/1 label at the 0.1 cut-off.
pred_xgb = XGB_model.predict_proba(X_submit_selected)[:, 1]
pred_xgb_binary = (pred_xgb >= 0.1).astype(int)

# Write the predictions into the submission template.
submission = submission.assign(
    target_prob=pred_xgb,
    target_bin=pred_xgb_binary,
)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.012358
1,4,0,0.012105
2,12,0,0.026978


In [36]:
# Save the XGBoost submission (without the DataFrame index).
submission.to_csv("XGBoost_submission.csv", index=False)

### LightGBM Predict

In [37]:
# LightGBM positive-class probability for the competition test set, and its
# hard 0/1 label at the 0.1 cut-off.
pred_lgbm = LGBM_model.predict_proba(X_submit_selected)[:, 1]
pred_lgbm_binary = (pred_lgbm >= 0.1).astype(int)

# Write the predictions into the submission template.
submission = submission.assign(
    target_prob=pred_lgbm,
    target_bin=pred_lgbm_binary,
)
submission.head(3)



Unnamed: 0,id,target_bin,target_prob
0,3,0,0.017784
1,4,0,0.017033
2,12,0,0.033215


In [38]:
# Save the LightGBM submission (without the DataFrame index).
submission.to_csv("LGBMBoost_submission.csv", index=False)

### Ensemble predict

In [39]:
# Unweighted mean of the three boosting models' test-set probabilities,
# thresholded at the same 0.1 cut-off.
pred_avg = (pred_catboost + pred_xgb + pred_lgbm) / 3
pred_avg_binary = (pred_avg >= 0.1).astype(int)

submission = submission.assign(
    target_prob=pred_avg,
    target_bin=pred_avg_binary,
)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.015819
1,4,0,0.015467
2,12,0,0.031064


In [40]:
# Save the three-model average submission (without the DataFrame index).
submission.to_csv("AVG_submission.csv", index=False)

In [41]:
# Unweighted mean over all four models (random forest included),
# thresholded at the same 0.1 cut-off.
pred_avg_second = (pred_rf + pred_catboost + pred_xgb + pred_lgbm) / 4
pred_avg_second_binary = (pred_avg_second >= 0.1).astype(int)

submission = submission.assign(
    target_prob=pred_avg_second,
    target_bin=pred_avg_second_binary,
)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.029364
1,4,0,0.0216
2,12,0,0.033298


In [42]:
# Save the four-model average submission (without the DataFrame index).
submission.to_csv("AVG_SECOND_submission.csv", index=False)

### Average between models

In [43]:
# CatBoost + XGBoost blend: mean probability, 0.1 cut-off, then save.
pred_cat_xgb = (pred_catboost + pred_xgb) / 2
pred_cat_xgb_binary = (pred_cat_xgb >= 0.1).astype(int)

submission = submission.assign(
    target_prob=pred_cat_xgb,
    target_bin=pred_cat_xgb_binary,
)
submission.to_csv("Cat_XGB_submission.csv", index=False)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.014836
1,4,0,0.014684
2,12,0,0.029988


In [44]:
# CatBoost + LightGBM blend: mean probability, 0.1 cut-off, then save.
pred_cat_lgbm = (pred_catboost + pred_lgbm) / 2
pred_cat_lgbm_binary = (pred_cat_lgbm >= 0.1).astype(int)

submission = submission.assign(
    target_prob=pred_cat_lgbm,
    target_bin=pred_cat_lgbm_binary,
)
submission.to_csv("Cat_LGBM_submission.csv", index=False)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.017549
1,4,0,0.017148
2,12,0,0.033106


In [45]:
# XGBoost + LightGBM blend: mean probability, 0.1 cut-off, then save.
pred_xgb_lgbm = (pred_xgb + pred_lgbm) / 2
pred_xgb_lgbm_binary = (pred_xgb_lgbm >= 0.1).astype(int)

submission = submission.assign(
    target_prob=pred_xgb_lgbm,
    target_bin=pred_xgb_lgbm_binary,
)
submission.to_csv("XGB_LGBM_submission.csv", index=False)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.015071
1,4,0,0.014569
2,12,0,0.030097
