In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
import pyarrow.parquet as pq

# Location of the training data (Parquet).
file_path = 'train_ai_comp_final_dp.parquet'

# Read the whole file into an Arrow table first ...
parquet_file = pq.ParquetFile(file_path)
table = parquet_file.read()

# ... then convert to pandas and keep only the first 519,000 rows.
df = table.to_pandas().iloc[:519000]


In [19]:
# Count nulls per column and keep only the columns that actually contain some.
null_counts = df.isna().sum()
missing_values = null_counts.loc[null_counts.gt(0)]
missing_values

feature642    314826
feature756    519000
dtype: int64

In [20]:
# Build the cleaned frame as a NEW object instead of aliasing `df`:
# the raw frame stays untouched, so this cell can be re-run safely
# (an in-place drop on the alias raised KeyError on the second run).
# 'feature756' is dropped because it is null in every one of the 519,000 rows.
df_cleaned = df.drop(columns='feature756')

# Fill the missing 'feature642' values with 1.
# Assign the result back rather than calling fillna(..., inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in
# pandas 2.x and does not modify the frame under Copy-on-Write.
df_cleaned['feature642'] = df_cleaned['feature642'].fillna(1)

print(df_cleaned['feature642'])

0         0.0
1         0.0
2         1.0
5         1.0
6         1.0
         ... 
692208    1.0
692210    0.0
692212    1.0
692214    1.0
692215    0.0
Name: feature642, Length: 519000, dtype: float64


In [21]:
# Maximum tolerated share of exact zeros in a column.
threshold = 0.7

# Collect every non-target column whose fraction of zero entries exceeds the threshold.
cols_to_drop = []
for col in df_cleaned.columns:
    if col == 'target':
        continue
    zero_share = df_cleaned[col].eq(0).mean()
    if zero_share > threshold:
        cols_to_drop.append(col)

# Drop them, rebinding df_cleaned to the reduced frame.
df_cleaned = df_cleaned.drop(columns=cols_to_drop)

In [22]:
# Display the cleaned frame (the notebook truncates the rendering of rows and columns).
df_cleaned

Unnamed: 0,id,target,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature8,...,feature997,feature998,feature999,feature1000,feature1001,feature1002,feature1003,feature1004,feature1069,feature1076
0,0,0,1,1761,1759,85,105469,191,46,6,...,0,0,2,1,2,3,6,14,84264,84264
1,1,0,1,1761,1759,120,105610,144,71,135,...,0,0,3,0,2,3,9,22,0,0
2,2,0,1,890,1759,141,105227,191,11,0,...,1,2,3,3,5,8,23,40,113317,113317
5,5,0,1,1599,966,30,102441,191,8,0,...,1,1,2,2,3,3,3,3,8530,8530
6,6,0,1,1761,1759,85,104006,191,2,0,...,1,0,3,1,3,4,10,12,7642,7642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692208,692208,1,1,1660,1,101,104764,191,17,0,...,0,0,2,0,1,1,3,9,0,0
692210,692210,0,1,700,700,77,102153,120,7,290,...,0,0,1,0,0,1,4,7,22169,22169
692212,692212,0,1,690,533,98,103814,191,20,0,...,0,4,2,4,5,6,13,27,0,0
692214,692214,0,1,1664,1759,121,104726,191,5,0,...,0,1,3,1,3,2,9,22,4122,4122


In [23]:
# Remove the 'id' and 'sample_ml_new' columns in a single in-place call,
# so that neither ends up in the feature matrix built from this frame.
df_cleaned.drop(columns=['id', 'sample_ml_new'], inplace=True)

In [24]:
# List the remaining column labels (the target plus the retained features).
df_cleaned.columns

Index(['target', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5',
       'feature6', 'feature8', 'feature9', 'feature12',
       ...
       'feature997', 'feature998', 'feature999', 'feature1000', 'feature1001',
       'feature1002', 'feature1003', 'feature1004', 'feature1069',
       'feature1076'],
      dtype='object', length=795)

In [25]:
from sklearn.model_selection import train_test_split


# Separate the label column from the predictors.
y = df_cleaned['target']
X = df_cleaned.drop(columns='target')

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [26]:
from catboost import CatBoostClassifier

# CatBoost model hyperparameters
cat_params = dict(
    iterations=3400,
    learning_rate=0.01,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    l2_leaf_reg=2.2,
    bootstrap_type='Bernoulli',
    subsample=0.7,
    auto_class_weights='Balanced',
    verbose=200,  # log a progress line every 200 iterations
    loss_function='Logloss',
)

# Build the classifier from the parameter set and train it on the training split.
cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(X_train, y_train)


0:	total: 610ms	remaining: 34m 33s
200:	total: 2m 34s	remaining: 40m 51s
400:	total: 4m 7s	remaining: 30m 48s
600:	total: 5m 25s	remaining: 25m 17s
800:	total: 6m 49s	remaining: 22m 7s


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, classification_report
# Score the hold-out split: positive-class probabilities and hard labels.
test_proba = cat_model.predict_proba(X_test)
y_pred_proba_cat = test_proba[:, 1]
y_pred_cat = cat_model.predict(X_test)

# Per-class precision/recall/F1 on the hard labels, ROC-AUC on the probabilities.
report_cat = classification_report(y_test, y_pred_cat)
auc_cat = roc_auc_score(y_test, y_pred_proba_cat)

print("Classification Report (CatBoost):")
print(report_cat)
print("ROC-AUC Score (CatBoost):", auc_cat)

Classification Report (CatBoost):
              precision    recall  f1-score   support

           0       0.98      0.73      0.84    100104
           1       0.08      0.65      0.15      3696

    accuracy                           0.73    103800
   macro avg       0.53      0.69      0.49    103800
weighted avg       0.95      0.73      0.81    103800

ROC-AUC Score (CatBoost): 0.7599139049068623


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler
# The pipelines below contain a resampling step. scikit-learn's Pipeline
# requires every intermediate step to implement `transform`, which
# RandomOverSampler does not (it exposes `fit_resample`), so building/fitting
# it raises TypeError. imbalanced-learn's Pipeline supports samplers and only
# applies them while fitting, never when predicting.
from imblearn.pipeline import Pipeline as ImbPipeline

# One seed for the sampler and every estimator so the search is reproducible.
GRID_SEARCH_SEED = 42

# Candidate estimators, keyed by the display name used in the logs below.
classifiers = {
    'XGBoost': XGBClassifier(random_state=GRID_SEARCH_SEED),
    'Logistic Regression': LogisticRegression(random_state=GRID_SEARCH_SEED),
    # verbose=0: keep CatBoost from printing a line per iteration for every fit of the search
    'CatBoost': CatBoostClassifier(random_seed=GRID_SEARCH_SEED, verbose=0),
    'Random Forest': RandomForestClassifier(random_state=GRID_SEARCH_SEED)
}

# Hyperparameter grids; the 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'.
params = {
    'XGBoost': {
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5]
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10]
    },
    'CatBoost': {
        'classifier__iterations': [1000, 2000],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__depth': [3, 5]
    },
    'Random Forest': {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 5, 10]
    }
}

# One pipeline per estimator: scale -> oversample the training fold -> classify.
pipelines = {}
for name, classifier in classifiers.items():
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('sampler', RandomOverSampler(random_state=GRID_SEARCH_SEED)),
        ('classifier', classifier)
    ])
    pipelines[name] = pipeline

# 5-fold grid search per pipeline, selecting by ROC-AUC; keep the refitted best pipeline.
best_models = {}
for name, pipeline in pipelines.items():
    print(f"Searching best parameters for {name}...")
    grid_search = GridSearchCV(pipeline, params[name], cv=5, scoring='roc_auc', verbose=2)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Evaluate every tuned pipeline on the hold-out split.
# `classification_report` and `roc_auc_score` come from the sklearn.metrics
# import in the CatBoost evaluation cell.
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    print(f"Classification Report ({name}):")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score ({name}):", roc_auc_score(y_test, y_pred_proba))


Submission

In [None]:
# Load the rows that the trained model will score for the submission.
# NOTE(review): this rebinds `X`, which previously held the training feature
# matrix; from here on `X` no longer refers to the training features.
X = pd.read_parquet("test_sber.parquet")


In [None]:
# Probability cut-off for turning scores into the binary submission label.
SUBMIT_THRESHOLD = 0.1

# Apply the same preprocessing the model saw during training:
#  * keep exactly the columns the model was fitted on, in the same order
#    (a missing column fails loudly with KeyError instead of being mis-scored);
#  * fill missing 'feature642' values with 1, as was done for the training data
#    (the dict form is a no-op if that column is not among the model features).
X_submit = X[cat_model.feature_names_].fillna({'feature642': 1})

# Positive-class probability for each row, then the thresholded binary label.
pred = cat_model.predict_proba(X_submit)
pred = pred[:, 1]
pred_binary = (pred >= SUBMIT_THRESHOLD).astype(int)


In [None]:
# Load the sample submission file and preview its first three rows.
submission = pd.read_csv("sample_submission.csv")
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03


In [None]:
# Overwrite the template's probability and binary-label columns with the model
# output (assigned positionally, so `pred` must be in the template's row order).
submission = submission.assign(target_prob=pred, target_bin=pred_binary)
submission.head(3)

Unnamed: 0,id,target_bin,target_prob
0,3,1,0.231862
1,4,1,0.263742
2,12,1,0.34735


In [None]:
# Write the filled-in submission to disk; index=False keeps the DataFrame index out of the CSV.
submission.to_csv("submission.csv", index=False)