Загрузка библиотек

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, average_precision_score
import optuna
import warnings
warnings.filterwarnings('ignore')

Загрузка данных

In [None]:
# Read both splits from the local data/ directory in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_c.csv', 'data/test_c.csv')
)

# Quick sanity check of the table sizes, then a preview of the training rows.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
train.head()

Анализ пропусков

In [None]:
# Report only the columns that actually contain missing values.
# The null counts are computed once per frame and filtered via a callable.
print('Пропуски в train:')
print(train.isnull().sum().loc[lambda counts: counts > 0])
print('\nПропуски в test:')
print(test.isnull().sum().loc[lambda counts: counts > 0])

Обработка данных

In [None]:
def preprocess_data(df, is_train=True):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps:
      1. If an ``ApplicationDate`` column exists, parse it and replace it
         with integer ``Year`` and ``Month`` columns.
      2. Fill gaps in every numeric column with that column's median.
      3. Fill gaps in every object-dtype column with that column's most
         frequent value.

    Args:
        df: Raw input table.
        is_train: Accepted for call-site compatibility; not used by the
            current implementation.

    Returns:
        A new DataFrame with the transformations applied.

    Note:
        Fill values are computed from the frame passed in, so separate calls
        for train and test use different statistics.
    """
    df = df.copy()

    if 'ApplicationDate' in df.columns:
        application_date = pd.to_datetime(df['ApplicationDate'])
        df['Year'] = application_date.dt.year
        df['Month'] = application_date.dt.month
        df = df.drop(columns='ApplicationDate')

    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and is a silent no-op when
    # pandas Copy-on-Write is active.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

# Run the same cleaning routine over both splits.
train_processed, test_processed = (
    preprocess_data(raw_frame, is_train=train_flag)
    for raw_frame, train_flag in ((train, True), (test, False))
)

Кодирование категориальных признаков

In [None]:
# Columns that are integer-encoded before modelling.
categorical_features = [
    'MaritalStatus',
    'HomeOwnershipStatus',
    'LoanPurpose',
    'EmploymentStatus',
    'EducationLevel',
]

# One encoder per column, fitted on the training values only and kept so the
# mapping can be reused or inverted later.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    le.fit(train_processed[col].astype(str))
    label_encoders[col] = le
    train_processed[col] = le.transform(train_processed[col].astype(str))
    # NOTE(review): LabelEncoder.transform raises ValueError for a label it
    # did not see during fit, so a test-only category will stop this cell.
    test_processed[col] = le.transform(test_processed[col].astype(str))

Разделение на признаки и целевую переменную, выделение валидационной выборки

In [None]:
# Target vector and feature matrix from the training table.
y = train_processed['LoanApproved']
X = train_processed.drop(columns='LoanApproved')

# Keep the test identifiers aside; everything else is the test feature matrix.
test_ids = test_processed['ID']
X_test_final = test_processed.drop(columns='ID')

# Hold out 20% of the training rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Масштабирование

In [None]:
# Standardisation statistics are learned from the training fold only and then
# applied unchanged to the validation and test matrices.
# NOTE(review): the *_scaled arrays are not referenced by the cells visible
# after this one, which fit and predict on the unscaled frames.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_final)

Оптимизация гиперпараметров для Gradient Boosting

In [None]:
def objective(trial):
    """Score one Optuna trial of GradientBoostingClassifier hyperparameters.

    The candidate model is fitted on ``X_train``/``y_train`` and scored on the
    held-out ``X_val``/``y_val`` fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Validation ROC-AUC of the fitted candidate (higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'random_state': 42,
    }

    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, preds)


# Seed the sampler so repeated runs explore the same sequence of trials; the
# model itself is already pinned with random_state=42. TPESampler is Optuna's
# default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print(f'Лучший ROC-AUC: {study.best_value:.4f}')
print(f'Лучшие параметры: {study.best_params}')

Обучение финальной модели и оценка на валидационной выборке

In [None]:
# Validation metrics must come from a model that has never seen X_val.
# Fitting on the full X (which contains X_val) and then scoring on X_val
# would report leaked, over-optimistic numbers, so a separate evaluation
# model is fitted on the training fold only.
# The hyperparameters were still selected against X_val, so these metrics
# remain somewhat optimistic.
eval_model = GradientBoostingClassifier(**study.best_params, random_state=42)
eval_model.fit(X_train, y_train)

val_preds_proba = eval_model.predict_proba(X_val)[:, 1]
val_preds_binary = eval_model.predict(X_val)

val_roc_auc = roc_auc_score(y_val, val_preds_proba)
val_precision = precision_score(y_val, val_preds_binary)
val_recall = recall_score(y_val, val_preds_binary)
val_f1 = f1_score(y_val, val_preds_binary)
val_pr_auc = average_precision_score(y_val, val_preds_proba)

print(f'Validation ROC-AUC: {val_roc_auc:.4f}')
print(f'Validation Precision: {val_precision:.4f}')
print(f'Validation Recall: {val_recall:.4f}')
print(f'Validation F1-score: {val_f1:.4f}')
print(f'Validation PR-AUC: {val_pr_auc:.4f}')

# Final model used for the test predictions: same hyperparameters, refitted
# on all labelled data.
best_model = GradientBoostingClassifier(**study.best_params, random_state=42)
best_model.fit(X, y)

Предсказание на тестовой выборке

In [None]:
# Hard class labels from the final model for every test row.
test_predictions = best_model.predict(X_test_final)

# Two-column submission table: row identifier followed by the predicted label.
submission = pd.DataFrame({'ID': test_ids}).assign(
    LoanApproved=test_predictions,
)

# Write without the index so the file holds exactly the two columns.
submission.to_csv('ex_c.csv', index=False)
print('Файл ex_c.csv создан')
submission.head(10)