## Basic model

In [52]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

df = pd.read_csv('train.csv')

# important_features = columns = [col for col in df.columns if col not in ['feature1', 'feature8', 'target']]
# print(important_features)
X = df.drop('target', axis=1)
y = df['target']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='mean') 
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("Before SMOTE:", Counter(y_train))

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_imputed, y_train)

print("After SMOTE:", Counter(y_train_res))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test_imputed)

Before SMOTE: Counter({1: 694, 0: 106})
After SMOTE: Counter({1: 694, 0: 694})


In [62]:
## Model setup section TODO
param_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.1, 0.025, 0.4],
    'iterations': [500, 550, 600],
    'l2_leaf_reg': [5, 7, 9]
}

model = CatBoostClassifier(verbose=0)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=cv_strategy, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train_res)

print("Best Parameters:", grid_search.best_params_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, predictions)
print(f'Accuracy: {accuracy}, ROC-AUC: {roc_auc}')

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'depth': 8, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Accuracy: 0.91, ROC-AUC: 0.8010610079575597


In [63]:
feature_importances = best_model.feature_importances_
feature_names = X.columns
feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print(feature_importances_df.sort_values(by='Importance', ascending=False))

      Feature  Importance
7    feature8   17.564503
11  feature12   14.135845
0    feature1   10.710500
6    feature7   10.202093
2    feature3    9.892006
3    feature4    9.638696
9   feature10    6.344784
12  feature13    5.939379
10  feature11    5.144176
4    feature5    4.268038
5    feature6    2.508020
8    feature9    1.911199
1    feature2    1.740760


In [64]:
new_data = pd.read_csv('test.csv')
# new_data=new_data[important_features + ['Id']] # FIXME in case of feature importance change
new_data_imputed = imputer.transform(new_data.drop('Id', axis=1))
new_data_scaled = scaler.transform(new_data_imputed)
new_predictions = best_model.predict(new_data_scaled)

submission = pd.DataFrame({'Id': new_data['Id'], 'target': new_predictions})
submission.to_csv('final_submission_advanced_13_late_submission.csv', index=False)
print("Model predictions saved.")

Model predictions saved.
