In [32]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from catboost import CatBoostClassifier
import time

In [33]:
train_data = pd.read_csv('../pre_data/pre_training.csv')
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,cabin_deck,cabin_num,cabin_side
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0.0,P
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0.0,S
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0.0,S
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0.0,S
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,A,98.0,P
8689,9278_01,Earth,1,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,G,1499.0,S
8690,9279_01,Earth,0,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,G,1500.0,S
8691,9280_01,Europa,0,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,E,608.0,S


In [34]:
test_data = pd.read_csv('../pre_data/pre_test.csv')
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,cabin_deck,cabin_num,cabin_side
0,0013_01,Earth,1,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,G,3.0,S
1,0018_01,Earth,0,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,F,4.0,S
2,0019_01,Europa,1,55 Cancri e,31.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,C,0.0,S
3,0021_01,Europa,0,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,C,1.0,S
4,0023_01,Earth,0,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,F,5.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,1,TRAPPIST-1e,34.0,0.0,0.0,0.0,0.0,0.0,0.0,Jeron Peter,G,1496.0,S
4273,9269_01,Earth,0,TRAPPIST-1e,42.0,0.0,0.0,847.0,17.0,10.0,144.0,Matty Scheron,G,1044.0,S
4274,9271_01,Mars,1,55 Cancri e,26.0,0.0,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,D,296.0,P
4275,9273_01,Europa,0,55 Cancri e,26.0,0.0,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,D,297.0,P


In [35]:
for df in [train_data, test_data]:
    df["VIP"] = df["VIP"].astype(str).str.lower().map({
        "true": 1.0,
        "false": 0.0,
        "1": 1.0,
        "0": 0.0,
        "1.0": 1.0,
        "0.0": 0.0
    }).fillna(0.0).astype(float)

In [36]:
train_data = train_data.drop(columns='Name')
test_data = test_data.drop(columns='Name')

### Feature Engineering

---

In [37]:
# ---------- TOTAL SPENDING ----------
train_data["TotalSpent"] = (
    train_data["RoomService"] +
    train_data["FoodCourt"] +
    train_data["ShoppingMall"] +
    train_data["Spa"] +
    train_data["VRDeck"]
)

test_data["TotalSpent"] = (
    test_data["RoomService"] +
    test_data["FoodCourt"] +
    test_data["ShoppingMall"] +
    test_data["Spa"] +
    test_data["VRDeck"]
)

In [38]:
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,cabin_deck,cabin_num,cabin_side,TotalSpent
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,B,0.0,P,0.0
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,F,0.0,S,736.0
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,A,0.0,S,10383.0
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,A,0.0,S,5176.0
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,F,1.0,S,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,0,A,98.0,P,8536.0
8689,9278_01,Earth,1,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0,G,1499.0,S,0.0
8690,9279_01,Earth,0,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,1,G,1500.0,S,1873.0
8691,9280_01,Europa,0,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,0,E,608.0,S,4637.0


In [39]:
psg_id = train_data['PassengerId']
train_data = train_data.drop(columns='PassengerId')

In [40]:
y = train_data['Transported']
X = train_data.drop(columns='Transported')
bool_cols = X.select_dtypes(include='bool').columns
for col in bool_cols:
    X[col] = X[col].astype(int)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.195, stratify=y, random_state=42)
cat_features = ["HomePlanet", "Destination", "cabin_deck", "cabin_side"]

for col in cat_features:
    X_train[col] = X_train[col].astype(str).fillna("Missing")
    X_test[col]  = X_test[col].astype(str).fillna("Missing")

In [42]:
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=42
)

param_grid = {
    'iterations': [1500, 2000, 2300, 2500],
    'learning_rate': [0.015, 0.16, 0.17, 0.18, 0.02],
    'depth': [5, 6, 7, 8],
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1
)

In [43]:
start = time.time()
grid.fit(X_train, y_train, cat_features=cat_features, verbose=True)
end = time.time()

0:	total: 20.7ms	remaining: 41.3s
1:	total: 43.7ms	remaining: 43.7s
2:	total: 65.3ms	remaining: 43.5s
3:	total: 81.8ms	remaining: 40.8s
4:	total: 101ms	remaining: 40.5s
5:	total: 121ms	remaining: 40.4s
6:	total: 142ms	remaining: 40.5s
7:	total: 162ms	remaining: 40.4s
8:	total: 183ms	remaining: 40.4s
9:	total: 203ms	remaining: 40.4s
10:	total: 224ms	remaining: 40.4s
11:	total: 245ms	remaining: 40.6s
12:	total: 265ms	remaining: 40.6s
13:	total: 289ms	remaining: 41s
14:	total: 309ms	remaining: 40.8s
15:	total: 332ms	remaining: 41.2s
16:	total: 353ms	remaining: 41.2s
17:	total: 373ms	remaining: 41.1s
18:	total: 397ms	remaining: 41.4s
19:	total: 420ms	remaining: 41.6s
20:	total: 443ms	remaining: 41.8s
21:	total: 470ms	remaining: 42.3s
22:	total: 492ms	remaining: 42.3s
23:	total: 515ms	remaining: 42.4s
24:	total: 536ms	remaining: 42.3s
25:	total: 557ms	remaining: 42.3s
26:	total: 578ms	remaining: 42.3s
27:	total: 601ms	remaining: 42.3s
28:	total: 622ms	remaining: 42.3s
29:	total: 642ms	remai

In [44]:
print(grid.best_params_)

{'depth': 6, 'iterations': 2000, 'learning_rate': 0.015}


In [45]:
print("Training time:", end - start, "seconds")

Training time: 2664.5554807186127 seconds


In [46]:
best_model = grid.best_estimator_

In [47]:
X_test = test_data.copy()
passenger_ids = X_test["PassengerId"]
X_test = X_test.drop(columns=["PassengerId"])
for col in cat_features:
    X_test[col] = X_test[col].astype(str).fillna("Missing")

In [48]:
pred = best_model.predict(X_test)

In [49]:
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": pred.astype(bool)
})
submission.to_csv("submission.csv", index=False)

In [50]:
# Вземаме важностите
importances = model.get_feature_importance()
feature_names = X_train.columns

# Създаваме DataFrame
fi_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

# Показваме таблицата
print(fi_df)

# Графика (по избор)
plt.figure(figsize=(10, 12))
plt.barh(fi_df["feature"], fi_df["importance"])
plt.gca().invert_yaxis()
plt.title("CatBoost Feature Importances")
plt.xlabel("Importance")
plt.show()

CatBoostError: Model has no meta information needed to calculate feature importances.                             Pass training dataset to this function.