In [None]:
import numpy as np
import pandas as pd 
import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

/kaggle/input/nnpreds/neural3.csv
/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [None]:
# Read the labelled (train) and unlabelled (test) competition files.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    ),
)

In [None]:
# Preview the first rows of the training frame to check how the CSV was parsed.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Summarise each column's dtype and non-null count to see where values are missing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Datatype transformations and basic feature engineering

In [None]:
# Convert the boolean-like columns to compact integer dtypes.
#
# The converted Series is assigned back to the frame rather than calling
# ``replace(..., inplace=True)`` on a column selection: that form is chained
# in-place modification, which pandas deprecates and which has no effect on the
# parent frame once Copy-on-Write is enabled.
BOOL_TO_INT = {False: 0, True: 1}

# The target is a fully populated bool column, so a plain int cast suffices.
train['Transported'] = train['Transported'].astype(int)

# VIP and CryoSleep contain missing values; unmapped (missing) entries stay
# missing after ``map`` and become <NA> in the nullable Int8 dtype.
for frame in (train, test):
    for col in ('VIP', 'CryoSleep'):
        frame[col] = frame[col].map(BOOL_TO_INT).astype('Int8')

In [None]:
# Break the slash-separated Cabin string into three feature columns and then
# remove the raw Cabin column from both frames.
cabin_parts = ['deck', 'num', 'side']
train[cabin_parts] = train['Cabin'].str.split('/', expand=True)
test[cabin_parts] = test['Cabin'].str.split('/', expand=True)

train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')

In [None]:
# Spending columns that are aggregated per passenger.
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Row-wise total and largest single amount across the spending columns
# (pandas skips missing values in both reductions by default).
for frame in (train, test):
    spend = frame[col_to_sum]
    frame['SumSpends'] = spend.sum(axis=1)
    frame['MaxSpends'] = spend.max(axis=1)

In [None]:
# Log-transform the total spend. ``np.log1p(x)`` computes log(1 + x) directly,
# which is the idiomatic and numerically more accurate form of ``np.log(x + 1)``.
train['log_spend'] = np.log1p(train['SumSpends'])
test['log_spend'] = np.log1p(test['SumSpends'])

In [None]:
# Names of the training columns that contain at least one missing value,
# ordered from most to fewest missing entries.
# The threshold is ``> 0`` so that a column with exactly one missing value is
# not silently left out of the later imputation step.
null_counts = train.isnull().sum().sort_values(ascending=False)
null_cols = null_counts[null_counts > 0].index.tolist()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Columns stored as strings/categories; these get integer-like ordinal codes.
object_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()

# Remember where the training rows end instead of hard-coding the row count,
# so the split below stays correct if the input data changes.
n_train = len(train)

# Encode train and test together so both share one category -> code mapping.
df_for_encode = pd.concat([train, test])
df_for_encode[object_cols] = df_for_encode[object_cols].astype('category')
oc = OrdinalEncoder()
df_for_encode[object_cols] = oc.fit_transform(df_for_encode[object_cols])

# Split back into independent frames. The explicit copy (and the copy returned
# by ``drop``) means later column assignments do not operate on slices of the
# combined frame.
train = df_for_encode.iloc[:n_train].copy()
# The target column only exists for the training rows; it is all-NaN for test.
test = df_for_encode.iloc[n_train:].drop(columns='Transported')
del df_for_encode

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Fill the columns listed in null_cols with their column means. The means are
# learned from the training frame and then re-applied unchanged to the test frame.
mean_imputer = SimpleImputer(strategy='mean')
ct = ColumnTransformer(transformers=[("imp", mean_imputer, null_cols)])

train[null_cols] = ct.fit_transform(train[null_cols])
test[null_cols] = ct.transform(test[null_cols])

In [None]:
# The passenger identifier is not used as a model feature.
train = train.drop(columns='PassengerId')
test = test.drop(columns='PassengerId')

# Separate the target from the features; the test frame is already feature-only.
y_train = train['Transported']
X_train = train.drop(columns='Transported')
X_test = test

# Sanity check: both feature matrices must have the same number of columns.
same_width = X_train.shape[1] == X_test.shape[1]
print(
    'Shapes are equal. We are ready to train models.'
    if same_width
    else 'There is something wrong in preprocessing steps.'
)

Shapes are equal. We are ready to train models.


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# 4-fold stratified cross-validation of the CatBoost configuration.
# `cv` is also used by the XGBoost and LightGBM validation cells further down.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=34)
fold_accuracy = []
for tr_idx, val_idx in cv.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    model = CatBoostClassifier(eval_metric='Accuracy', verbose=0, rsm=0.82, iterations=700)
    model.fit(X_tr, y_tr)
    pred_fold = model.predict(X_val)
    # scikit-learn's signature is accuracy_score(y_true, y_pred); pass the
    # arguments in that order, matching the ensemble cells in this notebook.
    fold_accuracy.append(accuracy_score(y_val, pred_fold))

print(f'Accuracy = {np.mean(fold_accuracy)}')
print(f'fold accuracy: {fold_accuracy}')

Accuracy = 0.8135293755299948
fold accuracy: [0.8026678932842686, 0.8177634606534745, 0.8039576622181316, 0.829728485964105]


In [None]:
# Collects one column of column-1 `predict_proba` outputs per fitted model.
predictions = pd.DataFrame()

# Fit ten CatBoost models on the full training set, differing only in seed.
for i in range(10):
    seed = 7 * i + 4
    catb = CatBoostClassifier(
        iterations=700,
        rsm=0.82,
        eval_metric='Accuracy',
        verbose=0,
        random_seed=seed,
    )
    catb.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, catb.predict(X_train))
    print(f"Train accuracy: {train_acc}")
    predictions[f"CATB_{i}"] = catb.predict_proba(X_test)[:, 1]

Train accuracy: 0.882319107327735
Train accuracy: 0.8836995283561486
Train accuracy: 0.8847348441274588
Train accuracy: 0.8855400897273669
Train accuracy: 0.8851949844702635
Train accuracy: 0.8839295985275509
Train accuracy: 0.8838145634418497
Train accuracy: 0.8836995283561486
Train accuracy: 0.8836995283561486
Train accuracy: 0.8855400897273669


In [None]:
from xgboost import XGBClassifier

# Cross-validate the XGBoost configuration on the same folds object (`cv`)
# that the CatBoost validation cell created.
fold_accuracy = []
for tr_idx, val_idx in cv.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    model = XGBClassifier(
        max_depth=4,
        subsample=0.75,
        n_estimators=550,
        learning_rate=0.03,
        min_child_weight=0.9,
        random_state=1,
    )
    model.fit(X_tr, y_tr)
    pred_fold = model.predict(X_val)
    # scikit-learn's signature is accuracy_score(y_true, y_pred); pass the
    # arguments in that order, matching the ensemble cells in this notebook.
    fold_accuracy.append(accuracy_score(y_val, pred_fold))

print(f'Accuracy = {np.mean(fold_accuracy)}')
print(f'fold accuracy: {fold_accuracy}')

Accuracy = 0.8073176658759698
fold accuracy: [0.7948482060717571, 0.8191440404970087, 0.7975149562816383, 0.8177634606534745]


In [None]:
# Fit ten XGBoost models on the full training set, differing only in seed, and
# store each model's column-1 `predict_proba` output for the test features.
for i in range(10):
    seed = 2 * i + 1
    xgb = XGBClassifier(
        max_depth=4,
        subsample=0.75,
        n_estimators=550,
        learning_rate=0.03,
        min_child_weight=0.9,
        random_state=seed,
    )
    xgb.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, xgb.predict(X_train))
    print(f"Train Accuracy: {train_acc}")
    predictions[f"XGB_{i}"] = xgb.predict_proba(X_test)[:, 1]

Train Accuracy: 0.8564362130449787
Train Accuracy: 0.8547106867594616
Train Accuracy: 0.8549407569308639
Train Accuracy: 0.853330265731048
Train Accuracy: 0.8554008972736684
Train Accuracy: 0.8549407569308639
Train Accuracy: 0.8556309674450707
Train Accuracy: 0.8551708271022662
Train Accuracy: 0.8556309674450707
Train Accuracy: 0.8544806165880594


In [None]:
from lightgbm import LGBMClassifier

# Cross-validate the LightGBM configuration on the same folds object (`cv`)
# that the CatBoost validation cell created.
fold_accuracy = []
for tr_idx, val_idx in cv.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    model = LGBMClassifier(
        min_child_weight=0.8,
        random_state=1,
        n_estimators=600,
        learning_rate=0.01,
        subsample=0.7,
        subsample_freq=1,
        colsample_bytree=0.85,
    )
    model.fit(X_tr, y_tr)
    pred_fold = model.predict(X_val)
    # scikit-learn's signature is accuracy_score(y_true, y_pred); pass the
    # arguments in that order, matching the ensemble cells in this notebook.
    fold_accuracy.append(accuracy_score(y_val, pred_fold))

print(f'Accuracy = {np.mean(fold_accuracy)}')
print(f'fold accuracy: {fold_accuracy}')

Accuracy = 0.8104228592016007
fold accuracy: [0.8045078196872125, 0.8117809479981593, 0.8062586286240221, 0.8191440404970087]


In [None]:
# Fit ten LightGBM models on the full training set, differing only in seed, and
# store each model's column-1 `predict_proba` output for the test features.
for i in range(10):
    seed = 5 * i + 6
    lgb = LGBMClassifier(
        min_child_weight=0.8,
        random_state=seed,
        n_estimators=600,
        learning_rate=0.01,
        subsample=0.7,
        subsample_freq=1,
        colsample_bytree=0.85,
    )
    lgb.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, lgb.predict(X_train))
    print(f"Train Accuracy: {train_acc}")
    predictions[f"LGB_{i}"] = lgb.predict_proba(X_test)[:, 1]

Train Accuracy: 0.8620729322443346
Train Accuracy: 0.8632232831013459
Train Accuracy: 0.8631082480156448
Train Accuracy: 0.8634533532727482
Train Accuracy: 0.8631082480156448
Train Accuracy: 0.864143563786955
Train Accuracy: 0.8631082480156448
Train Accuracy: 0.8603474059588174
Train Accuracy: 0.8637984585298516
Train Accuracy: 0.8631082480156448


In [None]:
# Display the collected test-set probabilities: one column per fitted model
# (CATB_*, XGB_*, LGB_*), each taken from column 1 of `predict_proba`.
predictions

Unnamed: 0,CATB_0,CATB_1,CATB_2,CATB_3,CATB_4,CATB_5,CATB_6,CATB_7,CATB_8,CATB_9,...,LGB_0,LGB_1,LGB_2,LGB_3,LGB_4,LGB_5,LGB_6,LGB_7,LGB_8,LGB_9
0,0.497882,0.510471,0.506180,0.506108,0.508351,0.493178,0.488667,0.499579,0.516247,0.500527,...,0.503036,0.484318,0.504573,0.488331,0.473934,0.500993,0.483331,0.506074,0.495155,0.478201
1,0.008502,0.009995,0.009930,0.009653,0.010118,0.010085,0.012606,0.009440,0.009513,0.007029,...,0.027696,0.023711,0.028096,0.025474,0.026661,0.024078,0.025711,0.025755,0.026196,0.026953
2,0.987550,0.987205,0.989135,0.989356,0.988090,0.991690,0.983654,0.991016,0.988097,0.988860,...,0.986077,0.985884,0.986530,0.985316,0.985799,0.985372,0.984801,0.986628,0.985154,0.985810
3,0.958154,0.978107,0.975138,0.963491,0.982041,0.965070,0.958512,0.961243,0.977743,0.976171,...,0.979216,0.976977,0.977401,0.975618,0.977401,0.977937,0.979969,0.978059,0.980742,0.977690
4,0.441696,0.404784,0.467650,0.445773,0.437000,0.428519,0.398749,0.498477,0.381234,0.459468,...,0.555487,0.559281,0.559480,0.514732,0.555586,0.538672,0.555605,0.566818,0.522659,0.570601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.567760,0.567971,0.579312,0.568902,0.574212,0.567316,0.570723,0.609953,0.569184,0.577594,...,0.582040,0.569525,0.563353,0.570644,0.557133,0.576950,0.563570,0.567219,0.559196,0.555340
4273,0.392227,0.337847,0.354892,0.359964,0.354514,0.355536,0.340354,0.346836,0.316497,0.383349,...,0.347620,0.345438,0.348926,0.313218,0.311667,0.348509,0.372706,0.358807,0.342450,0.334307
4274,0.927754,0.933207,0.922581,0.920993,0.920594,0.929109,0.913531,0.923110,0.935889,0.911757,...,0.948987,0.954905,0.951652,0.947809,0.946062,0.942756,0.956614,0.955638,0.952143,0.948141
4275,0.812156,0.816479,0.771336,0.790336,0.821816,0.823036,0.810554,0.869337,0.825411,0.852225,...,0.797946,0.754107,0.773352,0.763374,0.747373,0.740391,0.755998,0.764890,0.761909,0.757847


In [None]:
# Build the submission from the seed ensemble stored in `predictions`.
#
# Each column of `predictions` holds one model's column-1 `predict_proba`
# output for X_test, so the row-wise mean is the blended probability for
# every test passenger. Predicting on the test features (rather than on a
# cross-validation fold) keeps the output aligned row-for-row with the test set.
blend_proba = predictions.mean(axis=1)

# PassengerId was ordinal-encoded and then dropped during preprocessing, so the
# original identifiers are re-read from the raw test file. The test rows were
# never reordered, so positions line up with `predictions`.
passenger_ids = pd.read_csv(
    '/kaggle/input/spaceship-titanic/test.csv', usecols=['PassengerId']
)['PassengerId']

if len(passenger_ids) != len(blend_proba):
    raise ValueError(
        f'Row count mismatch: {len(passenger_ids)} passenger ids vs '
        f'{len(blend_proba)} blended predictions.'
    )

# NOTE(review): the competition's sample_submission.csv is expected to contain
# exactly the columns PassengerId and Transported (boolean) — confirm.
final = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Transported': blend_proba.to_numpy() > 0.5,
})
final.to_csv('final_output.csv', index=False)