In [1]:
import pandas as pd

TRAIN = '../input/spaceship-titanic/train.csv'
train_df = pd.read_csv(filepath_or_buffer=TRAIN)
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def prepare(input_df: pd.DataFrame) -> np.ndarray:
    result_df = input_df.drop(columns=['Transported', ], errors='ignore').copy(deep=True)
    result_df['Cabin_Deck'] = result_df['Cabin'].apply(func=lambda x: np.nan if isinstance(x, float) else x.split('/')[0])
    result_df['Cabin_Side'] = result_df['Cabin'].apply(func=lambda x: np.nan if isinstance(x, float) else x.split('/')[2])
    result_df['Party'] = result_df['PassengerId'].apply(func=lambda x: np.nan if isinstance(x, float) else int(x.split('_')[0]))
    result_df['Person'] = result_df['PassengerId'].apply(func=lambda x: np.nan if isinstance(x, float) else int(x.split('_')[1]))
    for column in ['CryoSleep', 'Destination', 'HomePlanet', 'Cabin_Deck', 'Cabin_Side', 'VIP', ]:
        result_df[column] = LabelEncoder().fit_transform(y=result_df[column])
    result_df = result_df.drop(columns=['Cabin', 'Name', 'PassengerId',])
    return result_df

y = train_df['Transported'].values
imputer = KNNImputer(missing_values=np.nan, n_neighbors=2, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False,)
x_df = prepare(input_df=train_df)
x = imputer.fit_transform(X=x_df)

In [3]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 2024
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
print('we have {} rows of training data and {} rows of validation data'.format(len(x), len(x_test)))

we have 6954 rows of training data and 1739 rows of validation data


In [4]:
from lightgbm import LGBMClassifier

METRIC = ['binary_error', 'binary_logloss'][1]
IMPORTANCE_TYPE = ['gain', 'split'][1]
model = LGBMClassifier(
    boosting_type='gbdt', 
    class_weight=None, 
    colsample_bytree=1.0,
    device_type='cpu',
    early_stopping_round=20,
    deterministic=True,
    importance_type=IMPORTANCE_TYPE,
    learning_rate = 1e-2,
    max_depth=-1,  
    metric=METRIC,
    min_child_weight=1e-2, 
    min_child_samples=20, 
    min_split_gain=0.0, 
    n_estimators=500, 
    n_jobs=None, 
    num_boost_round=10000,
    num_leaves=31,# was 31
    objective=None,
    random_state=RANDOM_STATE,
    reg_alpha=0.0, 
    reg_lambda=0.0, 
    seed=RANDOM_STATE, 
    subsample_for_bin=2000, 
    subsample=1.0, 
    subsample_freq=0, 
    use_missing=False,
)
# print('build model {}'.format(model))
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.01,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.01,
 'min_split_gain': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 2024,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 2000,
 'subsample_freq': 0,
 'device_type': 'cpu',
 'early_stopping_round': 20,
 'deterministic': True,
 'metric': 'binary_logloss',
 'num_boost_round': 10000,
 'seed': 2024,
 'use_missing': False}

In [5]:
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning, module='lightgbm.engine', lineno=177)

model.fit(X=x, y=y, eval_set=(x_test, y_test), feature_name='auto', categorical_feature='auto', )
# model.fit(X=x, y=y, eval_set=(x_test, y_test),  categorical_feature=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side', 'Party', 'Person'], )

[1]	valid_0's binary_logloss: 0.688972
[2]	valid_0's binary_logloss: 0.684911
[3]	valid_0's binary_logloss: 0.680933
[4]	valid_0's binary_logloss: 0.677036
[5]	valid_0's binary_logloss: 0.673218
[6]	valid_0's binary_logloss: 0.669471
[7]	valid_0's binary_logloss: 0.665847
[8]	valid_0's binary_logloss: 0.662255
[9]	valid_0's binary_logloss: 0.658713
[10]	valid_0's binary_logloss: 0.655342
[11]	valid_0's binary_logloss: 0.651931
[12]	valid_0's binary_logloss: 0.648577
[13]	valid_0's binary_logloss: 0.645323
[14]	valid_0's binary_logloss: 0.642104
[15]	valid_0's binary_logloss: 0.638952
[16]	valid_0's binary_logloss: 0.635753
[17]	valid_0's binary_logloss: 0.632733
[18]	valid_0's binary_logloss: 0.62973
[19]	valid_0's binary_logloss: 0.626844
[20]	valid_0's binary_logloss: 0.623856
[21]	valid_0's binary_logloss: 0.620973
[22]	valid_0's binary_logloss: 0.618209
[23]	valid_0's binary_logloss: 0.615512
[24]	valid_0's binary_logloss: 0.61284
[25]	valid_0's binary_logloss: 0.6102
[26]	valid_0'

LGBMClassifier(deterministic=True, device_type='cpu', early_stopping_round=20,
               learning_rate=0.01, metric='binary_logloss',
               min_child_weight=0.01, n_estimators=500, n_jobs=None,
               num_boost_round=10000, random_state=2024, seed=2024,
               subsample_for_bin=2000, use_missing=False)

In [6]:
from plotly import express

express.line(data_frame=pd.DataFrame.from_dict(orient='columns', data=dict(model.evals_result_['valid_0'])), y=METRIC)

In [7]:
from plotly import express

express.histogram(data_frame=pd.Series(model.feature_importances_, index=x_df.columns).to_frame().reset_index(), x='index', y=0)

In [8]:
submission_df = pd.read_csv(filepath_or_buffer='../input/spaceship-titanic/test.csv')
submission_data_df = prepare(input_df=submission_df)
output_df = pd.DataFrame({'PassengerId': submission_df['PassengerId'].values,
                       'Transported': model.predict(imputer.transform(submission_data_df)) == 1.0})
output_df.to_csv('submission.csv', index=False)

In [9]:
y_pred = model.predict(X=imputer.transform(submission_data_df))
values, counts = np.unique(y_pred, return_counts=True)
dict(zip(values, counts))
express.line(sorted(y_pred.tolist()), )