In [1]:
import pandas as pd

TRAIN = '../input/spaceship-titanic/train.csv'
train_df = pd.read_csv(filepath_or_buffer=TRAIN)
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def prepare(input_df: pd.DataFrame) -> np.ndarray:
    result_df = input_df.drop(columns=['Transported', ], errors='ignore').copy(deep=True)
    result_df['Cabin_Deck'] = result_df['Cabin'].apply(func=lambda x: 'Unknown' if isinstance(x, float) else x.split('/')[0])
    result_df['Cabin_Side'] = result_df['Cabin'].apply(func=lambda x: 'Unknown' if isinstance(x, float) else x.split('/')[2])
    for column in ['CryoSleep', 'Destination', 'HomePlanet', 'Cabin_Deck', 'Cabin_Side', 'VIP']:
        result_df[column] = LabelEncoder().fit_transform(y=result_df[column])
    result_df = result_df.drop(columns=['Cabin', 'Name', 'PassengerId',])
    return result_df

# get the labels
y = train_df['Transported'].values
x_df = prepare(input_df=train_df)
imputer = KNNImputer(missing_values=np.nan, n_neighbors=2, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False,)
x = imputer.fit_transform(x_df)

In [3]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 2024
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)
print('we have {} rows of training data and {} rows of validation data'.format(len(x), len(x_test)))

we have 6954 rows of training data and 1739 rows of validation data


In [4]:
from lightgbm import LGBMClassifier

METRIC = ['binary_error', 'binary_logloss'][1]
IMPORTANCE_TYPE = ['gain', 'split'][1]
model = LGBMClassifier(
    boosting_type='gbdt', 
    class_weight=None, 
    colsample_bytree=1.0, 
    early_stopping_round=20,
    deterministic=True,
    importance_type=IMPORTANCE_TYPE,
    learning_rate = 5e-1, # was 1e-2
    max_depth=-1,  
    metric=METRIC,
    min_child_weight=0.001, 
    min_child_samples=20, 
    min_split_gain=0.0, 
    n_estimators=100, 
    n_jobs=None, 
    num_iterations=10000,
    num_leaves=31,# was 31
    objective=None, 
    reg_alpha=0.0, 
    reg_lambda=0.0, 
    seed=RANDOM_STATE, 
    subsample_for_bin=200000, 
    subsample=1.0, 
    subsample_freq=0, 
)
print('build model {}'.format(model))

build model LGBMClassifier(deterministic=True, early_stopping_round=20, learning_rate=0.5,
               metric='binary_logloss', n_jobs=None, num_iterations=10000,
               seed=2024)


In [5]:
model.fit(X=x, y=y, eval_set=(x_test, y_test), feature_name='auto', categorical_feature='auto', )

[1]	valid_0's binary_logloss: 0.542751
[2]	valid_0's binary_logloss: 0.486796
[3]	valid_0's binary_logloss: 0.460277
[4]	valid_0's binary_logloss: 0.444679
[5]	valid_0's binary_logloss: 0.436212
[6]	valid_0's binary_logloss: 0.434277
[7]	valid_0's binary_logloss: 0.428172
[8]	valid_0's binary_logloss: 0.424124
[9]	valid_0's binary_logloss: 0.419904
[10]	valid_0's binary_logloss: 0.419732
[11]	valid_0's binary_logloss: 0.422398
[12]	valid_0's binary_logloss: 0.425261
[13]	valid_0's binary_logloss: 0.42731
[14]	valid_0's binary_logloss: 0.428877
[15]	valid_0's binary_logloss: 0.429964
[16]	valid_0's binary_logloss: 0.429816
[17]	valid_0's binary_logloss: 0.432029
[18]	valid_0's binary_logloss: 0.432184
[19]	valid_0's binary_logloss: 0.432137
[20]	valid_0's binary_logloss: 0.433621
[21]	valid_0's binary_logloss: 0.43282
[22]	valid_0's binary_logloss: 0.433274
[23]	valid_0's binary_logloss: 0.436136
[24]	valid_0's binary_logloss: 0.43734
[25]	valid_0's binary_logloss: 0.439291
[26]	valid_0



LGBMClassifier(deterministic=True, early_stopping_round=20, learning_rate=0.5,
               metric='binary_logloss', n_jobs=None, num_iterations=10000,
               seed=2024)

In [6]:
from plotly import express

express.line(y=model.evals_result_['valid_0'][METRIC])

In [7]:
from plotly import express

express.histogram(data_frame=pd.Series(model.feature_importances_, index=x_df.columns).to_frame().reset_index(), x='index', y=0)

In [8]:
submission_df = pd.read_csv(filepath_or_buffer='../input/spaceship-titanic/test.csv')
submission_data_df = prepare(input_df=submission_df)
output_df = pd.DataFrame({'PassengerId': submission_df['PassengerId'].values,
                       'Transported': model.predict(imputer.transform(submission_data_df)) == 1.0})
output_df.to_csv('submission.csv', index=False)

In [9]:
y_pred = model.predict(X=imputer.transform(submission_data_df))
values, counts = np.unique(y_pred, return_counts=True)
dict(zip(values, counts))
express.line(sorted(y_pred.tolist()), )