In [1]:
import pandas as pd

TRAIN = '/kaggle/input/spaceship-titanic/train.csv'

# Keep the untouched frame around so the imputation below stays re-runnable
# and the original NaNs remain inspectable.
train_raw = pd.read_csv(filepath_or_buffer=TRAIN)

# Deliberately naive baseline imputation: every missing value becomes 0.
train_df = train_raw.fillna(0)

# Cabin looks like "B/0/P": the first character is the deck, the last one the side.
# Derive both from the *raw* column (before the 0-fill) so a missing cabin is a
# real NaN here instead of an integer 0 hiding in a string column; '-' marks
# "cabin unknown".
train_df['deck'] = train_raw['Cabin'].str[0].fillna('-')
train_df['side'] = train_raw['Cabin'].str[-1].fillna('-')
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,S


Let's see how the features we have introduced break down according to the target variable.

In [2]:
import warnings
from plotly import express

# Suppress FutureWarning messages from here on so they do not clutter the plots.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# One histogram per derived cabin feature, coloured by the target.
for cabin_feature in ('deck', 'side'):
    figure = express.histogram(data_frame=train_df, x=cabin_feature, color='Transported')
    figure.show()

The differences in the Transported split across decks and sides are small but noticeable.

In [3]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features passed to the model as-is.
COLUMNS = ['CryoSleep', 'Age', 'RoomService', 'VIP', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Categorical features that are one-hot encoded with pd.get_dummies.
CATS = ['deck', 'side', 'HomePlanet', 'Destination']

TARGET = 'Transported'

# Stratified 80/20 hold-out split with a fixed seed. The whole frame is split
# (target column included); only COLUMNS and CATS are selected as inputs below.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_df[TARGET],
    test_size=0.20,
    random_state=2024,
    stratify=train_df[TARGET],
)

XX_train = pd.concat(axis='columns', objs=[X_train[COLUMNS], pd.get_dummies(X_train[CATS])])
XX_test = pd.concat(axis='columns', objs=[X_test[COLUMNS], pd.get_dummies(X_test[CATS])])
# The two splits are encoded independently, so a category level that is missing
# from one of them would produce a different set of dummy columns. Force the
# hold-out matrix onto the training layout: unseen-in-test levels become
# all-zero columns, levels the model never saw are dropped.
XX_test = XX_test.reindex(columns=XX_train.columns, fill_value=0)

time_start = arrow.now()
regression = LogisticRegression(max_iter=1400, tol=1e-12).fit(X=XX_train, y=y_train)
print('model fit in {} iterations took {}'.format(regression.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=regression.predict(X=XX_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 789 iterations took 0:00:01.104066
accuracy: 0.7890
model done in 0:00:01.113219


In [4]:
from plotly import express

# The coefficients follow the column order of the matrix the model was fitted
# on, so take the labels from XX_train rather than from the hold-out matrix.
feature_names = XX_train.columns.tolist()
express.histogram(x=feature_names, y=regression.coef_[0])

In [5]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the fitted model on the hold-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=regression.predict(X=XX_test),
    )
)

              precision    recall  f1-score   support

       False       0.81      0.76      0.78       863
        True       0.77      0.82      0.80       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



Let's make a submission.

In [6]:
TEST = '/kaggle/input/spaceship-titanic/test.csv'

# Prepare the competition test set the same way as the training data:
# every NaN becomes 0, and deck/side are the first/last character of the raw
# Cabin string ("B/0/P" -> deck "B", side "P"), with '-' marking a missing cabin.
test_raw = pd.read_csv(filepath_or_buffer=TEST)
test_df = test_raw.fillna(0)
test_df['deck'] = test_raw['Cabin'].str[0].fillna('-')
test_df['side'] = test_raw['Cabin'].str[-1].fillna('-')

# Take an explicit copy: a column is added to this frame below, and assigning
# into a slice of test_df would raise SettingWithCopyWarning.
result_df = test_df[['PassengerId']].copy()

test_df = pd.concat(axis='columns', objs=[test_df[COLUMNS], pd.get_dummies(test_df[CATS])])
# get_dummies only creates columns for levels present in *this* file. Align the
# matrix with the columns the model was trained on (same set, same order) so
# predict() receives exactly the features it was fitted with.
test_df = test_df.reindex(columns=XX_train.columns, fill_value=0)

result_df['Transported'] = regression.predict(X=test_df)
result_df.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)