In [1]:
from arrow import now
from pandas import get_dummies
from pandas import read_csv
from pandas import DataFrame
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the training split from the competition input folder and log its dimensions.
train_fname = '../input/spaceship-titanic/train.csv'
train_df = read_csv(train_fname)
train_shape = train_df.shape
print('{}: train shape: {}'.format(now(), train_shape))

2022-12-12T19:09:59.026598+00:00: train shape: (8693, 14)


In [3]:
# use a function to prepare our data so we can apply the same transformation to train and test
def prepare(input_df: DataFrame) -> DataFrame:
    """Turn a raw passenger frame into an all-numeric feature frame.

    Steps, in order:
      1. keep the six float columns, the four categorical columns and 'Cabin';
      2. derive 'Cabin_Deck' (first '/'-separated field) and 'Cabin_Side'
         (third field) from 'Cabin', using 'Unknown' when the value is not a
         string or the field is absent;
      3. for each float column add a '<column>_isnull' 0/1 flag and fill the
         missing values with that column's mean;
      4. one-hot encode the categorical columns and drop 'Cabin'.

    NOTE: the means and the one-hot columns are derived from ``input_df``
    itself, so two different frames are imputed with their own means and may
    yield different dummy columns.

    :param input_df: frame containing at least the columns named above.
    :return: a new frame; ``input_df`` is not modified.
    :raises KeyError: if one of the required columns is missing.
    """
    columns_float = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    columns_categorical = ['CryoSleep', 'Destination', 'HomePlanet', 'VIP']
    columns_other = ['Cabin']
    unknown = 'Unknown'

    def cabin_part(cabin, position: int) -> str:
        # Missing cabins arrive as NaN (a float) from read_csv, but may also be None;
        # anything that is not a string is treated as unknown instead of crashing.
        if not isinstance(cabin, str):
            return unknown
        parts = cabin.split('/')
        # Guard against cabin strings with fewer fields than expected.
        return parts[position] if position < len(parts) else unknown

    result_df = input_df[columns_float + columns_categorical + columns_other].copy(deep=True)
    result_df['Cabin_Deck'] = result_df['Cabin'].apply(lambda cabin: cabin_part(cabin, 0))
    result_df['Cabin_Side'] = result_df['Cabin'].apply(lambda cabin: cabin_part(cabin, 2))
    columns_dummies = columns_categorical + ['Cabin_Deck', 'Cabin_Side']
    # fill in and tag missing values for the float columns
    for column in columns_float:
        # Series.mean() skips missing values by default
        mean_value = result_df[column].mean()
        result_df[column + '_isnull'] = result_df[column].isnull().astype(int)
        result_df[column] = result_df[column].fillna(mean_value)
    # add dummy values; the raw 'Cabin' text is no longer needed afterwards
    result_df = get_dummies(data=result_df, columns=columns_dummies).drop(columns=columns_other)
    return result_df

# Feature matrix for the training split; the model cell fits on this frame.
prepared_df = prepare(train_df)

In [4]:
# Inspect the raw training frame: per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Inspect the prepared frame to check that imputation left no nulls and which columns were added.
prepared_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   Age_isnull                 8693 non-null   int64  
 7   RoomService_isnull         8693 non-null   int64  
 8   FoodCourt_isnull           8693 non-null   int64  
 9   ShoppingMall_isnull        8693 non-null   int64  
 10  Spa_isnull                 8693 non-null   int64  
 11  VRDeck_isnull              8693 non-null   int64  
 12  CryoSleep_False            8693 non-null   uint8  
 13  CryoSleep_True             8693 non-null   uint8

In [6]:
# Pick one of the candidate models by index, fit it on the prepared training frame and
# report its accuracy. The "train" / "test" figures in the branch comments are scores
# recorded by the author; the "train" value is the one this cell prints.
# NOTE(review): the "test" figures presumably come from scoring a submission - confirm.
model_choices = ['ada_boost', 'decision_tree', 'dummy_classifier', 'logistic_regression', 'naive_bayes', 'random_forest',
                'voting_classifier']
model_choice = model_choices[6]
print('model: {}'.format(model_choice))
random_state = 1
use_scaler = False
if model_choice not in model_choices:
    raise NotImplementedError('unknown model choice: {!r}'.format(model_choice))
elif model_choice == 'ada_boost':
    # train: 0.7935 test: 0.7863
    model = AdaBoostClassifier(random_state=random_state)
elif model_choice == 'decision_tree':
    # train: 0.9580 test: 0.7278 (gini)
    # train: 0.9580 test: 0.7447 (entropy)
    criterion = ['entropy', 'gini'][0]
    model = DecisionTreeClassifier(criterion=criterion, random_state=random_state)
elif model_choice == 'dummy_classifier':
    # train: 0.5036 test: 0.5069
    model = DummyClassifier(random_state=random_state, strategy='most_frequent')
elif model_choice == 'logistic_regression':
    # train: 0.7921 # test: 0.7912
    fit_intercept = True
    max_iter = 4000
    # "no penalty" is spelled None; the 'none' string is deprecated in the scikit-learn
    # releases that provide the 'newton-cholesky' solver listed below
    penalty = [None, 'l1', 'l2', 'elasticnet'][2]
    solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'][0]
    model = LogisticRegression(fit_intercept=fit_intercept, max_iter=max_iter, penalty=penalty, random_state=random_state,
                              solver=solver, verbose=0)
elif model_choice == 'naive_bayes':
    # train: 0.7616 test: 0.7662
    model = GaussianNB()
elif model_choice == 'random_forest':
    # train: 0.9579 test: 0.7870
    model = RandomForestClassifier(random_state=random_state)
elif model_choice == 'voting_classifier':
    # train: 0.8340 test: 0.7849 (3 models)
    # train: 0.8472 test: 0.7942 (5 models)
    # NOTE(review): the logistic regression member keeps the default max_iter, unlike the
    # standalone branch above (4000) - check for convergence warnings before relying on it.
    estimators = [
        ('ada_boost', AdaBoostClassifier(random_state=random_state)),
        ('decision_tree', DecisionTreeClassifier(criterion='entropy', random_state=random_state)),
        ('logistic_regression', LogisticRegression(random_state=random_state)),
        ('naive_bayes', GaussianNB()),
        ('random_forest', RandomForestClassifier(random_state=random_state)),
    ]
    model = VotingClassifier(estimators=estimators, voting='hard', n_jobs=len(estimators))
else:
    # a name was added to model_choices without a matching branch; fail here rather
    # than with a NameError on `model` further down
    raise NotImplementedError('no model defined for choice: {!r}'.format(model_choice))
if use_scaler:
    # standardise the features before they reach the chosen model
    model = make_pipeline(StandardScaler(), model)

model.fit(X=prepared_df, y=train_df['Transported'])
# get the accuracy score; it is computed on the same rows the model was fitted on,
# so it is a training accuracy rather than a held-out estimate
score = accuracy_score(y_true=train_df['Transported'], y_pred=model.predict(X=prepared_df))
print('{}: score: {:0.4f}'.format(now(), score))

model: voting_classifier
2022-12-12T19:10:02.819391+00:00: score: 0.8472


In [7]:
# now predict
test_fname = '../input/spaceship-titanic/test.csv'
test_df = read_csv(filepath_or_buffer=test_fname)
print('{}: test shape: {}'.format(now(), test_df.shape))
# prepare() one-hot encodes each frame on its own, so a category that is absent from (or
# only present in) the test data would change the column set. Align to the columns the
# model was fitted on: missing dummies become all-zero columns, unseen ones are dropped.
prepared_test_df = prepare(input_df=test_df).reindex(columns=prepared_df.columns, fill_value=0)
# comparing with 1 stores the predictions as a boolean column in one vectorised step
test_df['Transported'] = model.predict(X=prepared_test_df) == 1


2022-12-12T19:10:02.856125+00:00: test shape: (4277, 13)


In [8]:
# Write the passenger id and predicted label as a two-column CSV without the index.
result_fname = './submission.csv'
result_columns = ['PassengerId', 'Transported']
submission_df = test_df.loc[:, result_columns]
submission_df.to_csv(result_fname, index=False)
print('{}: wrote submission to {}'.format(now(), result_fname))

2022-12-12T19:10:03.101070+00:00: wrote submission to ./submission.csv
