In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [38]:
training_df = pd.read_csv("data/train.csv")

In [39]:
y_train = training_df["Transported"]

In [81]:
def train_and_evaluate_model(model, X_train):
    print(model)
    scores = cross_validate(model, X_train, y_train, return_train_score=True, scoring=['accuracy'], return_estimator=True)
    for key, values in scores.items():
        if key not in ['fit_time', 'score_time', 'estimator']:
            values = [round(v, 3) for v in values]
            print(key, values)
    if 'estimator' in scores:
        estimators = scores['estimator']
        first_estimator = estimators[0]
        if "feature_importances_" in dir(first_estimator):
            f = list(zip(X_train.columns, first_estimator.feature_importances_))
            f.sort(key=lambda pair: -pair[1])
            print(f)

In [70]:
p = list(zip([1,3,2,7,5,8] , [3,1,6,4,8,2]))
p.sort(key=lambda pair: -pair[1])
p

[(5, 8), (2, 6), (7, 4), (1, 3), (8, 2), (3, 1)]

In [41]:
def train_and_evaluate_models(models, X_train):
    for model in models:
        train_and_evaluate_model(model, X_train)
        print("\n")

In [42]:
def calculate_features_and_train_and_evaluate_models(calculate_features, models):
    X_train = calculate_features(training_df)
    train_and_evaluate_models(models, X_train)

In [43]:
BASIC_MODELS = [
    linear_model.LogisticRegression(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

In [46]:
ADVANCED_MODELS = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.6500000000000001, min_samples_leaf=18, min_samples_split=11, n_estimators=100),
    XGBClassifier(learning_rate=0.5, max_depth=3, min_child_weight=15, n_estimators=100, n_jobs=1, subsample=0.7000000000000001, verbosity=0),
]

# Very simple models

In [11]:
def calculate_features1(df: pd.DataFrame) -> pd.DataFrame:
    df_copy = df.copy()

    df_copy.Age = df_copy.Age.fillna(28)
    df_copy["IsCryoSleep"] = df_copy["CryoSleep"].apply(lambda x: 1 if x else 0)

    return df_copy[["IsCryoSleep", "Age"]]

In [82]:
calculate_features_and_train_and_evaluate_models(calculate_features1, BASIC_MODELS)

LogisticRegression()
test_accuracy [0.706, 0.717, 0.712, 0.724, 0.729]
train_accuracy [0.721, 0.718, 0.719, 0.716, 0.715]


RandomForestClassifier()
test_accuracy [0.721, 0.731, 0.718, 0.745, 0.739]
train_accuracy [0.739, 0.737, 0.74, 0.733, 0.733]
[('IsCryoSleep', 0.7855990697139554), ('Age', 0.21440093028604465)]


GradientBoostingClassifier()
test_accuracy [0.724, 0.732, 0.722, 0.751, 0.746]
train_accuracy [0.739, 0.737, 0.739, 0.732, 0.733]
[('IsCryoSleep', 0.8382136911592668), ('Age', 0.16178630884073325)]




In [45]:
calculate_features_and_train_and_evaluate_models(calculate_features1, ADVANCED_MODELS)

RandomForestClassifier(criterion='entropy', max_features=0.6500000000000001,
                       min_samples_leaf=18, min_samples_split=11)
fit_time [0.43084693 0.30322623 0.30119348 0.31019187 0.29518342]
score_time [0.03490591 0.02689767 0.02792621 0.02792549 0.02795696]
test_accuracy [0.72, 0.73, 0.72, 0.74, 0.75]
train_accuracy [0.74, 0.74, 0.74, 0.73, 0.73]


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.5, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=15, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=1,
              num_paral

# Add more features

In [47]:
def calculate_features2(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28)

    def map_cryo_sleep(df):
        df["IsCryoSleep"] = df["CryoSleep"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cryo_sleep(df_copy)
    map_cabin(df_copy)

    mapped_features = [
        "IsCryoSleep",
        "Age",
        "IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    dummy_features = [
        "HomePlanet",
        "Destination",
        "Cabin_deck",
    ]
    features = mapped_features + dummy_features

    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [49]:
calculate_features_and_train_and_evaluate_models(calculate_features2, ADVANCED_MODELS)

RandomForestClassifier()
fit_time [0.67270708 0.56530857 0.54750323 0.55328465 0.51661706]
score_time [0.04185009 0.04187584 0.04387999 0.04287863 0.04088998]
test_accuracy [0.69, 0.68, 0.68, 0.7, 0.7]
train_accuracy [0.85, 0.85, 0.85, 0.84, 0.85]


GradientBoostingClassifier()
fit_time [0.49377131 0.47273517 0.47995996 0.45977139 0.4667778 ]
score_time [0.00596189 0.00398469 0.00398898 0.00398779 0.00398946]
test_accuracy [0.73, 0.75, 0.73, 0.75, 0.74]
train_accuracy [0.75, 0.75, 0.75, 0.75, 0.75]


RandomForestClassifier(criterion='entropy', max_features=0.6500000000000001,
                       min_samples_leaf=18, min_samples_split=11)
fit_time [0.56003404 0.55648422 0.55056548 0.5634644  0.54949999]
score_time [0.02795792 0.03494453 0.02790189 0.02895117 0.02994823]
test_accuracy [0.73, 0.75, 0.74, 0.76, 0.74]
train_accuracy [0.76, 0.75, 0.76, 0.75, 0.75]


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
  

# Add number features as categories

In [51]:
def calculate_features3(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28)

    def map_cryo_sleep(df):
        df["IsCryoSleep"] = df["CryoSleep"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    def map_numbers_to_categories(df, column: str, buckets: int):
        df[column].fillna(np.mean(df[column]), inplace=True)
        df[column] = pd.qcut(df[column], buckets, labels=False, duplicates="drop")

    numbers_to_categories_features = [
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    mapped_features = [
        "IsCryoSleep",
        "Age",
        "IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    dummy_features = [
        "HomePlanet",
        "Destination",
        "Cabin_deck",
    ]
    features = mapped_features + dummy_features + numbers_to_categories_features

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cryo_sleep(df_copy)
    map_cabin(df_copy)
    for c in numbers_to_categories_features:
        map_numbers_to_categories(df_copy, c, 10)

    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [52]:
calculate_features_and_train_and_evaluate_models(calculate_features3, ADVANCED_MODELS)

RandomForestClassifier()
fit_time [0.70471215 0.58244205 0.59763241 0.58440852 0.58543968]
score_time [0.04986596 0.04089117 0.0409162  0.04188871 0.04288363]
test_accuracy [0.76, 0.76, 0.76, 0.77, 0.78]
train_accuracy [0.96, 0.96, 0.95, 0.95, 0.95]


GradientBoostingClassifier()
fit_time [0.62336898 0.60937524 0.6163795  0.61136031 0.65728045]
score_time [0.00497746 0.00598145 0.00498891 0.00398874 0.00697637]
test_accuracy [0.78, 0.79, 0.78, 0.8, 0.79]
train_accuracy [0.81, 0.8, 0.8, 0.8, 0.8]


RandomForestClassifier(criterion='entropy', max_features=0.6500000000000001,
                       min_samples_leaf=18, min_samples_split=11)
fit_time [0.83077669 0.70607471 0.74004769 0.72306848 0.75301957]
score_time [0.03390694 0.02792549 0.02792645 0.03091788 0.02889109]
test_accuracy [0.77, 0.79, 0.79, 0.81, 0.8]
train_accuracy [0.81, 0.81, 0.81, 0.81, 0.81]


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
      

# Use numbers instead of categories

In [55]:
def calculate_features4(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28)

    def map_cryo_sleep(df):
        df["IsCryoSleep"] = df["CryoSleep"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    def map_numbers(df, column: str):
        df[column].fillna(np.mean(df[column]), inplace=True)

    numbers_features = [
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    mapped_features = [
        "IsCryoSleep",
        "Age",
        "IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    dummy_features = [
        "HomePlanet",
        "Destination",
        "Cabin_deck",
    ]
    features = mapped_features + dummy_features + numbers_features

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cryo_sleep(df_copy)
    map_cabin(df_copy)
    for c in numbers_features:
        map_numbers(df_copy, c)

    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [56]:
calculate_features_and_train_and_evaluate_models(calculate_features4, ADVANCED_MODELS)

RandomForestClassifier()
fit_time [0.81299877 0.64029837 0.63933039 0.63727593 0.64524126]
score_time [0.03889012 0.04089069 0.03789902 0.04087806 0.037925  ]
test_accuracy [0.78, 0.79, 0.8, 0.79, 0.79]
train_accuracy [0.96, 0.96, 0.96, 0.96, 0.96]


GradientBoostingClassifier()
fit_time [0.83476496 0.81204486 0.80892754 0.82086062 0.83174539]
score_time [0.00694895 0.0069828  0.00595808 0.00501108 0.00498652]
test_accuracy [0.79, 0.79, 0.8, 0.82, 0.8]
train_accuracy [0.82, 0.82, 0.82, 0.81, 0.82]


RandomForestClassifier(criterion='entropy', max_features=0.6500000000000001,
                       min_samples_leaf=18, min_samples_split=11)
fit_time [1.17712474 1.17086172 1.15787101 1.20483208 1.13798118]
score_time [0.03094435 0.02889729 0.02994394 0.03192019 0.02892303]
test_accuracy [0.78, 0.79, 0.8, 0.81, 0.8]
train_accuracy [0.83, 0.83, 0.82, 0.82, 0.83]


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
     

# Fill missing values as 0 for numbers

In [91]:
def calculate_features5(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28)

    def map_cryo_sleep(df):
        df["IsCryoSleep"] = df["CryoSleep"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    def map_numbers(df, column: str):
        df[column].fillna(0, inplace=True)

    numbers_features = [
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    mapped_features = [
        "IsCryoSleep",
        "Age",
        "IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    dummy_features = [
        "HomePlanet",
        "Destination",
        "Cabin_deck",
    ]
    features = mapped_features + dummy_features + numbers_features

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cryo_sleep(df_copy)
    map_cabin(df_copy)
    for c in numbers_features:
        map_numbers(df_copy, c)

    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [92]:
calculate_features_and_train_and_evaluate_models(calculate_features5, ADVANCED_MODELS)

RandomForestClassifier()
test_accuracy [0.774, 0.781, 0.796, 0.793, 0.796]
train_accuracy [0.956, 0.956, 0.95, 0.95, 0.952]
[('Age', 0.17657344504395878), ('Spa', 0.1274962322106527), ('RoomService', 0.11267681382008955), ('VRDeck', 0.11001930587929913), ('IsCryoSleep', 0.10147819698811594), ('FoodCourt', 0.0992217493007079), ('ShoppingMall', 0.08987194086284706), ('Cabin_side_is_port', 0.02692721597253215), ('HomePlanet_Earth', 0.019609862035034554), ('HomePlanet_Europa', 0.017344347351772756), ('Cabin_deck_E', 0.01639477289433574), ('Cabin_deck_F', 0.01559463791116617), ('Cabin_deck_G', 0.014263476793453644), ('Destination_TRAPPIST-1e', 0.012690577829650858), ('HomePlanet_Mars', 0.012122174858481794), ('Destination_55 Cancri e', 0.010076206198743744), ('IsVIP', 0.007539419007381119), ('Cabin_deck_B', 0.007452830700262991), ('Destination_PSO J318.5-22', 0.0072891586044179476), ('Cabin_deck_C', 0.006060200232855393), ('Cabin_deck_D', 0.004253913834580959), ('Cabin_deck_UNKNOWN', 0.0027

# Remove some features

In [87]:
def calculate_features6(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28)

    def map_cryo_sleep(df):
        df["IsCryoSleep"] = df["CryoSleep"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    def map_numbers(df, column: str):
        df[column].fillna(0, inplace=True)

    numbers_features = [
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    mapped_features = [
        "IsCryoSleep",
        "Age",
        #"IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    dummy_features = [
        # "HomePlanet",
        #"Destination",
        "Cabin_deck",
    ]
    features = mapped_features + dummy_features + numbers_features

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cryo_sleep(df_copy)
    map_cabin(df_copy)
    for c in numbers_features:
        map_numbers(df_copy, c)

    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [88]:
calculate_features_and_train_and_evaluate_models(calculate_features6, ADVANCED_MODELS)

RandomForestClassifier()
test_accuracy [0.773, 0.777, 0.79, 0.799, 0.785]
train_accuracy [0.937, 0.939, 0.932, 0.931, 0.936]
[('Age', 0.16055714833445697), ('Spa', 0.14272857548569057), ('RoomService', 0.14057050965429477), ('VRDeck', 0.128020366459228), ('FoodCourt', 0.11324327661702899), ('ShoppingMall', 0.09784455635217958), ('IsCryoSleep', 0.0969832061224997), ('Cabin_deck_G', 0.026954092879298557), ('Cabin_side_is_port', 0.025148250098383746), ('Cabin_deck_F', 0.015776210252944796), ('Cabin_deck_E', 0.015399171480460218), ('Cabin_deck_C', 0.012629033701270309), ('Cabin_deck_B', 0.011854902521568584), ('Cabin_deck_D', 0.005357099967368431), ('Cabin_deck_A', 0.0034910447903985214), ('Cabin_deck_UNKNOWN', 0.0033911176655547414), ('Cabin_deck_T', 5.143761737355085e-05)]


GradientBoostingClassifier()
test_accuracy [0.78, 0.792, 0.794, 0.812, 0.796]
train_accuracy [0.818, 0.817, 0.815, 0.811, 0.817]
[('IsCryoSleep', 0.38357926622699845), ('Spa', 0.1233323139929885), ('VRDeck', 0.119456