In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import missingno
import matplotlib.pyplot as plt
import eli5
import catboost

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data preparation

In [None]:
# read the train, test and sample-submission CSV files
sample_submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
test_data = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

# report the (rows, columns) shape of each table on one line
loaded_shapes = (
    'train_data', train_data.shape,
    'test_data', test_data.shape,
    'sample_submission', sample_submission.shape,
)
print(*loaded_shapes)

In [None]:
# Stack train and test rows so every column is processed in the same way.
# Both CSVs are read with their own default row labels, so a plain concat would
# repeat those labels; ignore_index=True gives every row a unique label and keeps
# label-based access and alignment unambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)

display(data.head())

data.info()

In [None]:
# visualise where values are missing across the combined train + test table
missingno.matrix(data);

In [None]:
def fill_with_random(series: pd.Series):
    """Return a copy of ``series`` whose missing entries are replaced by random existing values.

    Each missing entry (NaN or None) is replaced by one value drawn uniformly from the
    non-missing entries of the same series, so frequent values are drawn more often.
    The generator is re-created with a fixed seed on every call, which makes the
    result reproducible for a given input.

    Parameters
    ----------
    series : pd.Series
        Values to complete. The input object itself is not modified.

    Returns
    -------
    pd.Series
        New series with the same index. If there is nothing to sample from
        (empty or entirely missing input) an unchanged copy is returned.
    """
    # Build the candidate pool once; rebuilding it for every element made the
    # fill quadratic in the series length.
    pool = series.dropna().values

    # rng.choice raises on an empty pool, so hand back the data untouched instead.
    if len(pool) == 0:
        return series.copy()

    rng = np.random.default_rng(seed=42)

    def fill(value):
        # NaN is the only value that is not equal to itself; None is checked explicitly.
        if value != value or value is None:
            return rng.choice(pool)
        return value

    return series.apply(fill)

In [None]:
def fill_group_forward(data, column, group):
    """Fill gaps in ``column`` from the first known value inside the same ``group``.

    Rows whose ``column`` value is missing receive the first non-missing value found
    among the rows sharing their ``group`` key. Anything still missing afterwards
    (for example a group with no known value at all) is completed by
    ``fill_with_random``.

    Parameters
    ----------
    data : pd.DataFrame
        Table to complete. It is modified in place and also returned.
    column : str
        Name of the column to fill.
    group : str
        Name of the column holding the group key.

    Returns
    -------
    pd.DataFrame
        The same ``data`` object with ``column`` filled.
    """
    # First non-missing value of each group, broadcast back onto every row of that group.
    group_first = data.groupby(group)[column].transform('first')

    # isna() recognises NaN as well as None. The previous `is None` test never matched
    # NaN, so those gaps skipped the group value and were only filled at random.
    missing = data[column].isna().to_numpy()

    if missing.any():
        filled = data[column].copy()
        # Positional assignment: avoids label alignment, so repeated row labels are harmless.
        filled.iloc[missing] = group_first.to_numpy()[missing]
        data[column] = filled

    data[column] = fill_with_random(data[column])

    return data

---
**Passenger ID**

In [None]:
# PassengerId holds two integer fields separated by '_': the group id and the place in the group
passenger_id_parts = data['PassengerId'].map(lambda passenger_id: passenger_id.split('_'))
data['group_id'] = passenger_id_parts.str[0].astype(int)
data['num_in_group'] = passenger_id_parts.str[1].astype(int)

print('Groups total:', data['group_id'].nunique())
print('Persons in group:')
sns.histplot(data['num_in_group']);

---
**Home Planet**

In [None]:
# show the value counts (missing included), then fill the gaps with fill_group_forward keyed on group_id
home_planet = data['HomePlanet']
print(home_planet.value_counts(dropna=False), home_planet.shape)
data = fill_group_forward(data, column='HomePlanet', group='group_id')
sns.histplot(data['HomePlanet']);

---
**Cabin**

In [None]:
# fill missing cabins with fill_group_forward, using group_id as the group key
data = fill_group_forward(data, column='Cabin', group='group_id')

In [None]:
def split_cabin_code(x: str, n):
    """Return the ``n``-th '/'-separated segment of a cabin code.

    Parameters
    ----------
    x : str
        Cabin code made of '/'-separated segments. Any non-string value
        (e.g. NaN or None for a missing cabin) is treated as unparseable.
    n : int
        Position of the wanted segment; negative positions count from the end.

    Returns
    -------
    str or None
        The requested segment, or None when ``x`` is not a string or has no such segment.
    """
    # Explicit type check instead of a bare `except`, which would also hide unrelated errors.
    if not isinstance(x, str):
        return None

    parts = x.split('/')

    # A code with too few segments is as unparseable as a missing one.
    if not -len(parts) <= n < len(parts):
        return None

    return parts[n]


# extract the three '/'-separated parts of the cabin code into their own columns
for part_position, part_column in enumerate(['cabin_deck', 'cabin_num', 'cabin_side']):
    data[part_column] = data['Cabin'].apply(
        lambda cabin, part_position=part_position: split_cabin_code(cabin, part_position))
# the middle part is stored as an integer
data['cabin_num'] = data['cabin_num'].astype(int)

# one histogram per extracted column, side by side
fig, axs = plt.subplots(1,3, figsize=(15,5))
deck_ax, num_ax, side_ax = axs
sns.histplot(data['cabin_deck'], ax=deck_ax)
sns.histplot(data['cabin_num'], ax=num_ax)
sns.histplot(data['cabin_side'], ax=side_ax)

---
**CryoSleep**


In [None]:
# mean CryoSleep value for every combination of deck and ship side
sleep_share = data.pivot_table(index='cabin_deck', columns='cabin_side',
                               values='CryoSleep', aggfunc='mean')
sns.heatmap(sleep_share, annot=True);

In [None]:
# show the value counts (missing included), then fill the gaps with fill_group_forward
cryosleep_counts = data['CryoSleep'].value_counts(dropna=False)
print(cryosleep_counts)
data = fill_group_forward(data, group='group_id', column='CryoSleep')
sns.histplot(data['CryoSleep']);

---
**Destination**

In [None]:
# show the value counts (missing included), then fill the gaps with fill_group_forward
destination_counts = data['Destination'].value_counts(dropna=False)
print(destination_counts)
data = fill_group_forward(data, group='group_id', column='Destination')
sns.histplot(data['Destination']);

---
**Age**

In [None]:
# show the age counts (missing included), then replace missing ages with randomly drawn known ages
age_counts = data['Age'].value_counts(dropna=False)
print(age_counts)
data['Age'] = fill_with_random(data['Age'])
sns.histplot(data['Age']);

---
**VIP**

In [None]:
# VIP value counts, missing values included
print(data['VIP'].value_counts(dropna=False))

# mean VIP value per deck
deck_to_vip = data.pivot_table(index='cabin_deck', values='VIP', aggfunc='mean')
sns.heatmap(deck_to_vip)
plt.show()

# fill missing VIP values deck by deck, sampling only from the values seen on that deck
for deck in data['cabin_deck'].unique():
    on_deck = data['cabin_deck'] == deck
    data.loc[on_deck, 'VIP'] = fill_with_random(data.loc[on_deck, 'VIP'])

data['VIP'] = data['VIP'].astype(bool)

---
**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck**

`RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

In [None]:
# one thin boxplot per amenity, restricted to the passengers with a positive amount
for amenity in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    fig = plt.figure(figsize=(15,0.5))
    positive_amounts = data.loc[data[amenity] > 0, [amenity]]
    sns.boxplot(positive_amounts, x=amenity)

In [None]:
# check how the mean amenity bills vary with deck, VIP status and age
bill_pivot_options = dict(values=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
                          aggfunc='mean')

fig, axs = plt.subplots(1,3, figsize=(15,5))
deck_ax, vip_ax, age_ax = axs

deck_to_bill = data.pivot_table(index='cabin_deck', **bill_pivot_options)
sns.heatmap(deck_to_bill, annot=True, fmt=".0f", ax=deck_ax)

vip_to_bill = data.pivot_table(index='VIP', **bill_pivot_options)
sns.heatmap(vip_to_bill, annot=True, fmt=".0f", ax=vip_ax)

age_to_bill = data.pivot_table(index='Age', **bill_pivot_options)
sns.heatmap(age_to_bill, ax=age_ax)
plt.show()

In [None]:
# a missing amenity bill is recorded as 0
data = data.fillna(
    {amenity: 0 for amenity in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']}
)

---
**Name**

In [None]:
def split_full_name(x: str, n):
    """Return the ``n``-th space-separated word of a full name.

    Parameters
    ----------
    x : str
        Full name. Any non-string value (e.g. NaN or None for a missing name)
        is treated as unparseable.
    n : int
        Position of the wanted word; negative positions count from the end.

    Returns
    -------
    str or None
        The requested word, or None when ``x`` is not a string or has no such word.
    """
    # Explicit type check instead of a bare `except`, which would also hide unrelated errors.
    if not isinstance(x, str):
        return None

    words = x.split(' ')

    # A name with too few words has no n-th part to report.
    if not -len(words) <= n < len(words):
        return None

    return words[n]

# extract the first and second word of the full name into their own columns
for word_position, name_column in enumerate(['first_name', 'last_name']):
    data[name_column] = data['Name'].apply(
        lambda full_name, word_position=word_position: split_full_name(full_name, word_position))

# fill gaps: first names with random known values, last names via fill_group_forward on group_id
data['first_name'] = fill_with_random(data['first_name'])
data = fill_group_forward(data, group='group_id', column='last_name')

**Finalize**

In [None]:
# summary of column dtypes and non-null counts after the fills above
data.info()

In [None]:
# use PassengerId as the row index from here on
data = data.set_index('PassengerId')

## Make baseline submission

In [None]:
def split_data(data, target = 'Transported'):
    """Split the combined table into train features, train labels and submission features.

    Columns of dtype ``object`` are removed from the features. Rows with a known
    ``target`` form the training set; rows where ``target`` is missing form the
    submission set. ``data`` itself is left untouched.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test)`` where ``y_train`` is cast to int.
    """
    labels = data[target]
    features = data.drop(columns=target)

    # keep only the feature columns that are not of object dtype
    kept_columns = [name for name in features.columns if features[name].dtype != 'object']
    features = features[kept_columns]

    # rows with a label are for training, the rest are to be predicted
    has_label = labels.notna()
    x_train = features[has_label]
    y_train = labels[has_label].astype(int)
    x_test = features[~has_label]

    return x_train, y_train, x_test

In [None]:
def train_and_predict(model, data, 
                      new_feature_names=None, folds=10, scoring='accuracy',
                      top_n_features_to_show=30, submission_file_name='submission.csv', silent=False):
    """Cross-validate ``model``, refit it on all training rows and write a submission file.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` / ``predict`` that ``cross_val_score`` and ``eli5`` can handle.
    data : tuple
        ``(x_train, y_train, x_test)``.
    new_feature_names : list of str, optional
        When given, the last ``len(new_feature_names)`` entries of ``model.coef_`` are
        printed next to these names (best effort; skipped for models without ``coef_``).
    folds : int
        Number of stratified, shuffled cross-validation folds.
    scoring : str
        Scoring name forwarded to ``cross_val_score``.
    top_n_features_to_show : int
        Number of features shown in the eli5 weight table.
    submission_file_name : str
        Path of the CSV file to write.
    silent : bool
        When True, the CV report and the eli5 weight table are not shown.

    Returns
    -------
    Per-fold scores as returned by ``cross_val_score``.

    Notes
    -----
    Reads the module-level ``sample_submission`` frame to fix the row order of the output.
    """
    (x_train, y_train, x_test) = data
    
    cv = StratifiedKFold(folds, shuffle=True, random_state=42)
    
    # make cross-validation
    cv_scores = cross_val_score(model, x_train, y_train, cv=cv, scoring=scoring, n_jobs=4)
    if not silent: print('CV scores', cv_scores)
    if not silent: print(f'CV mean:{cv_scores.mean():.4f}, CV std:{cv_scores.std():.4f}')
    
    # train model on the full training set
    model.fit(x_train, y_train)

    # show feature importances
    if not silent:
        display(eli5.show_weights(estimator=model, 
                  feature_names=x_train.columns.to_list(), top=top_n_features_to_show))
 
    # print new features stats
    if new_feature_names:
        print('New feature weights:')
        try:
            print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))
        except (AttributeError, ValueError) as error:
            # Best effort only: the model may expose no coef_ (AttributeError) or too few
            # coefficients for the given names (ValueError). Anything else is a real bug
            # and must surface instead of being swallowed.
            print('  unavailable:', error)
        
    # make submission
    preds = model.predict(x_test)
    preds = pd.DataFrame(preds, index=x_test.index).astype(bool)
    preds.columns=['Transported']
    
    # save submission file, keeping the row order of sample_submission
    submission = sample_submission.drop(columns='Transported').\
        merge(preds.reset_index(), how='left', on='PassengerId')
    
    submission.to_csv(submission_file_name, index=False)
    
    return cv_scores

In [None]:
# baseline: CatBoost on the features that exist so far
catreg = catboost.CatBoostClassifier(verbose=False, random_state=42)

baseline_split = split_data(data)
cv_scores1 = train_and_predict(catreg, baseline_split, submission_file_name='submission.csv')

## Feature engineering

In [None]:
def compare_cv_scores(cv_score_old, cv_score_new):
    """Print a fold-by-fold and aggregate comparison of two arrays of CV scores.

    Reports which folds improved, then whether the mean score and the standard
    deviation went up or down, together with the old value, new value and difference.
    A tie is reported as "decreased" in both summaries.
    """
    print('\nFolds compare:', cv_score_new > cv_score_old, end='\n\n')

    # mean score: higher is better
    old_mean, new_mean = cv_score_old.mean(), cv_score_new.mean()
    mean_verdict = 'Score increased \t[GOOD]' if new_mean > old_mean else 'Score decreased \t[BAD]'
    print(mean_verdict, end='')
    print(f'\t{old_mean:.4f} -> {new_mean:.4f}',
          f'{new_mean - old_mean:.4f}')

    # spread across folds: lower is better
    old_std, new_std = cv_score_old.std(), cv_score_new.std()
    std_verdict = 'Variation increased \t[BAD]' if new_std > old_std else 'Variation decreased \t[GOOD]'
    print(std_verdict, end='')
    print(f'\t{old_std:.4f} -> {new_std:.4f}',
          f'{new_std - old_std:.4f}')

In [None]:
def add_homeplanet_one(data):
    """Return ``data`` joined with one-hot ``home_*`` columns built from ``HomePlanet``.

    The first category is dropped (``drop_first=True``); the input frame is not modified.
    """
    planet_dummies = pd.get_dummies(data['HomePlanet'], prefix='home', drop_first=True)
    return data.join(planet_dummies)

# add the one-hot home planet columns to the working table
data = add_homeplanet_one(data)

In [None]:
def add_cabin_side(data):
    """Encode ``cabin_side`` as an integer in place: 'S' becomes 1 and 'P' becomes 0.

    ``data`` is modified and returned. The column must hold only 'S' / 'P' values,
    otherwise the cast to int fails on the unmapped entries.
    """
    side_codes = {'S': 1, 'P': 0}
    encoded_side = data['cabin_side'].map(side_codes)
    data['cabin_side'] = encoded_side.astype(int)
    return data

# replace the cabin side letters with their integer codes
data = add_cabin_side(data)

In [None]:
def add_group_size(data):
    """Return ``data`` with a ``group_size`` category derived from ``group_id``.

    The number of rows sharing each ``group_id`` is bucketed into
    1 (single), 2 (couple) or 3 (larger group). ``data`` must be indexed by
    ``PassengerId``; the input frame is not modified.
    """
    # number of rows per group_id
    members_per_group = data['group_id'].value_counts().reset_index()
    members_per_group.columns = ['group_id', 'group_size']

    # attach the head-count to every passenger, then restore the PassengerId index
    with_size = (
        data.reset_index()
        .merge(members_per_group, how='left', on='group_id')
        .set_index('PassengerId')
    )

    # bucket the head-count: 1 = single, 2 = couple, 3 = anything larger
    with_size['group_size'] = (
        with_size['group_size']
        .apply(lambda members: 1 if members <= 1 else (2 if members <= 2 else 3))
        .astype(int)
    )

    return with_size

# add the bucketed group size (1 / 2 / 3) to the working table
data = add_group_size(data)

In [None]:
def add_deck_bill(data):
    """Add ``deck_mean_bill``: a fixed per-deck bill figure looked up from ``cabin_deck``.

    ``data`` is modified in place and returned. Every deck letter must be one of the
    keys below, otherwise the cast to int fails on the unmapped entries.
    """
    # mean bill on a deck (hard-coded figures; presumably taken from an earlier
    # exploration of bills per deck -- TODO confirm their origin)
    mean_bill_by_deck = {
        'A': 3331, 'B': 2927, 'C': 3937, 'D': 2296,
        'E': 1343, 'F': 1001, 'G': 408, 'T': 5916,
    }
    deck_bill = data['cabin_deck'].map(mean_bill_by_deck)
    data['deck_mean_bill'] = deck_bill.astype(int)

    return data

# add the per-deck bill figure to the working table
data = add_deck_bill(data)

In [None]:
def add_weighted_bills(data):
    """Add each amenity's share of the passenger's total bill and a standardised total.

    For every amenity column a ``<name>_w`` column holds that amenity's fraction of the
    passenger's total spend (0 for passengers who spent nothing). ``total_bill`` holds
    the whole-number total, standardised to zero mean and unit standard deviation.

    ``data`` is modified in place and returned.
    """
    money_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    total = data[money_cols].sum(axis=1)

    # Created before the share columns so the resulting column order stays the same.
    data['total_bill'] = total.astype(int)

    # Passengers with no spend get a small stand-in denominator so their shares are 0.
    # It has to stay a float: casting it to int turned the 0.1 guard back into 0 and
    # produced 0/0 = NaN shares for every zero-spend passenger.
    denominator = total.where(total != 0, 0.1)
    for col in money_cols:
        data[col+'_w'] = data[col]/denominator

    # standardise the total: subtract the mean, divide by the sample standard deviation
    data['total_bill'] = (data['total_bill'] - data['total_bill'].mean())/ data['total_bill'].std()

    return data

# add the per-amenity bill shares and the standardised total bill to the working table
data = add_weighted_bills(data)

## Submission

In [None]:
# retrain on the engineered features, overwrite the submission and compare with the previous run
engineered_split = split_data(data)
cv_scores2 = train_and_predict(catreg, engineered_split, submission_file_name='submission.csv')
compare_cv_scores(cv_scores1, cv_scores2)

# the new scores become the reference for the next comparison
cv_scores1 = cv_scores2

<div class="alert alert-block alert-success">

**0.80827** Public score
    
    Folds compare: [ True  True  True  True False  True  True  True  True  True]

    Score increased        [GOOD]  0.7996 -> 0.8107  0.0110
    Variation decreased    [GOOD]  0.0166 -> 0.0138 -0.0028