In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

data_path = 'train.csv'
X = pd.read_csv(data_path)
y = X.Transported
# X_test = pd.read_csv('../input/spaceship-titanic/test.csv')
X = X.drop('Transported', axis=1)

# Merge the splits so we can process them together
df = X

In [4]:
df[['group','number']] = (
    df['PassengerId']
    .str
    .split('_', expand=True)
)
df[['deck','num','side']] = (
    df['Cabin']
    .str
    .split('/', expand=True)
)
# df[['F_Name','L_Name']] = (
#     df['Name']
#     .str
#     .split(' ', expand=True)
# )
df['num_in_group'] = df.groupby('group').group.transform("count")
# df['group_age'] = df.groupby('group').Age.transform("mean")

In [5]:
df = df.drop('Name', axis=1).drop('group', axis=1).drop('Cabin', axis=1)

In [6]:
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amenities'] = df[amenities].gt(0).sum(axis=1)
df['in_group'] = df['num_in_group'].gt(1)
df[['number','num']] = df[['number','num']].astype('float64')
df['sum_purchase'] = df[amenities].sum(axis=1)
df['group_mean_purchase'] = df[amenities].sum(axis=1)/df.num_in_group
# df['deck_mean_purchase'] = df.groupby('deck')[amenities].transform("mean").sum(axis=1)
# df['RoomService+'] = df['RoomService'].gt(0)

In [7]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,number,deck,num,side,num_in_group,amenities,in_group,sum_purchase,group_mean_purchase
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,1.0,B,0.0,P,1,0,False,0.0,0.0
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1.0,F,0.0,S,1,5,False,736.0,736.0
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,1.0,A,0.0,S,2,4,True,10383.0,5191.5
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,2.0,A,0.0,S,2,4,True,5176.0,2588.0
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1.0,F,1.0,S,1,5,False,1091.0,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,1.0,A,98.0,P,1,3,False,8536.0,8536.0
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,1.0,G,1499.0,S,1,0,False,0.0,0.0
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1.0,G,1500.0,S,1,2,False,1873.0,1873.0
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,1.0,E,608.0,S,2,3,True,4637.0,2318.5


In [8]:
from sklearn.model_selection import train_test_split

df = df.set_index('PassengerId')
X = X.set_index('PassengerId')
# X_test = X_test.set_index('PassengerId')

df_train = df.loc[X.index, :]
# df_test = df.loc[X_test.index, :]

X_train, X_valid,y_train,y_valid = train_test_split(df_train, y, train_size=0.8, test_size=0.2, random_state=0)

In [9]:
cat_cols = [col for col in df.columns if df[col].dtype == 'object']
num_cols = [cname for cname in df.columns if df[cname].dtype in ['int64','float64']]
# my_cols = cat_cols + num_cols

In [10]:
from sklearn.preprocessing import OrdinalEncoder
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OrdinalEncoder(handle_unknown='error'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [11]:
X_train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,number,deck,num,side,num_in_group,amenities,in_group,sum_purchase,group_mean_purchase
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4558_01,Europa,False,55 Cancri e,54.0,False,0.0,559.0,0.0,15238.0,2799.0,1.0,C,167.0,S,1,3,False,18596.0,18596.000000
6326_01,Earth,False,TRAPPIST-1e,20.0,False,0.0,20.0,1.0,696.0,0.0,1.0,F,1307.0,P,1,3,False,717.0,717.000000
0503_02,Mars,False,TRAPPIST-1e,43.0,False,1821.0,0.0,47.0,29.0,0.0,2.0,F,90.0,S,3,3,True,1897.0,632.333333
4757_01,Earth,False,TRAPPIST-1e,24.0,False,185.0,0.0,476.0,1810.0,53.0,1.0,F,896.0,S,1,4,False,2524.0,2524.000000
9046_01,Europa,True,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,1.0,C,335.0,S,2,0,True,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4655_01,Europa,True,55 Cancri e,32.0,False,0.0,0.0,0.0,0.0,0.0,1.0,B,154.0,P,2,0,True,0.0,0.000000
8423_01,Earth,False,TRAPPIST-1e,22.0,False,0.0,0.0,6.0,0.0,733.0,1.0,F,1620.0,S,1,2,False,739.0,739.000000
5185_01,Mars,False,TRAPPIST-1e,29.0,False,523.0,0.0,21.0,4.0,811.0,1.0,E,330.0,S,1,4,False,1359.0,1359.000000
3499_04,Earth,False,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,4.0,G,574.0,P,4,0,True,0.0,0.000000


In [13]:
from xgboost.sklearn import XGBClassifier

model = XGBClassifier()

In [14]:
from scipy import stats
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold

my_pipeline = Pipeline(steps=[('preprocessor',preprocessor),
                             ('classifier', XGBClassifier(n_jobs=16, n_estimators=20))
                             ])
# Cross validation: 60 iterations with 3 fold CV.

n_features_after_transform = my_pipeline.named_steps.preprocessor.fit_transform(X_train).shape[1]

param_grid = {
    'classifier__max_depth':stats.randint(low=2, high=100),
    'classifier__max_features':stats.randint(low=2, high=n_features_after_transform),
    'classifier__gamma':stats.uniform.rvs(0, 0.25, size=10000),
    'classifier__subsample':stats.uniform.rvs(0.5, 0.5, size=10000),
    'classifier__reg_alpha':stats.uniform.rvs(0.5, 1., size=10000),
    'classifier__reg_lambda':stats.uniform.rvs(0.5, 1., size=10000)
}

rscv = RandomizedSearchCV(
    my_pipeline,
    param_grid,
    n_iter=60,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=3, shuffle=True)

)

rscv.fit(X_train, y_train)


# In[42]:


# Set the tuned best params and beef up the number of estimators.

my_pipeline.set_params(**rscv.best_params_)
my_pipeline.named_steps.classifier.set_params(n_estimators=200)  

Parameters: { "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_features" } might not be

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=0.00045506420710386686,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=4,
              max_features=14, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=200, n_jobs=16,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=1.4345923341839264, reg_lambda=0.7242686985118149,
              scale_pos_weight=None, subsample=0.8183660265502659,
              tree_method=None, validate_parameters=None, verbosity=None)

In [None]:
#pd.DataFrame.from_dict(rscv.cv_results_)

In [15]:
# from sklearn import cross_validation, metrics

preds=rscv.predict(X_valid)

In [16]:
from sklearn.metrics import roc_auc_score
m = roc_auc_score(y_valid,preds)

In [17]:
m

0.8024955422572844

In [None]:
# test_preds =  rscv.predict(df_test)

# # The lines below shows how to save predictions in format used for competition scoring
# # Just uncomment them.

# output = pd.DataFrame({'PassengerId': X_test.index,
#                        'Transported': test_preds})
# output.to_csv('submission.csv', index=False)