In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Silence every warning raised for the rest of the session.
warnings.filterwarnings('ignore')

# Training split: pull the target out, then keep only the feature columns.
data_path = '../input/spaceship-titanic/train.csv'
X = pd.read_csv(data_path)
y = X['Transported']
X = X.drop(columns=['Transported'])

# Test split (read as-is).
X_test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Stack train on top of test so feature engineering is applied to both at once.
df = pd.concat([X, X_test])

In [None]:
# PassengerId is split on '_' into two string columns: group and number.
df[['group', 'number']] = df['PassengerId'].str.split('_', expand=True)

# Cabin is split on '/' into three string columns: deck, num and side.
df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)

# Disabled experiment, kept for reference:
# df[['F_Name','L_Name']] = df['Name'].str.split(' ', expand=True)

# For every row, how many rows share its group value.
df['num_in_group'] = df.groupby('group')['group'].transform('count')

# Disabled experiment, kept for reference:
# df['group_age'] = df.groupby('group').Age.transform("mean")

In [None]:
# Name, group and Cabin are not used as model inputs; remove them in one call.
df = df.drop(columns=['Name', 'group', 'Cabin'])

In [None]:
# Spending columns that are aggregated into the features below.
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Number of amenities on which the passenger has a positive bill.
df['amenities'] = df[amenities].gt(0).sum(axis=1)

# 1 when the passenger shares a group with at least one other row, else 0.
# Stored as int64 rather than bool on purpose: the later dtype-based column
# selection only keeps object / int64 / float64 columns, so a bool column
# would be left out of the model inputs without any error.
df['in_group'] = df['num_in_group'].gt(1).astype('int64')

# The ID / cabin number parts come out of str.split as strings; make them numeric.
df[['number','num']] = df[['number','num']].astype('float64')

# Total bill across all amenities (row-wise sum, missing values skipped).
df['sum_purchase'] = df[amenities].sum(axis=1)

# NOTE(review): despite the name this is the passenger's OWN total divided by
# the group size, not the mean over the group's members -- confirm which one
# is intended before relying on the feature.
df['group_mean_purchase'] = df['sum_purchase'] / df['num_in_group']

# Disabled experiments, kept for reference:
# df['deck_mean_purchase'] = df.groupby('deck')[amenities].transform("mean").sum(axis=1)
# df['RoomService+'] = df['RoomService'].gt(0)

In [None]:
# Preview the first rows only instead of rendering the whole combined frame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Re-key all three frames by PassengerId so rows can be looked up by id.
df, X, X_test = (frame.set_index('PassengerId') for frame in (df, X, X_test))

# Recover the engineered rows that belong to each original split.
df_train = df.loc[X.index]
df_test = df.loc[X_test.index]

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Partition the columns by dtype: object columns are treated as categorical,
# int64 / float64 columns as numeric.  A column of any other dtype (bool, for
# instance) ends up in neither list.
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in df.dtypes.items() if dtype in ('int64', 'float64')]
# my_cols = cat_cols + num_cols

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: replace missing values with the most frequent value,
# then map every category to an integer code.
# The encoding step keeps the name 'onehot' so any existing
# `...__onehot__...` parameter path still resolves, even though the encoder
# is ordinal rather than one-hot.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # A category that was absent from the fitting rows (possible for rare
    # values once the data is split) is encoded as -1 instead of raising at
    # transform/predict time.
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each column list through its transformer; columns in neither list are
# dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [None]:
# Preview the first rows only instead of rendering the whole training split.
X_train.head()

In [None]:
from xgboost.sklearn import XGBClassifier

# Stand-alone XGBoost classifier built with the library's default settings.
# NOTE(review): `model` is not referenced by any other visible cell -- the
# tuning pipeline constructs its own XGBClassifier instance. Confirm whether
# this object is still needed.
model = XGBClassifier()

In [None]:
from scipy import stats
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold

# Preprocessing followed by a small (20-tree) XGBoost model; the tree count is
# kept low so that each of the search fits stays cheap.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', XGBClassifier(n_jobs=16, n_estimators=20))
                              ])

# Cross validation: 60 iterations with 3 fold CV.
#
# Every entry is a frozen scipy distribution, so RandomizedSearchCV draws the
# values itself and the draw is governed by its `random_state` (the previous
# pre-sampled `uniform.rvs(...)` arrays were unseeded and changed on each run).
# `max_features` is a scikit-learn tree parameter that XGBoost does not
# implement, so searching over it had no effect on the model; the XGBoost
# column-subsampling fraction `colsample_bytree` is tuned in its place.
param_grid = {
    'classifier__max_depth': stats.randint(low=2, high=100),
    'classifier__colsample_bytree': stats.uniform(0.5, 0.5),   # [0.5, 1.0]
    'classifier__gamma': stats.uniform(0, 0.25),               # [0.0, 0.25]
    'classifier__subsample': stats.uniform(0.5, 0.5),          # [0.5, 1.0]
    'classifier__reg_alpha': stats.uniform(0.5, 1.),           # [0.5, 1.5]
    'classifier__reg_lambda': stats.uniform(0.5, 1.),          # [0.5, 1.5]
}

# Same seed as the train/validation split so that a fresh
# "Restart & Run All" reproduces both the folds and the sampled candidates.
rscv = RandomizedSearchCV(
    my_pipeline,
    param_grid,
    n_iter=60,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
    random_state=0,
)

rscv.fit(X_train, y_train)


# In[42]:


# Set the tuned best params and beef up the number of estimators.
# NOTE(review): this only reconfigures `my_pipeline`; no visible cell calls
# `my_pipeline.fit(...)` afterwards, and the predictions further down are
# taken from `rscv`, not from this pipeline. Confirm whether the 200-tree
# pipeline is meant to be fitted and used for the final predictions.

my_pipeline.set_params(**rscv.best_params_)
my_pipeline.named_steps.classifier.set_params(n_estimators=200)  

In [None]:
#pd.DataFrame.from_dict(rscv.cv_results_)

In [None]:
# Class predictions for the hold-out rows, produced by the fitted search object.
preds = rscv.predict(X_valid)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the scores rank positives above negatives, so it
# must be fed a continuous score. Using thresholded class labels collapses the
# curve to a single operating point and understates the metric. Column 1 of
# predict_proba is the probability of the positive (second) class.
m = roc_auc_score(y_valid, rscv.predict_proba(X_valid)[:, 1])

In [None]:
# Display the hold-out ROC AUC score stored in `m`.
m

In [None]:
# Predict the test rows. Depending on the installed XGBoost version the
# classifier may hand back 0/1 integers instead of the original label values,
# so the predictions are cast back to the dtype of the training target `y`
# to keep the submission column in the same format as the labels.
test_preds = rscv.predict(df_test).astype(y.dtype)

# Save the predictions in the format used for competition scoring:
# one row per test passenger, keyed by PassengerId.
output = pd.DataFrame({'PassengerId': X_test.index,
                       'Transported': test_preds})
output.to_csv('submission.csv', index=False)