In [2]:
#Link:https://www.kaggle.com/competitions/spaceship-titanic/leaderboard

import pandas as pd
import numpy as np
import torch

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn import preprocessing

def convert_data(ofus_matrix):

    matrix = np.zeros(ofus_matrix.shape) -1

    for i in range(ofus_matrix.shape[1]):
        attributes = ofus_matrix[:,i]
        if ( isinstance(attributes[0], (int, float)) == False):
            le = LabelEncoder()
            le.fit(attributes)
            matrix[:,i] = le.transform(attributes)
        else:
            matrix[:,i] = np.nan_to_num(attributes)
    return matrix

def normalize(ofus_matrix):
    
    matrix = np.zeros(ofus_matrix.shape) -1
    for i in range(ofus_matrix.shape[1]):
        attributes = ofus_matrix[:,i]
        scaler = preprocessing.MinMaxScaler()
        matrix[:,i]=scaler.fit_transform(attributes.reshape([-1,1])).reshape(attributes.shape)
    return matrix

def convert(train,test):
    r, c = train.shape
    rt, ct = test.shape
    result = pd.concat([train,test])
    new_result= convert_data(result.to_numpy())
    #new_result = normalize(new_result)
    new_train =new_result[0:r,]
    new_test = new_result[r:r+rt,]
    
    return (new_train,new_test)
    

def pre (train,test) :
    numbers_train = train.select_dtypes(np.number)
    object_train = train.select_dtypes(exclude=[np.number]) #bool and strings
    
    #fill nans with moda and median TRAIN
    for numbers in numbers_train:
         train.loc[:,numbers] = train.loc[:,numbers].fillna(train.loc[:,numbers].median())
            
    for obj in object_train:
         train.loc[:,obj] = train.loc[:,obj].fillna(train.loc[:,obj].mode()[0])
            
            
    numbers_test = test.select_dtypes(np.number)
    object_test = test.select_dtypes(exclude=[np.number]) #bool and strings
    
    #fill nans with moda and median TRAIN
    for numbers in numbers_test:
         test.loc[:,numbers] = test.loc[:,numbers].fillna(test.loc[:,numbers].median())

    for obj in object_test:
         test.loc[:,obj] = test.loc[:,obj].fillna(test.loc[:,obj].mode()[0])
    
    # Adding group and number columns to test and train
    
    train['group'] = train.PassengerId.str.extract( r'(\d+)\_', expand=False)
    train['number'] = train.PassengerId.str.extract( r'\_(\d+)', expand=False)

    test['group'] = test.PassengerId.str.extract( r'(\d+)\_', expand=False)
    test['number'] = test.PassengerId.str.extract( r'\_(\d+)', expand=False)
    
    train = train.drop(['PassengerId'], axis=1)
    test = test.drop(['PassengerId'], axis=1)
    
    #adding deck,num and side columns to test and train
    
    train['deck'] = train.Cabin.str.extract( '([A-Za-z]+)\/', expand=False)
    train['num'] = train.Cabin.str.extract( r'(\d+)', expand=False)
    train['side'] = train.Cabin.str.extract( '\/([A-Za-z]+)', expand=False)
    
    test['deck'] = test.Cabin.str.extract( '([A-Za-z]+)\/', expand=False)
    test['num'] = test.Cabin.str.extract( r'(\d+)', expand=False)
    test['side'] = test.Cabin.str.extract( '\/([A-Za-z]+)', expand=False)

    train = train.drop(['Cabin'], axis=1)
    test = test.drop(['Cabin'], axis=1)
    
    #NAME
    train['Name'] = train.Name.str.extract( '(\ [A-Za-z]+)', expand=False)
    test['Name'] = test.Name.str.extract( '(\ [A-Za-z]+)', expand=False)
    
    #AGE 
    
    train.loc[train['Age']<22,'Age'] = 1
    train.loc[ (train['Age']>=22) & (train['Age']<=33) ,'Age'] = 2
    train.loc[ (train['Age']>33)  ,'Age'] = 3
    train['Age'] = train['Age'].astype(int)

    test.loc[test['Age']<22,'Age'] = 1
    test.loc[ (test['Age']>=22) & (test['Age']<=33) ,'Age'] = 2
    test.loc[ (test['Age']>33) ,'Age'] = 3
    test['Age'] = test['Age'].astype(int)

    """
    #Roomservice
    
    #erase transported column
    train_aux = train.copy()
    train_aux  = train_aux.drop(['Transported'], axis=1)

    #join all data
    #result = pd.concat([train_aux,test])

    train.loc[train['RoomService']<1.0,'RoomService'] = 0
    train.loc[train['RoomService']>=1.0 ,'RoomService'] = 1
    
    test.loc[test['RoomService']<1.0,'RoomService'] = 0
    test.loc[test['RoomService']>=1.0 ,'RoomService'] = 1
    
    train['RoomService'] = train['RoomService'].astype(int)
    test['RoomService'] = test['RoomService'].astype(int)
    
    #FoodCourt
    
    train.loc[train['FoodCourt']<2.0,'FoodCourt'] = 0
    train.loc[train['FoodCourt']>=2.0 ,'FoodCourt'] = 1
    
    test.loc[test['FoodCourt']<2.0,'FoodCourt'] = 0
    test.loc[test['FoodCourt']>=2.0 ,'FoodCourt'] = 1
    
    train['FoodCourt'] = train['FoodCourt'].astype(int)
    test['FoodCourt'] = test['FoodCourt'].astype(int)
    
    #ShoppingMall
    train.loc[train['ShoppingMall']<0.667,'ShoppingMall'] = 0
    train.loc[train['ShoppingMall']>=0.667 ,'ShoppingMall'] = 1
    
    test.loc[test['ShoppingMall']<0.667,'ShoppingMall'] = 0
    test.loc[test['ShoppingMall']>=0.667 ,'ShoppingMall'] = 1
    
    train['ShoppingMall'] = train['ShoppingMall'].astype(int)
    test['ShoppingMall'] = test['ShoppingMall'].astype(int)
    
    #Spa
    
    train.loc[train['Spa']<3,'Spa'] = 0
    train.loc[train['Spa']>=3 ,'Spa'] = 1
    
    test.loc[test['Spa']<3,'Spa'] = 0
    test.loc[test['Spa']>=3,'Spa'] = 1
    
    train['Spa'] = train['Spa'].astype(int)
    test['Spa'] = test['Spa'].astype(int)
    
    #VRDeck
    train.loc[train['VRDeck']<1,'VRDeck'] = 0
    train.loc[train['VRDeck']>=1 ,'VRDeck'] = 1
    
    test.loc[test['VRDeck']<1,'VRDeck'] = 0
    test.loc[test['VRDeck']>=1 ,'VRDeck'] = 1
    
    train['VRDeck'] = train['VRDeck'].astype(int)
    test['VRDeck'] = test['VRDeck'].astype(int)
    
    """
    return (train,test)


def fine_tune_xgb_sklearn(X_train, Y_train, seed):
    RS_CV = 5
    RS_N_ITER = 2
    RS_N_JOBS = -1
    n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 30)]
                    
    
    random_grid = {
                 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
                 'max_depth' : [ 3, 4, 5, 6, 8, 10,15],
                 'min_child_weight' : [ 1, 3, 5, 7,9],
                 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ,1],
                 'colsample_bytree': [ 0.3, 0.4, 0.5 ,0.6,0.7,1],
                 'n_estimators': n_estimators}
    
     
    clf_xgb = XGBClassifier(#n_estimators =3000,
                            verbosity=1,
                            objective='binary:logistic',
                            booster='gbtree',
                            #n_jobs=-1,
                            #nthread=None,
                            #max_delta_step=0,
                            subsample=0.7,
                            #colsample_bylevel=1,
                            #colsample_bynode=1,
                            #reg_alpha=0,
                            #reg_lambda=1,
                            #scale_pos_weight=1,
                            #base_score=0.5,
                            #random_state=0,
                            verbose=0,
                            #seed=None
                           )

    
    clf_random = RandomizedSearchCV(estimator = clf_xgb,
                                    param_distributions = random_grid,
                                    n_iter = RS_N_ITER, 
                                    cv = RS_CV,
                                    verbose=0,
                                    random_state=seed, n_jobs = RS_N_JOBS)
    
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random

def tab_net(trainx,trainy,seed):
    X_train, X_valid, Y_train,Y_valid = train_test_split(trainx, trainy, test_size = 0.25, random_state =seed,stratify=trainy)
    
    tb_cls = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                               optimizer_params=dict(lr=1e-3),
                               scheduler_params={"step_size":10, "gamma":0.9},
                               scheduler_fn=torch.optim.lr_scheduler.StepLR,
                               verbose=1,
                               seed=seed,
                               mask_type='entmax' # "sparsemax" entmax
                               )

    tb_cls.fit(X_train,Y_train,
                               eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
                               eval_name=['train', 'valid'],
                               eval_metric=['auc'],
                               max_epochs=100 , patience=10,
                               batch_size=32, drop_last=False)    
    return tb_cls

def fine_tune_RCF_sklearn(X_train, Y_train, seed):
    n_estimators = [int(x) for x in np.linspace(start = 400, stop = 700, num = 14)]
    max_features = ['sqrt','log2']
    max_depth = [int(x) for x in np.linspace(10, 55, num = 10)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    
    clf = RandomForestClassifier(random_state = seed)
    clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 2, cv = 5, verbose=2, random_state=seed, n_jobs = -1)
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random
    
def fine_tune_cat_sklearn(X_train, Y_train, seed):
    
    RS_CV = 5 
    RS_N_ITER = 2
    RS_N_JOBS = -1

    iterations = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 5)]
    
    random_grid = {
                   'depth':[8,10],
                   'learning_rate': [0.1,0.01],
                   'iterations': iterations
                   }
    
    clf = CatBoostClassifier(
                            #iterations=1000,         # Reduced iterations
                            l2_leaf_reg=3.0,         # Increased L2 regularization term
                            #eval_metric='Accuracy',
                            random_seed=seed,
                            verbose=0,
                            #loss_function ='accuracy'
                           )

    clf_random = RandomizedSearchCV(estimator = clf,
                                    param_distributions = random_grid,
                                    n_iter = RS_N_ITER, 
                                    cv = RS_CV,
                                    verbose=0, random_state=seed, n_jobs = RS_N_JOBS)
    
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sub.csv')


(train,test) = pre(train,test)


Y_train = train['Transported'].to_numpy()
train  = train.drop(['Transported'], axis=1)

(X_train,X_test)=convert(train,test)

x=X_train.copy()
y=Y_train.copy()
seed = 13
X_train, X_valid, Y_train,Y_valid = train_test_split(X_train, Y_train, test_size = 0.1, random_state =seed,stratify=Y_train)

model1 = fine_tune_xgb_sklearn ( X_train,Y_train, seed)
model2 = tab_net(x,y,seed)
model3 = fine_tune_RCF_sklearn ( X_train,Y_train, seed)
model4 = fine_tune_cat_sklearn ( X_train,Y_train, seed)

print("XGBOOST")
print(model1.score(X_train,Y_train))
print(model1.score(X_valid,Y_valid))

print("Decision tree")
print(model3.score(X_train,Y_train))
print(model3.score(X_valid,Y_valid))

print("catBOOST")
print(model4.score(X_train,Y_train))
print(model4.score(X_valid,Y_valid))

result1= model1.predict(X_test)
result2= model2.predict(X_test)
result3= model3.predict(X_test)
result4= model4.predict(X_test)

submission['Transported'] =result1
submission['Transported'] =submission['Transported'].replace(1,True)
submission['Transported'] =submission['Transported'].replace(0,False)
name = 'submission01.csv'
submission.to_csv(name,index=False)

submission['Transported'] =result2
submission['Transported'] =submission['Transported'].replace(1,True)
submission['Transported'] =submission['Transported'].replace(0,False)
name = 'submission02.csv'
submission.to_csv(name,index=False)

submission['Transported'] =result3
submission['Transported'] =submission['Transported'].replace(1,True)
submission['Transported'] =submission['Transported'].replace(0,False)
name = 'submission03.csv'
submission.to_csv(name,index=False)

submission['Transported'] =result4
submission['Transported'] =submission['Transported'].replace(1,True)
submission['Transported'] =submission['Transported'].replace(0,False)
name = 'submission04.csv'
submission.to_csv(name,index=False)


Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


{'n_estimators': 453, 'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.7}




epoch 0  | loss: 0.70018 | train_auc: 0.70585 | valid_auc: 0.66829 |  0:00:05s
epoch 1  | loss: 0.61262 | train_auc: 0.75813 | valid_auc: 0.73304 |  0:00:11s
epoch 2  | loss: 0.57045 | train_auc: 0.8055  | valid_auc: 0.78089 |  0:00:16s
epoch 3  | loss: 0.54513 | train_auc: 0.82697 | valid_auc: 0.80113 |  0:00:21s
epoch 4  | loss: 0.53711 | train_auc: 0.84074 | valid_auc: 0.81472 |  0:00:27s
epoch 5  | loss: 0.51905 | train_auc: 0.8497  | valid_auc: 0.82351 |  0:00:32s
epoch 6  | loss: 0.51527 | train_auc: 0.85631 | valid_auc: 0.8305  |  0:00:38s
epoch 7  | loss: 0.4935  | train_auc: 0.86232 | valid_auc: 0.83723 |  0:00:43s
epoch 8  | loss: 0.49431 | train_auc: 0.86614 | valid_auc: 0.83787 |  0:00:48s
epoch 9  | loss: 0.48221 | train_auc: 0.8676  | valid_auc: 0.84162 |  0:00:54s
epoch 10 | loss: 0.4863  | train_auc: 0.86925 | valid_auc: 0.84364 |  0:01:00s
epoch 11 | loss: 0.4747  | train_auc: 0.87092 | valid_auc: 0.84584 |  0:01:05s
epoch 12 | loss: 0.47561 | train_auc: 0.87397 | vali



Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    8.0s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.4s finished


{'n_estimators': 446, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}




{'learning_rate': 0.01, 'iterations': 500, 'depth': 10}
XGBOOST
0.999616515403298
0.7781609195402299
Decision tree
0.9126933401508373
0.7942528735632184
catBOOST
0.8646299373641826
0.7793103448275862
