In [1]:
import pandas as pd
import numpy as np
import torch

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.impute import KNNImputer
from sklearn import preprocessing


def convert_data(ofus_matrix):

    matrix = np.zeros(ofus_matrix.shape) -1

    for i in range(ofus_matrix.shape[1]):
        attributes = ofus_matrix[:,i]
        if ( isinstance(attributes[0], (int, float)) == False):
            le = LabelEncoder()
            le.fit(attributes)
            matrix[:,i] = le.transform(attributes)
        else:
            matrix[:,i] = np.nan_to_num(attributes)
    return matrix

def normalize(ofus_matrix):
    matrix = np.zeros(ofus_matrix.shape) -1
    
    for i in range(ofus_matrix.shape[1]):
        attributes = ofus_matrix[:,i]
        scaler = preprocessing.MinMaxScaler()
        matrix[:,i]=scaler.fit_transform(attributes.reshape([-1,1])).reshape(attributes.shape)
        
    return matrix

def convert(train,test):
    r, c = train.shape
    rt, ct = test.shape
    result = pd.concat([train,test])
    new_result= convert_data(result.to_numpy())
    new_result=normalize(new_result)
    #print(new_result.shape)
    new_train =new_result[0:r,]
    new_test = new_result[r:r+rt,]
    
    return (new_train,new_test)


def pre2 (train,test) :
    numbers_train = train.select_dtypes(np.number)
    cad_train = train.select_dtypes(exclude=[np.number])
    
    #age fill nans
    train['age_miss'] = train['Age'].isna().astype(int)
    test['age_miss'] = test['Age'].isna().astype(int)
    
    train.loc[train['Survived']==0,'Age'] = train.loc[train['Survived']==0,'Age'].fillna(train.loc[train['Survived']==0,'Age'].median())
    train.loc[train['Survived']==1,'Age'] = train.loc[train['Survived']==1,'Age'].fillna(train.loc[train['Survived']==1,'Age'].median())
    
    test.loc[:,'Age'] = test.loc[:,'Age'].fillna(test.loc[:,'Age'].median())
    
    #cabbin fill nans
    
    train['cabin_miss'] = train['Cabin'].isna().astype(int)
    test['cabin_miss'] = test['Cabin'].isna().astype(int)
    
    #train.loc[train['Survived']==0,'Cabin'] = train.loc[train['Survived']==0,'Cabin'].fillna(train.loc[train['Survived']==0,'Cabin'].mode()[0])
    #train.loc[train['Survived']==1,'Cabin'] = train.loc[train['Survived']==1,'Cabin'].fillna(train.loc[train['Survived']==1,'Cabin'].mode()[0])
    
    train.loc[:,'Cabin'] = train.loc[:,'Cabin'].fillna('NInfo')
    test.loc[:,'Cabin'] = test.loc[:,'Cabin'].fillna('NInfo')
    
    #train['Cabin'] = train['Cabin'].str[0]
    #test['Cabin'] = test['Cabin'].str[0]
    
    
    #fare fill nans
    
    train.loc[train['Survived']==0,'Fare'] = train.loc[train['Survived']==0,'Fare'].fillna(train.loc[train['Survived']==0,'Fare'].median())
    train.loc[train['Survived']==1,'Fare'] = train.loc[train['Survived']==1,'Fare'].fillna(train.loc[train['Survived']==1,'Fare'].median())
    
    test.loc[:,'Fare'] = test.loc[:,'Fare'].fillna(test.loc[:,'Fare'].median())
    
    #embarked fill nans
    
    train.loc[train['Survived']==0,'Embarked'] = train.loc[train['Survived']==0,'Embarked'].fillna(train.loc[train['Survived']==0,'Embarked'].mode()[0])
    train.loc[train['Survived']==1,'Embarked'] = train.loc[train['Survived']==1,'Embarked'].fillna(train.loc[train['Survived']==1,'Embarked'].mode()[0])
    
    test.loc[:,'Embarked'] = test.loc[:,'Embarked'].fillna(test.loc[:,'Embarked'].mode()[0])
    
    
    #age preprocess
    #train['AgeBand'] = train['Age']
    
    train.loc[train['Age']<5,'Age'] = 1
    train.loc[ (train['Age']>=5) & (train['Age']<=14) ,'Age'] = 2
    train.loc[ (train['Age']>14) & (train['Age']<=18) ,'Age'] = 3
    train.loc[ (train['Age']>18) & (train['Age']<=30) ,'Age'] = 4
    train.loc[ (train['Age']>30) & (train['Age']<=60) ,'Age'] = 5
    train.loc[train['Age']>60,'Age'] = 6
    train['Age'] = train['Age'].astype(int)
    
    #test['AgeBand'] = test['Age']
    
    test.loc[test['Age']<5,'Age'] = 1
    test.loc[ (test['Age']>=5) & (test['Age']<=14) ,'Age'] = 2
    test.loc[ (test['Age']>14) & (test['Age']<=18) ,'Age'] = 3
    test.loc[ (test['Age']>18) & (test['Age']<=30) ,'Age'] = 4
    test.loc[ (test['Age']>30) & (test['Age']<=60) ,'Age'] = 5
    test.loc[test['Age']>60,'Age'] = 6
    
    test['Age'] = test['Age'].astype(int)
    
    #FARE preprocess
    
    #train['FareBand'] = train['Fare']
    train.loc[ train['Fare'] <= 8.662, 'Fare'] = 1
    train.loc[(train['Fare'] > 8.662) & (train['Fare'] <= 26.0), 'Fare'] = 2
    train.loc[(train['Fare'] > 26.0) & (train['Fare'] <= 52), 'Fare']   = 3
    train.loc[ train['Fare'] > 52, 'Fare'] = 4
    train['Fare'] = train['Fare'].astype(int)
    
    #test['FareBand'] = test['Fare']
    test.loc[ test['Fare'] <= 8.662, 'Fare'] = 1
    test.loc[(test['Fare'] > 8.662) & (test['Fare'] <= 26.0), 'Fare'] = 2
    test.loc[(test['Fare'] > 26.0) & (test['Fare'] <= 52), 'Fare']   = 3
    test.loc[ test['Fare'] > 52, 'Fare'] = 4
    test['Fare'] = test['Fare'].astype(int)
    
    #NAME preprocess
    
    train['Name'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    test['Name'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    
        
    train['Name'] = train['Name'].replace(['Lady', 'Countess','Capt', 'Col',
                                           'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    train['Name'] = train['Name'].replace('Mlle', 'Miss')
    train['Name'] = train['Name'].replace('Ms', 'Miss')
    train['Name'] = train['Name'].replace('Mme', 'Mrs')
    
    
    test['Name'] = test['Name'].replace(['Lady', 'Countess','Capt', 'Col','Don',
                                         'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    test['Name'] = test['Name'].replace('Mlle', 'Miss')
    test['Name'] = test['Name'].replace('Ms', 'Miss')
    test['Name'] = test['Name'].replace('Mme', 'Mrs')
    
    #Parch and SibSp
    train['Alone'] = train['Parch'] + train['SibSp']
    train.loc[ train['Alone'] > 0 ,'Alone'] =1
    
    test['Alone'] = test['Parch'] + test['SibSp']
    test.loc[ test['Alone']>0,'Alone'] =1
    
    #DELETE
    train = train.drop(['Survived','PassengerId'], axis=1)
    test = test.drop(['PassengerId'], axis=1)
    
    return (train,test)

def pre (train,test) :
    NUMERIC_COLUMNS = train.select_dtypes(np.number)
    CATEGORICAL_COLUMNS = train.select_dtypes(exclude=[np.number])
    
    #FILL NANS TRAIN DATA
    
    for num in NUMERIC_COLUMNS:
        if (num != 'Survived') :
            train.loc[train['Survived']==0,num] = train.loc[train['Survived']==0,num].fillna(train.loc[train['Survived']==0,num].median())
            train.loc[train['Survived']==1,num] = train.loc[train['Survived']==1,num].fillna(train.loc[train['Survived']==1,num].median())
    
    for cat in CATEGORICAL_COLUMNS:
        train.loc[train['Survived']==0,cat] = train.loc[train['Survived']==0,cat].fillna(train.loc[train['Survived']==0,cat].mode()[0])
        train.loc[train['Survived']==1,cat] = train.loc[train['Survived']==1,cat].fillna(train.loc[train['Survived']==1,cat].mode()[0])
    
    #ERASE SURVIVED COLUM
    
    train  = train.drop(['Survived'], axis=1)
    
    #FILL NANS TEST DATA
    data = pd.concat([train,test])
    
    NUMERIC_COLUMNS = data.select_dtypes(np.number)
    CATEGORICAL_COLUMNS = data.select_dtypes(exclude=[np.number])
    
    for num in NUMERIC_COLUMNS:
        test.loc[:,num] = test.loc[:,num].fillna(data.loc[:,num].median())
    
    for cat in CATEGORICAL_COLUMNS:
        test.loc[:,cat] = test.loc[:,cat].fillna(data.loc[:,cat].mode()[0])
    
    
    #AGE
    """
    train.loc[ (train['Age']>=0) & (train['Age']<=25) ,'Age'] = 1
    train.loc[ (train['Age']>25) & (train['Age']<=31) ,'Age'] = 2
    train.loc[train['Age']>31,'Age'] = 3
    train['Age'] = train['Age'].astype(int)
    

    test.loc[ (test['Age']>=0) & (test['Age']<=25) ,'Age'] = 1
    test.loc[ (test['Age']>25) & (test['Age']<=31) ,'Age'] = 2
    test.loc[test['Age']>31,'Age'] = 3
    test['Age'] = test['Age'].astype(int)
    
    #FARE
    
    train.loc[ train['Fare'] <= 8.662, 'Fare'] = 1
    train.loc[(train['Fare'] > 8.662) & (train['Fare'] <= 26.0), 'Fare'] = 2
    train.loc[(train['Fare'] > 26.0), 'Fare']   = 3

    train['Fare'] = train['Fare'].astype(int)
    
    test.loc[ test['Fare'] <= 8.662, 'Fare'] = 1
    test.loc[(test['Fare'] > 8.662) & (test['Fare'] <= 26.0), 'Fare'] = 2
    test.loc[(test['Fare'] > 26.0) , 'Fare']   = 3

    test['Fare'] = train['Fare'].astype(int)
    """
    
    #NAME
    
    train['Name'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    test['Name'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    
        
    train['Name'] = train['Name'].replace(['Lady', 'Countess','Capt', 'Col',
                                           'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    train['Name'] = train['Name'].replace('Mlle', 'Miss')
    train['Name'] = train['Name'].replace('Ms', 'Miss')
    train['Name'] = train['Name'].replace('Mme', 'Mrs')
    
    
    test['Name'] = test['Name'].replace(['Lady', 'Countess','Capt', 'Col','Don',
                                         'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    test['Name'] = test['Name'].replace('Mlle', 'Miss')
    test['Name'] = test['Name'].replace('Ms', 'Miss')
    test['Name'] = test['Name'].replace('Mme', 'Mrs')
    
    #Parch and SibSp
    train['Alone'] = train['Parch'] + train['SibSp']
    train.loc[ train['Alone'] > 0 ,'Alone'] =1
    
    test['Alone'] = test['Parch'] + test['SibSp']
    test.loc[ test['Alone']>0,'Alone'] =1
    
    #DELETE
    train = train.drop(['Cabin','Ticket','PassengerId'], axis=1)
    test = test.drop(['Cabin','Ticket','PassengerId'], axis=1)

    return (train,test)

def fine_tune_xgb_sklearn(X_train, Y_train, seed):
    RS_CV = 5
    RS_N_ITER = 2
    RS_N_JOBS = -1
    n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 30)]
                    
    
    random_grid = {
                 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
                 'max_depth' : [ 3, 4, 5, 6, 8, 10,15],
                 'min_child_weight' : [ 1, 3, 5, 7,9],
                 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ,1],
                 'colsample_bytree': [ 0.3, 0.4, 0.5 ,0.6,0.7,1],
                 'n_estimators': n_estimators
                }
    
     
    clf_xgb = XGBClassifier(#n_estimators =3000,
                            verbosity=0,
                            objective='binary:logistic',
                            booster='gbtree',
                            #n_jobs=-1,
                            #nthread=None,
                            #max_delta_step=0,
                            subsample=0.7,
                            #colsample_bylevel=1,
                            #colsample_bynode=1,
                            #reg_alpha=0,
                            #reg_lambda=1,
                            #scale_pos_weight=1,
                            #base_score=0.5,
                            #random_state=0,
                            verbose=0,
                            #seed=None
                           )

    
    clf_random = RandomizedSearchCV(estimator = clf_xgb,
                                    param_distributions = random_grid,
                                    n_iter = RS_N_ITER, 
                                    cv = RS_CV,
                                    verbose=0,
                                    random_state=seed, n_jobs = RS_N_JOBS)
    
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random

def fine_tune_RCF_sklearn(X_train, Y_train, seed):
    n_estimators = [int(x) for x in np.linspace(start = 400, stop = 700, num = 14)]
    max_features = ['sqrt','log2']
    max_depth = [int(x) for x in np.linspace(10, 55, num = 10)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    
    clf = RandomForestClassifier(random_state = seed)
    clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 2, cv = 5, verbose=2, random_state=seed, n_jobs = -1)
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random
    
def fine_tune_cat_sklearn(X_train, Y_train, seed):
    
    RS_CV = 5 
    RS_N_ITER = 2
    RS_N_JOBS = -1

    iterations = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 5)]
    
    random_grid = {
                   'depth':[8,10],
                   'learning_rate': [0.1,0.01],
                   'iterations': iterations
                   }
    
    clf = CatBoostClassifier(
                            #iterations=1000,         # Reduced iterations
                            l2_leaf_reg=3.0,         # Increased L2 regularization term
                            #eval_metric='Accuracy',
                            random_seed=seed,
                            verbose=0,
                            #loss_function ='accuracy'
                           )

    clf_random = RandomizedSearchCV(estimator = clf,
                                    param_distributions = random_grid,
                                    n_iter = RS_N_ITER, 
                                    cv = RS_CV,
                                    verbose=0, random_state=seed, n_jobs = RS_N_JOBS)
    
    clf_random.fit(X_train, Y_train.astype(int))
    print(clf_random.best_params_)
    return clf_random

def tab_net(trainx,trainy,seed):
    X_train, X_valid, Y_train,Y_valid = train_test_split(trainx, trainy, test_size = 0.2, random_state =seed,stratify=trainy)
    
    tb_cls = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                               optimizer_params=dict(lr=1e-3),
                               scheduler_params={"step_size":10, "gamma":0.9},
                               scheduler_fn=torch.optim.lr_scheduler.StepLR,
                               verbose=1,
                               seed=seed,
                               mask_type='entmax' # "sparsemax" entmax
                               )

    tb_cls.fit(X_train,Y_train,
                               eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
                               eval_name=['train', 'valid'],
                               eval_metric=['auc'],
                               max_epochs=100 , patience=10,
                               batch_size=32, drop_last=False)    
    return tb_cls

train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')
submission = pd.read_csv('titanic/sub.csv')

Y_train = train['Survived'].to_numpy()

(train,test) = pre2(train,test)

(X_train,X_test)=convert(train,test)

x=X_train.copy()
y=Y_train.copy()
seed = 13
X_train, X_valid, Y_train,Y_valid = train_test_split(X_train, Y_train, test_size = 0.2, random_state =seed,stratify=Y_train)

model1 = fine_tune_xgb_sklearn ( X_train,Y_train, seed)
model2 = fine_tune_RCF_sklearn ( X_train,Y_train, seed)
model3 = fine_tune_cat_sklearn ( X_train,Y_train, seed)
model4 = tab_net(x,y,seed)
    
# Test model and generate prediction

print(model1.score(X_train,Y_train))
print(model2.score(X_train,Y_train))
print(model3.score(X_train,Y_train))

print(model1.score(X_valid,Y_valid))
print(model2.score(X_valid,Y_valid))
print(model3.score(X_valid,Y_valid))

result1= model1.predict(X_test)
result2= model2.predict(X_test)
result3= model3.predict(X_test)
result4= model4.predict(X_test)

submission['Survived'] =result1
name = 'submission01.csv'
submission.to_csv(name,index=False)

submission['Survived'] =result2
name = 'submission02.csv'
submission.to_csv(name,index=False)

submission['Survived'] =result3
name = 'submission03.csv'
submission.to_csv(name,index=False)

submission['Survived'] =result4
name = 'submission04.csv'
submission.to_csv(name,index=False)

{'n_estimators': 453, 'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.7}
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.6s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.2s finished


{'n_estimators': 446, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}




{'learning_rate': 0.01, 'iterations': 500, 'depth': 10}


  return torch._C._cuda_getDeviceCount() > 0


epoch 0  | loss: 0.76022 | train_auc: 0.62062 | valid_auc: 0.6913  |  0:00:01s
epoch 1  | loss: 0.66933 | train_auc: 0.67657 | valid_auc: 0.76522 |  0:00:01s
epoch 2  | loss: 0.63906 | train_auc: 0.70686 | valid_auc: 0.76943 |  0:00:02s
epoch 3  | loss: 0.62493 | train_auc: 0.74422 | valid_auc: 0.7747  |  0:00:02s
epoch 4  | loss: 0.59997 | train_auc: 0.76986 | valid_auc: 0.78485 |  0:00:03s
epoch 5  | loss: 0.55949 | train_auc: 0.7872  | valid_auc: 0.79881 |  0:00:03s
epoch 6  | loss: 0.56901 | train_auc: 0.80452 | valid_auc: 0.81555 |  0:00:04s
epoch 7  | loss: 0.53828 | train_auc: 0.81798 | valid_auc: 0.82661 |  0:00:04s
epoch 8  | loss: 0.53308 | train_auc: 0.82038 | valid_auc: 0.83557 |  0:00:05s
epoch 9  | loss: 0.5148  | train_auc: 0.83022 | valid_auc: 0.83597 |  0:00:06s
epoch 10 | loss: 0.48078 | train_auc: 0.83839 | valid_auc: 0.83702 |  0:00:07s
epoch 11 | loss: 0.50839 | train_auc: 0.84374 | valid_auc: 0.82688 |  0:00:07s
epoch 12 | loss: 0.4952  | train_auc: 0.85004 | vali



0.8890449438202247
0.9297752808988764
0.8268156424581006
0.8156424581005587
0.8324022346368715


In [27]:
#https://www.redcrab-software.com/en/Calculator/Softmax

w1 = 0.35
w2 = 0.33
w3 = 0.32

df1 = pd.read_csv('submission01.csv')
df2 = pd.read_csv('submission02.csv')
df3 = pd.read_csv('submission03.csv')

res = df1.values[:,1]*w1 + df2.values[:,1]*w2 + df3.values[:,1]*w3 

df_final = df1.copy()
df_final['Survived'] = res
df_final.loc[ df_final['Survived'] <0.5, 'Survived'] = 0
df_final.loc[ df_final['Survived'] >=0.5, 'Survived'] = 1
df_final['Survived']=df_final['Survived'].astype('int32')
df_final.to_csv('final.csv',index=False)