In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, plot_importance

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.ensemble import GradientBoostingClassifier,HistGradientBoostingClassifier

import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier

import re
import seaborn as sns
import math
import category_encoders as ce

2024-05-20 21:40:34.471348: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 21:40:34.471502: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 21:40:34.638978: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in file_names):
        print(full_path)

/kaggle/input/titanic-data/test_titanic.csv
/kaggle/input/titanic-data/train_titanic.csv


In [3]:
# Read the labelled and unlabelled passenger tables, then stack them so the
# feature engineering below sees every passenger at once.
train_titanic, test_titanic = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic-data/train_titanic.csv',
        '/kaggle/input/titanic-data/test_titanic.csv',
    )
)
all_data = pd.concat((train_titanic, test_titanic), axis=0)

In [4]:
# Check data: shape of each frame, then dtypes / null counts of the combined one.
for frame in (train_titanic, test_titanic, all_data):
    print(frame.shape)
print(all_data.info())

(891, 12)
(418, 11)
(1309, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB
None


In [5]:
def data_correction(df): # df: all data
    """Fill the missing ``Embarked`` and ``Fare`` values of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Embarked``, ``Pclass`` and ``Fare``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place).
    """
    # 1) A missing port of embarkation defaults to "S".
    df["Embarked"] = df["Embarked"].fillna("S")

    # 2) A missing fare becomes the mean fare of the passengers that share its
    #    Pclass and Embarked. Rows are located by mask instead of a hard-coded
    #    row/column position, so this works for any frame length or layout.
    fare_missing = df["Fare"].isna().to_numpy()
    if fare_missing.any():
        peer_mean = df.groupby(["Pclass", "Embarked"])["Fare"].transform("mean")
        # If a whole Pclass/Embarked group has no fare, use the overall mean.
        peer_mean = peer_mean.fillna(df["Fare"].mean())
        df.loc[fare_missing, "Fare"] = peer_mean.to_numpy()[fare_missing]
    return df

In [6]:
def data_conv(df):
    """Derive the first batch of engineered columns, modifying ``df`` in place.

    Reads ``Fare``, ``SibSp``, ``Parch``, ``Name`` and ``Cabin``.

    * ``Fare`` is replaced by its rank (``method='max'``).
    * ``Cabin`` is converted to ``str`` (a missing cabin becomes ``"nan"``).
    * New columns are appended in this order: ``Single``, ``Title``, ``Last``,
      ``OldName``, ``CabinA``.

    Columns are addressed by name, so the function does not depend on where
    they sit in the frame.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    print("df ",df.shape)

    # Emphasize Fare: keep only its ordering.
    df["Fare"] = df["Fare"].rank(method='max').to_numpy()

    # Single: True when the passenger has neither siblings/spouse nor parents/children aboard.
    df["Single"] = ((df["SibSp"] == 0) & (df["Parch"] == 0)).to_numpy()

    # Title: the word right after ", " in the name (e.g. "Heikkinen, Miss. Laina"),
    # with rare titles folded into Mr / Mrs / Miss / Others.
    title_alias = {
        'the': 'Others',
        'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Miss', 'Lady': 'Miss',
        'Dona': 'Mrs',
        'Sir': 'Mr', 'Dr': 'Mr', 'Rev': 'Mr', 'Capt': 'Mr', 'Col': 'Mr',
        'Don': 'Mr', 'Jonkheer': 'Mr', 'Major': 'Mr',
    }
    raw_titles = [''.join(re.findall(r',\s([a-zA-Z]+)', name)) for name in df['Name']]
    df['Title'] = [title_alias.get(title, title) for title in raw_titles]

    # Last: everything before the first comma.
    df['Last'] = [name.split(',')[0] for name in df['Name']]

    # OldName: the last word inside a trailing parenthesis, e.g.
    # "Ahlin, Mrs. Johan (Johanna Persdotter Larsson)" -> "Larsson";
    # falls back to Last when there is none.
    maiden_names = [''.join(re.findall(r'\s([a-zA-Z]+)\)', name)) for name in df['Name']]
    df['OldName'] = [
        maiden if maiden != "" else last
        for maiden, last in zip(maiden_names, df['Last'])
    ]

    # CabinA: first capital letter of the cabin string, "N" when there is none.
    df['Cabin'] = df['Cabin'].astype("str")
    df['CabinA'] = [
        (re.findall("[A-Z]", cabin) or ['N'])[0]
        for cabin in df['Cabin']
    ]

    return df

In [7]:
def data_conv2(df):
    """Add ``Couple``, ``Child``, ``Family`` and ``FamilyS``; modifies ``df`` in place.

    Reads ``Age``, ``SibSp``, ``Parch``, ``Last`` and ``Title`` by name.

    Couple detection starts only from rows whose ``Age`` is missing and whose
    ``SibSp`` is 1. Every other row with ``SibSp == 1``, the same ``Last`` and
    an age that is missing or above 18 is treated as a partner candidate:

    * if the two titles are Mr/Mrs, both rows are flagged ``Couple``;
    * if the candidate's age is known, it is copied to the starting row.

    Afterwards every ``Couple`` row gets ``Age = 33``.

    Every row is now visited. The previous ``range(shape[0] - 1)`` bounds
    skipped the last row(s), which left the last passenger with
    ``FamilyS == "L"`` regardless of family size.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    n_rows = df.shape[0]
    ages = df['Age'].to_numpy(dtype=float, copy=True)
    sibsp = df['SibSp'].tolist()
    last_names = df['Last'].tolist()
    titles = df['Title'].tolist()
    is_couple = [False] * n_rows
    spouse_titles = {('Mr', 'Mrs'), ('Mrs', 'Mr')}

    # Work on plain Python/NumPy copies; results are written back once at the end.
    for i in range(n_rows):
        if not (math.isnan(ages[i]) and sibsp[i] == 1):
            continue
        for j in range(n_rows):
            if j == i or sibsp[j] != 1 or last_names[j] != last_names[i]:
                continue
            partner_age_missing = math.isnan(ages[j])
            if not (partner_age_missing or ages[j] > 18):
                continue
            if (titles[j], titles[i]) in spouse_titles:
                is_couple[j] = True
                is_couple[i] = True
            if not partner_age_missing:
                ages[i] = ages[j]

    # Couples are assigned the fixed age 33.
    for i in range(n_rows):
        if is_couple[i]:
            ages[i] = 33

    df['Age'] = ages
    df['Couple'] = is_couple

    # Child: strictly younger than 19 (a missing age counts as not a child).
    df['Child'] = (df['Age'] < 19.0).to_numpy()

    # Family: relatives aboard (SibSp + Parch).
    df['Family'] = (df['SibSp'] + df['Parch']).to_numpy()

    # FamilyS: "S" for 0 relatives, "M" for up to 4, "L" otherwise.
    df['FamilyS'] = [
        "S" if size == 0.0 else ("M" if size <= 4.0 else "L")
        for size in df['Family']
    ]

    return df

In [8]:
# Fill Age based on title & sex
def data_fill_age(df):
    """Fill missing ``Age`` values in place using ``Title``, ``Pclass`` and ``Sex``.

    * A missing age with ``Title == 'Master'`` becomes 6.
    * Any other missing age gets a fixed value per (Pclass, Sex) pair.
    * Rows whose (Pclass, Sex) pair is not in the table keep a missing age.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # "Master" gets a child's age regardless of class.
    age_missing = df['Age'].isna().to_numpy()
    is_master = (df['Title'] == 'Master').to_numpy()
    df.loc[age_missing & is_master, 'Age'] = 6

    # Everyone else: fixed age per passenger class and sex.
    age_by_class_sex = {
        (1, "female"): 36,
        (1, "male"): 42,
        (2, "female"): 28,
        (2, "male"): 29.5,
        (3, "female"): 22,
        (3, "male"): 25,
    }
    age_missing = df['Age'].isna().to_numpy()
    pclass = df['Pclass'].to_numpy()
    sex = df['Sex'].to_numpy()
    for (klass, gender), fill_age in age_by_class_sex.items():
        df.loc[age_missing & (pclass == klass) & (sex == gender), 'Age'] = fill_age

    return df
    

In [9]:
# Fill Cabin Letter based on Fare
def data_fill_cabina(df):
    """Replace unknown cabin letters (``CabinA == "N"``) using class, port and fare.

    Reads ``CabinA``, ``Pclass``, ``Embarked`` and ``Fare`` by name and
    rewrites ``CabinA`` in place. Rows whose ``CabinA`` is not ``"N"``, or
    whose ``Pclass`` is not 1, 2 or 3, are left unchanged.

    For each (Pclass, Embarked) pair the rule is a list of
    ``(upper_bound, letter)`` bands plus a letter for fares at or above the
    last bound. The first band whose upper bound is strictly greater than the
    fare wins. A missing fare matches no band and gets the top letter.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # pclass -> ({port: (bands, top_letter)}, (bands, top_letter) for any other port)
    fare_bands = {
        3: (
            {"S": (((9.3, "F"), (12.6, "E")), "G")},
            ((), "F"),
        ),
        2: (
            {
                "S": (((12.2, "E"), (18.2, "D")), "F"),
                "Q": ((), "E"),
            },
            ((), "D"),
        ),
        1: (
            {
                "C": (((55.5, "A"), (87.4, "D"), (102.7, "E"), (136.9, "C")), "B"),
                "S": (((41.0, "T"), (46.9, "E"), (47.5, "A"), (63.2, "D"), (94.5, "B")), "C"),
            },
            ((), "C"),
        ),
    }

    filled = []
    rows = zip(df["CabinA"], df["Pclass"], df["Embarked"], df["Fare"])
    for letter, pclass, port, fare in rows:
        class_rules = fare_bands.get(pclass) if letter == "N" else None
        if class_rules is None:
            filled.append(letter)
            continue
        by_port, other_ports = class_rules
        bands, top_letter = by_port.get(port, other_ports)
        chosen = top_letter
        for upper_bound, band_letter in bands:
            if fare < upper_bound:
                chosen = band_letter
                break
        filled.append(chosen)

    df["CabinA"] = filled
    return df

In [10]:
# Reduce CabinA into 4, 2nd try
def data_simplify_cabina2(df):
    """Merge cabin letters in ``CabinA`` in place: G -> A, D/E -> B, F -> C.

    Every other value is left unchanged. The column is addressed by name
    rather than by position, so the frame layout does not matter.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    merged_letter = {"G": "A", "D": "B", "E": "B", "F": "C"}
    # One lookup per row on the ORIGINAL letter, so a merged value is never remapped.
    df["CabinA"] = [merged_letter.get(letter, letter) for letter in df["CabinA"]]
    return df

In [12]:
# Grouping
def grouping(df):
    """Assign passengers to travel groups and add group-based features.

    Rows are linked when they share a ``Ticket`` or share the same
    ``Last`` + ``Fare`` text; linked rows receive the same ``Group`` id.
    The frame is then split on ``Survived`` (present / missing), the columns
    ``Group_Count`` and ``Group_Target`` are computed with the
    ``category_encoders`` count and leave-one-out encoders fitted on the
    labelled part, and the two parts are concatenated again.

    Note: ``both`` is an alias of ``df``, so ``Name_Fare`` and ``Group`` are
    also added to the caller's frame. The returned frame is a new object
    produced by ``pd.concat``.
    """
    both = df
    # Key combining surname and fare. astype('str') turns a missing Fare into
    # the text 'nan', so such rows share the key "<Last>nan".
    both['Name_Fare'] = both['Last'] + both['Fare'].astype('str')
    
    # Group w/ Name & Fare
    def process_name(i, name_fare):
        """Give group id ``i`` to all still-ungrouped rows with this Name_Fare, then follow their tickets."""
        tickets = both.loc[(both['Name_Fare'] == name_fare) & (both['Group'].isnull()),'Ticket'].unique().tolist()
        both.loc[(both['Name_Fare'] == name_fare) & (both['Group'].isnull()), 'Group'] = i
        for ticket in tickets:
            process_ticket(i, ticket)

    # Group w/ Ticket        
    def process_ticket(i, ticket):
        """Give group id ``i`` to all still-ungrouped rows with this Ticket, then follow their Name_Fare keys."""
        name_fares = both.loc[(both['Ticket'] == ticket) & (both['Group'].isnull()),'Name_Fare'].unique().tolist()
        both.loc[(both['Ticket'] == ticket) & (both['Group'].isnull()), 'Group'] = i
        for name_fare in name_fares:
            process_name(i, name_fare)

    # Every row starts ungrouped; the helpers only touch rows whose Group is still null.
    both['Group'] = None

    # Group w/ Ticket (recursive: process_ticket and process_name call each other).
    # The list comprehension is used only for its side effects; its result is discarded.
    [process_ticket(i, ticket) for i, ticket in enumerate(both['Ticket'].unique().tolist())]
    
    # Labelled rows (Survived present) vs. unlabelled rows (Survived missing).
    # NOTE(review): these are slices of `both`; the column assignments below trigger
    # pandas' SettingWithCopyWarning (visible in the cell output). Consider `.copy()`.
    train = both[both['Survived'].notna()]
    test  = both[both['Survived'].isna()]
    
    # Group size as counted on the labelled rows only.
    # NOTE(review): presumably handle_unknown=0 makes unseen groups count as 0 before
    # the "+ 1" below - confirm against the category_encoders documentation.
    count_encoder = ce.CountEncoder(cols=['Group'], handle_unknown=0, return_df=True)
    train['Group_Count'] = count_encoder.fit_transform(train['Group'])
    test['Group_Count'] = count_encoder.transform(test['Group']).astype('int') + 1  
    # Add 1 (applied to the unlabelled rows only)
    # Leave-one-out target encoding of Group against Survived, fitted on the labelled rows.
    te = ce.LeaveOneOutEncoder(cols=['Group'])
    train['Group_Target'] = te.fit_transform(train['Group'], train['Survived'])
    test['Group_Target'] = te.transform(test['Group'])

    # Unlabelled rows with Group_Count == 2: overwrite Group_Target with the Survived
    # value(s) of the labelled rows in the same group.
    # NOTE(review): the right-hand side is a Series, not a scalar - confirm this
    # assigns the intended single value.
    for index, row in test.query('Group_Count == 2').iterrows():
        test.at[index, 'Group_Target'] = train[train['Group']==row['Group']]['Survived']
    # NOTE(review): Group_Count had 1 added above, so `== 0` presumably never matches;
    # possibly `== 1` was intended. 0.384 looks like a prior survival rate - confirm.
    test.loc[test['Group_Count'] == 0, 'Group_Target'] = 0.384
    
    # Stack labelled rows first, then unlabelled rows.
    both = pd.concat([train,test], axis=0)
    
    return both

In [13]:
# Add side of cabin 0,1,-1(unknown) #
def add_side(df):
    """Add a ``Side`` column derived from the parity of the cabin number.

    ``Cabin`` is first normalised in place: multi-cabin strings listed in
    ``replaces`` collapse to a single cabin, and bare or empty entries become
    ``"N"``. ``Side`` is then:

    * ``"R"`` for an even, non-zero cabin number,
    * ``"L"`` for an odd cabin number,
    * ``"N"`` (unknown) for ``"N"``, ``"nan"``, a cabin number of 0, or any
      cabin whose text after the first character is not a number. The last
      case previously raised ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    replaces = {'B51 B53 B55': 'B55', 'B52 B54 B56': 'B56', 'B57 B59 B63 B66': 'B66', 'B58 B60': 'B60', 
            'B82 B84': 'B84', 'B96 B98': 'B98', 'C22 C26': 'C26', 'C23 C25 C27': 'C27', 'C55 C57': 'C57',
            'C62 C64': 'C64', 'D10 D12': 'D12', 'E39 E41': 'E41', 'F E46': 'E46', 'F E57': 'E57',
            'F E69': 'E69', 'F G63': 'G63', 'F G73': 'G73', 'F': 'N', 'D': 'N', ' ': 'N', 'T': 'N', np.nan: 'N'}
    df["Cabin"] = df["Cabin"].replace(replaces)

    def _side_of(cabin):
        """Map one normalised cabin string to 'R', 'L' or 'N'."""
        # "nan" is what a missing cabin looks like once the column has been cast to str.
        if not isinstance(cabin, str) or cabin == "N" or cabin[1:] == "an":
            return 'N'
        try:
            number = float(cabin[1:])
        except ValueError:
            return 'N'  # unparsable cabin text: side unknown
        if number == 0.0:
            return 'N'
        return "R" if number % 2 == 0.0 else "L"

    df["Side"] = [_side_of(cabin) for cabin in df["Cabin"]]

    return df

In [14]:
# Add Elder (0 or 1)
def add_elder(df):
    """Add a boolean ``Elder`` column (True where ``Age`` is above 60) and return ``df``."""
    older_than_sixty = df['Age'] > 60  # a missing age compares as False
    df['Elder'] = older_than_sixty
    return df

In [15]:
# Process featuring
# Each step receives the frame returned by the previous one. The step order matters:
# several helpers address columns by position, which depends on the order in which
# earlier steps append their columns.
df31 = data_correction(all_data) # fill missing Embarked with "S" (and the missing Fare)
df32 = data_conv(df31) # add Last, Title and the other name/cabin-derived columns
df33 = data_simplify_cabina2(df32) # merge the CabinA letters into fewer categories
#df33.to_csv('./temp1.csv', index=False)
# NOTE(review): data_fill_age runs before data_conv2, whose couple search only starts
# from rows with a missing Age - so it presumably finds nothing for rows filled here. Confirm the intended order.
df34 = data_fill_age(df33) # Master -> age 6, Couple -> age 33, everything else as before
#df34.to_csv('./temp1.csv', index=False)
df35a = data_conv2(df34)  # build Couple from Last (plus Child / Family / FamilyS)
# Add which side of the ship the cabin is on
df35c = add_side(df35a)
# Add the elder flag
df35d = add_elder(df35c)
# Grouping
df35 = grouping(df35d)
# NOTE(review): the submission cell later writes to the same './temp1.csv' path.
df35.to_csv('./temp1.csv', index=False)

df  (1309, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        1309 non-null   object 
 11  Embarked     1309 non-null   object 
 12  Single       1309 non-null   bool   
 13  Title        1309 non-null   object 
 14  Last         1309 non-null   object 
 15  OldName      1309 non-null   object 
 16  CabinA       1309 non-null   object 
 17  Couple       1309 non-null   bool   
dtypes: bool(2), float64(3), int64(4), objec

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Group_Count'] = count_encoder.fit_transform(train['Group'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Group_Count'] = count_encoder.transform(test['Group']).astype('int') + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.p

In [16]:
# Generate Dummy: one-hot encode the categorical features (first level dropped).
dummy_columns = ['Single','Pclass','Sex','Title','Embarked','CabinA','Child','FamilyS']
df35z = pd.get_dummies(df35, columns=dummy_columns, drop_first=True)

# Drop Columns: keep only labelled rows and remove identifiers / helper columns.
unused_columns = [
    'SibSp','Parch','Name_Fare','Group','Last','Name','Ticket','PassengerId',
    'OldName','Couple','Cabin','Family','Side','Elder',
]
has_label = df35z['Survived'].notna()
df36 = df35z.loc[has_label].drop(columns=unused_columns)


In [17]:
# Correlation matrix of the label and the main features, shaded on one global colour scale.
corr_columns = [
    "Survived", "Pclass_2", "Pclass_3", "Age", "Fare", "Single_True", "Sex_male",
    "Title_Miss", "Title_Mr", "Title_Mrs", "Title_Others",
    "CabinA_B", "CabinA_C", "CabinA_N", "Child_True", "FamilyS_M", "FamilyS_S",
]
df36[corr_columns].corr().style.background_gradient(axis=None)

Unnamed: 0,Survived,Pclass_2,Pclass_3,Age,Fare,Single_True,Sex_male,Title_Miss,Title_Mr,Title_Mrs,Title_Others,CabinA_B,CabinA_C,CabinA_N,Child_True,FamilyS_M,FamilyS_S
Survived,1.0,0.093349,-0.322308,-0.059773,0.321521,-0.203367,-0.543351,0.341294,-0.563879,0.33904,0.04247,0.28549,0.130096,-0.316912,0.107582,0.263023,-0.203367
Pclass_2,0.093349,1.0,-0.56521,0.027979,0.084092,-0.03907,-0.064746,-0.024628,-0.068603,0.121239,-0.0171,-0.12653,-0.069875,0.172413,-0.00401,0.091023,-0.03907
Pclass_3,-0.322308,-0.56521,1.0,-0.364366,-0.62008,0.129472,0.137143,-0.005815,0.103925,-0.174671,-0.037138,-0.399679,-0.287103,0.539291,0.118014,-0.209764,0.129472
Age,-0.059773,0.027979,-0.364366,1.0,0.153377,0.183461,0.096007,-0.266467,0.263998,0.174656,0.009736,0.223803,0.102975,-0.280397,-0.603444,-0.09389,0.183461
Fare,0.321521,0.084092,-0.62008,0.153377,1.0,-0.527475,-0.257334,0.085733,-0.307422,0.247221,0.049397,0.38209,0.308005,-0.534901,0.099061,0.434395,-0.527475
Single_True,-0.203367,-0.03907,0.129472,0.183461,-0.527475,1.0,0.303646,-0.049021,0.411492,-0.365454,0.027216,-0.107248,-0.121132,0.158029,-0.313578,-0.892993,1.0
Sex_male,-0.543351,-0.064746,0.137143,0.096007,-0.257334,0.303646,1.0,-0.698647,0.905908,-0.5476,-0.045439,-0.145489,-0.05712,0.140391,-0.112678,-0.290733,0.303646
Title_Miss,0.341294,-0.024628,-0.005815,-0.266467,0.085733,-0.049021,-0.698647,1.0,-0.636265,-0.208197,-0.017276,0.070615,0.008989,-0.053692,0.23267,0.026492,-0.049021
Title_Mr,-0.563879,-0.068603,0.103925,0.263998,-0.307422,0.411492,0.905908,-0.636265,1.0,-0.498706,-0.041382,-0.115092,-0.062934,0.121126,-0.320985,-0.334969,0.411492
Title_Mrs,0.33904,0.121239,-0.174671,0.174656,0.247221,-0.365454,-0.5476,-0.208197,-0.498706,1.0,-0.013541,0.100284,0.069948,-0.1183,-0.114998,0.373547,-0.365454


In [18]:
# Prepare Train & Test data w/ Survived:
# features X / label y from the labelled rows, then a stratified 90/10 split.
df02 = df36.loc[df36['Survived'].notna()]
X = df02.drop(columns=['Survived'])
y = df02['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    stratify=y,
    random_state=0,
)

In [19]:
# 1) XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', random_state=0)

# Hyperparameter grid to search (trailing notes are values from earlier runs).
cv_params = {'max_depth': [6,7,8], #best [2,5,7,9]
             'min_child_weight': [2,3,4],#best [3,5,7,9]
             'learning_rate': [0.05,0.1],
             'n_estimators': [80,81,82], #best 20,40,60,80,
             'eval_metric':['auc']
             }

# Scoring metrics to capture. GridSearchCV documents a list/tuple (or dict) for
# multi-metric scoring, so a list is used instead of a set.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 4-fold grid search; the final model is refit on the best mean accuracy.
xgb_cv = GridSearchCV(xgb, cv_params, scoring=scoring, cv=4, refit='accuracy')#f1

In [20]:
%%time
# Run the XGBoost grid search, then report the best mean CV accuracy and its parameters.
xgb_cv.fit(X_train, y_train)
print(xgb_cv.best_score_, xgb_cv.best_params_, sep='\n')

0.8677114427860697
{'eval_metric': 'auc', 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 80}
CPU times: user 1min 29s, sys: 1.6 s, total: 1min 31s
Wall time: 23.7 s


In [21]:
# 3) Gradient Boosting classifier
gbc = GradientBoostingClassifier(loss='log_loss',random_state=0)

# Hyperparameter grid to search (trailing notes are values from earlier runs).
cv_params = {'max_depth': [1,2,3],# best [2,3,4,5]
             'learning_rate': [0.05,0.1],
             'n_estimators': [78,79,80]  #best [60,80,100]
             }

# Scoring metrics to capture. GridSearchCV documents a list/tuple (or dict) for
# multi-metric scoring, so a list is used instead of a set.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 4-fold grid search; the final model is refit on the best mean accuracy.
gbc_cv = GridSearchCV(gbc, cv_params, scoring=scoring, cv=4, refit='accuracy')#f1

In [22]:
%%time
# Run the Gradient Boosting grid search, then report the best mean CV accuracy and its parameters.
gbc_cv.fit(X_train, y_train)
print(gbc_cv.best_score_, gbc_cv.best_params_, sep='\n')

0.862723880597015
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 78}
CPU times: user 8.17 s, sys: 5.78 ms, total: 8.18 s
Wall time: 8.18 s


In [23]:
#5 HGBC : HistGradientBoostingClassifier
hgbc = HistGradientBoostingClassifier(loss='log_loss',random_state=0)

# Hyperparameter grid to search (trailing notes are values from earlier runs).
cv_params = {'max_depth': [5,6,7],#best [2,3,4,5]
             'learning_rate': [0.05,0.1],
             'max_iter': [98,99,100,101,102] #[50,100,150]
             }

# Scoring metrics to capture. GridSearchCV documents a list/tuple (or dict) for
# multi-metric scoring, so a list is used instead of a set.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 4-fold grid search; the final model is refit on the best mean accuracy.
hgbc_cv = GridSearchCV(hgbc, cv_params, scoring=scoring, cv=4, refit='accuracy')#f1

In [24]:
%%time
# Run the HistGradientBoosting grid search, then report the best mean CV accuracy and its parameters.
hgbc_cv.fit(X_train, y_train)
print(hgbc_cv.best_score_, hgbc_cv.best_params_, sep='\n')

0.8714427860696518
{'learning_rate': 0.05, 'max_depth': 5, 'max_iter': 98}
CPU times: user 2min 28s, sys: 3.31 s, total: 2min 31s
Wall time: 38.8 s


In [25]:
# Ensemble learning2, voting,
# Hard vote on the hold-out split: average the three 0/1 predictions and
# call it 1 when at least half of the vote (i.e. two of three models) says 1.
model1 = xgb_cv    # XGB
model2 = hgbc_cv   # HGBC
model3 = gbc_cv    # GBC

y_pred1, y_pred2, y_pred3 = (
    search.best_estimator_.predict(X_test)
    for search in (model1, model2, model3)
)

mean_vote = (y_pred1 + y_pred2 + y_pred3) / 3
y_pred = np.array(mean_vote >= 0.5, dtype='int')
print(y_pred)

[0 1 0 1 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0
 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1]


In [26]:
# Hold-out metrics of the voted prediction, three decimals each.
reported_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric_fn in reported_metrics:
    print(label, "%.3f" % metric_fn(y_test, y_pred))

Accuracy: 0.811
Precision: 0.781
Recall: 0.714
F1 Score: 0.746


In [27]:
# Same one-hot encoding as for training, but keep the unlabelled rows
# (Survived missing) and drop the label together with the unused columns.
df37 = pd.get_dummies(
    df35,
    columns=['Single','Pclass','Sex','Title','Embarked','CabinA','Child','FamilyS'],
    drop_first=True,
)
df02 = df37.loc[df37['Survived'].isna()]
X_test_final = df02.drop(
    columns=[
        'Survived','SibSp','Parch','Name_Fare','Group','Last','Name','Ticket',
        'PassengerId','OldName','Couple','Cabin','Family','Side','Elder',
    ]
)

In [28]:
# Ensemble 2, voting

def ensamble2(X, m1, m2, m3):
    """Hard-vote the predictions of three fitted search objects.

    Each of ``m1``, ``m2``, ``m3`` must expose ``best_estimator_.predict``.
    The three 0/1 predictions for ``X`` are averaged; an average of at least
    0.5 becomes 1, anything lower becomes 0.

    Returns
    -------
    numpy.ndarray
        Integer array of voted labels.
    """
    votes = [search.best_estimator_.predict(X) for search in (m1, m2, m3)]
    mean_vote = (votes[0] + votes[1] + votes[2]) / 3
    return np.array(mean_vote >= 0.5, dtype='int')
 
# Voted predictions for the unlabelled rows, using the three fitted grid searches.
y_pred_final = ensamble2(X_test_final, xgb_cv,hgbc_cv ,gbc_cv)

In [31]:
# Show the shape of the final prediction array.
y_pred_final.shape

(418,)

In [29]:
# Pair each unlabelled passenger id with its voted prediction and write the result as CSV.
submission_columns = {
    "PassengerId": test_titanic["PassengerId"],
    "Survived": y_pred_final,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('./temp1.csv', index=False)