In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
import category_encoders as ce
%matplotlib inline

In [2]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
tr = pd.concat([df_train.drop(['Survived'],axis=1), df_test])
tr.index = tr['PassengerId']
tr.head()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
tr.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [50]:
# Identify empty columns.
tr[tr.columns[tr.isnull().any()]].isnull().sum()

Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [3]:
# Make a title.
def create_name_title(tr):
    tr['NameTitle'] = tr['Name'].str.extract(r'([A-Za-z]+)\.')
    tr['NameTitle'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                            'Rare',inplace=True)
    tr['NameTitle'].replace(['Mlle', 'Ms', 'Mme'],
                            ['Miss', 'Miss', 'Mrs'],inplace=True)
create_name_title(df_train)
create_name_title(tr)
df_train.groupby('NameTitle')['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,mean,size
NameTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0.575,40
Miss,0.702703,185
Mr,0.156673,517
Mrs,0.793651,126
Rare,0.347826,23


In [4]:
# Fill the empty age.
tr_age_mean = tr.groupby('NameTitle')['Age'].mean()
tr_age_null = tr['Age'].isnull() 

tr_age_mean_1 = tr_age_mean[tr.loc[tr_age_null, 'NameTitle']]
tr_age_mean_1.index = tr[tr_age_null].index
tr_age_mean_1
tr.loc[tr_age_null, 'Age'] = tr_age_mean_1
tr[tr['Age'].isnull()]

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameTitle
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [5]:
# Make age group features.
band = [0,9,18,27,36,45,53,62,71,100]
df_train['AgeBand'] = pd.cut(df_train['Age'],band)
tr['AgeBand'] = pd.cut(tr['Age'], band)
df_train.groupby('AgeBand')['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,mean,size
AgeBand,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 9]",0.612903,62
"(9, 18]",0.415584,77
"(18, 27]",0.358586,198
"(27, 36]",0.423077,182
"(36, 45]",0.369565,92
"(45, 53]",0.415094,53
"(53, 62]",0.371429,35
"(62, 71]",0.153846,13
"(71, 100]",0.5,2


In [6]:
# Fill the empty Fare.
tr.loc[tr['Fare'].isnull(), 'Fare'] = tr.query('Age>60 & Pclass==3')['Fare'].mean()

In [7]:
# Fill the empty Embarked.
display(tr.loc[tr['Embarked'].isnull()])     
display(tr.groupby(['Embarked']).apply(lambda x: pd.Series(dict(
    p_1 = (x.Pclass == 1).sum(),
    p_2 = (x.Pclass == 2).sum(),
    p_3 = (x.Pclass == 3).sum(),
    cabin = (x.Cabin == 'B28' ).sum(),
    fare_down = (x.Fare < 80).sum(),
    fare_up = (x.Fare > 80).sum(),
    SibSp = (x.SibSp == 0).sum(),
    Parch = (x.Parch == 0).sum()
))))
tr.loc[tr['Embarked'].isnull(), 'Embarked'] = 'S'    

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameTitle,AgeBand
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
62,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss,"(36, 45]"
830,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs,"(53, 62]"


Unnamed: 0_level_0,p_1,p_2,p_3,cabin,fare_down,fare_up,SibSp,Parch
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,141,28,101,0,208,62,171,196
Q,3,7,113,0,120,3,100,114
S,177,242,495,0,864,50,618,690


In [8]:
# PCA Method
from sklearn.decomposition import PCA

def dummy_to_pca(tr, column_name:str, features) :
    max_seq = 300
    max_d = 15
    col_count = tr.groupby(column_name)[column_name].count()
    if len(col_count) > max_seq:
        tops = col_count.sort_values(ascending=False)[0:max_seq].index
        f =tr.loc[tr[column_name].isin(tops)][['PassengerId', column_name]]
    else:
        tops = col_count.index
        f =tr[['PassengerId', column_name]]
    f = pd.get_dummies(f, columns=[column_name])  # This method performs One-hot-encoding
    f = f.groupby('PassengerId').mean()
    if len(tops) < max_d:
        max_d = len(tops)
    pca = PCA(n_components=max_d)
    pca.fit(f)
    cumsum = np.cumsum(pca.explained_variance_ratio_) #분산의 설명량을 누적합
    print(cumsum)
    num_d = np.argmax(cumsum >= 0.99) + 1 # 분산의 설명량이 99%이상 되는 차원의 수
    if num_d == 1:
        num_d = max_d
    pca = PCA(n_components=num_d)    
    result = pca.fit_transform(f)
    result = pd.DataFrame(result)
    result.columns = [column_name + '_' + str(column) for column in result.columns]
    result.index = f.index
    return pd.concat([features, result], axis=1, join_axes=[features.index])

In [9]:
# Mean Encoding
def mean_encoding(tr, feature_name):
    mean = df_train.groupby(feature_name)['Survived'].mean()
    tr.loc[:,feature_name] = tr[feature_name].map(mean)
    #print(tr[feature_name+'Mean'])

In [10]:
# Creates a ticket label variable.
def create_ticket_label(tr):
    tr['TicketLabel'] = tr['Ticket'].str.extract(r'([A-Za-z0-9/.]+) ')
    tr['TicketLabel'] = tr['TicketLabel'].str.replace("\.", "")
    tr['TicketLabel'] = tr['TicketLabel'].str.replace("/", "")
    tr['TicketLabel'] = tr['TicketLabel'].str.upper()
    tr['TicketLabel'].replace(['CASOTON','SCOW', 'AQ3', 'AQ4', 'SOP', 'STONOQ', 'STONO2', 'SCA3', 'A'],
                               ['CA', 'SC', 'AQ', 'AQ', 'SOPP', 'SOTONOQ', 'SOTONO2', 'SC', 'A4'],inplace=True)

create_ticket_label(df_train)
create_ticket_label(tr)
df_train.groupby('TicketLabel')['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,mean,size
TicketLabel,Unnamed: 1_level_1,Unnamed: 2_level_1
A4,0.0,7
A5,0.095238,21
AS,0.0,1
C,0.4,5
CA,0.333333,42
FA,0.0,1
FC,0.0,1
FCC,0.8,5
PC,0.65,60
PP,0.666667,3


In [11]:
# Creates a ticket label variable.
def create_ticket_a(tr):
    tr['TicketA'] = tr['TicketLabel'].str[:2]
create_ticket_a(tr)
create_ticket_a(df_train)
df_train.groupby('TicketA')['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,mean,size
TicketA,Unnamed: 1_level_1,Unnamed: 2_level_1
A4,0.0,7
A5,0.095238,21
AS,0.0,1
C,0.4,5
CA,0.333333,42
FA,0.0,1
FC,0.666667,6
PC,0.65,60
PP,0.6,5
SC,0.470588,17


In [12]:
# Refine the Cabin variable.
def create_cabin_a(tr):
    tr['CabinA'] = tr['Cabin'].str[:1]
    tr['CabinB'] = tr['Cabin'].str[:2]
create_cabin_a(tr)
create_cabin_a(df_train)
df_train.groupby(['CabinA','CabinB'])['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size
CabinA,CabinB,Unnamed: 2_level_1,Unnamed: 3_level_1
A,A1,0.25,4
A,A2,0.75,4
A,A3,0.5,4
A,A5,0.0,1
A,A6,1.0,1
A,A7,0.0,1
B,B1,0.6,5
B,B2,0.833333,6
B,B3,0.571429,7
B,B4,1.0,5


In [13]:
# Create a family number variable.
def create_family_size(tr):
    tr['FamilySize'] = tr['SibSp'] + tr['Parch'] + 1
    tr['IsAlone'] = tr['FamilySize'] == 1
create_family_size(tr)
create_family_size(df_train)
df_train.groupby('FamilySize')['Survived'].agg(["mean", "size"])
#df_train.groupby('IsAlone')['Survived'].agg(["mean", "size"])

Unnamed: 0_level_0,mean,size
FamilySize,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.303538,537
2,0.552795,161
3,0.578431,102
4,0.724138,29
5,0.2,15
6,0.136364,22
7,0.333333,12
8,0.0,6
11,0.0,7


In [25]:
df_train['Survived'].head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [27]:
# Create a feature for learning.
f = tr[['PassengerId','Pclass','Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 
        'FamilySize', 'IsAlone', 'NameTitle','CabinA', 'CabinB', 'AgeBand', 
        'Sex', 'TicketA', 'TicketLabel']]
f.index = f['PassengerId']
#te = ce.LeaveOneOutEncoder(cols=['AgeBand']).fit(df_train[['AgeBand']], df_train['Survived'])
#value = te.transform(f[['AgeBand']])
#print(value)
#f.loc[:,'isMail'] = f['Sex'] == 'mail'
enc = ce.TargetEncoder(cols=['Sex', 'IsAlone', 'AgeBand', 'Embarked', 'NameTitle', 
                                            'CabinA', 'CabinB', 'TicketA', 'TicketLabel'])
enc.fit(df_train.drop(['Survived'], axis=1), df_train['Survived'])
enc.transform(df_train.drop(['Survived'], axis=1))
#enc.transform(tr)
# mean_encoding(f, 'Sex')
# mean_encoding(f, 'IsAlone')
# mean_encoding(f, 'AgeBand')
# mean_encoding(f, 'Embarked')
# mean_encoding(f, 'NameTitle')
# mean_encoding(f, 'CabinA')
# mean_encoding(f, 'CabinB')
# mean_encoding(f, 'TicketA')
# mean_encoding(f, 'TicketLabel')
#f.fillna(0, inplace=True)
#f.drop(['Sex'], axis=1,inplace=True)

# f = pd.get_dummies(f, columns=['Embarked', 'NameTitle', 'CabinA'])
# f = dummy_to_pca(tr, 'TicketA', f)
# f = dummy_to_pca(tr, 'TicketLabel', f)
#f = dummy_to_pca(tr, 'AgeBand', f) 
# f = dummy_to_pca(tr, 'CabinB', f) 
#f.columns
#f

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameTitle,AgeBand,TicketLabel,TicketA,CabinA,CabinB,FamilySize,IsAlone
0,1,3,"Braund, Mr. Owen Harris",0.188908,22.0,1,0,A/5 21171,7.2500,,0.336957,0.156673,0.358586,0.095238,0.095238,0.383838,0.383838,2,0.505650
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.742038,38.0,1,0,PC 17599,71.2833,C85,0.553571,0.793651,0.369565,0.650000,0.650000,0.593220,0.333671,2,0.505650
2,3,3,"Heikkinen, Miss. Laina",0.742038,26.0,0,0,STON/O2. 3101282,7.9250,,0.336957,0.702703,0.358586,0.375008,0.181818,0.383838,0.383838,1,0.303538
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.742038,35.0,1,0,113803,53.1000,C123,0.336957,0.793651,0.423077,0.383838,0.383838,0.593220,0.588235,2,0.505650
4,5,3,"Allen, Mr. William Henry",0.188908,35.0,0,0,373450,8.0500,,0.336957,0.156673,0.423077,0.383838,0.383838,0.383838,0.383838,1,0.303538
5,6,3,"Moran, Mr. James",0.188908,,0,0,330877,8.4583,,0.389610,0.156673,0.383838,0.383838,0.383838,0.383838,0.383838,1,0.303538
6,7,1,"McCarthy, Mr. Timothy J",0.188908,54.0,0,0,17463,51.8625,E46,0.336957,0.156673,0.371429,0.383838,0.383838,0.750000,0.596112,1,0.303538
7,8,3,"Palsson, Master. Gosta Leonard",0.188908,2.0,3,1,349909,21.0750,,0.336957,0.575000,0.612903,0.383838,0.383838,0.383838,0.383838,5,0.505650
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0.742038,27.0,0,2,347742,11.1333,,0.336957,0.793651,0.358586,0.383838,0.383838,0.383838,0.383838,3,0.505650
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",0.742038,14.0,1,0,237736,30.0708,,0.553571,0.793651,0.415584,0.383838,0.383838,0.383838,0.383838,2,0.505650


In [63]:
# Save the result.
X_train = df_train[['PassengerId']]
X_train = pd.merge(X_train, f, how='left')
display(X_train.head())
y_train = df_train.Survived

X_test = df_test[['PassengerId']]
X_test = pd.merge(X_test, f, how='left')
display(X_test.head())

X_train.drop(['PassengerId'], axis=1, inplace=True)
X_test.drop(['PassengerId'], axis=1, inplace=True)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,NameTitle,CabinA,CabinB,AgeBand,Sex,TicketA,TicketLabel
0,1,3,22.0,1,0,7.25,0.336957,2,0.50565,0.156673,0.0,0.0,0.358586,0.188908,0.095238,0.095238
1,2,1,38.0,1,0,71.2833,0.553571,2,0.50565,0.793651,0.59322,0.333333,0.369565,0.742038,0.65,0.65
2,3,3,26.0,0,0,7.925,0.336957,1,0.303538,0.702703,0.0,0.0,0.358586,0.742038,0.181818,0.375
3,4,1,35.0,1,0,53.1,0.336957,2,0.50565,0.793651,0.59322,0.588235,0.423077,0.742038,0.0,0.0
4,5,3,35.0,0,0,8.05,0.336957,1,0.303538,0.156673,0.0,0.0,0.423077,0.188908,0.0,0.0


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,NameTitle,CabinA,CabinB,AgeBand,Sex,TicketA,TicketLabel
0,892,3,34.5,0,0,7.8292,0.38961,1,0.303538,0.156673,0.0,0.0,0.423077,0.188908,0.0,0.0
1,893,3,47.0,1,0,7.0,0.336957,2,0.50565,0.793651,0.0,0.0,0.415094,0.742038,0.0,0.0
2,894,2,62.0,0,0,9.6875,0.38961,1,0.303538,0.156673,0.0,0.0,0.371429,0.188908,0.0,0.0
3,895,3,27.0,0,0,8.6625,0.336957,1,0.303538,0.156673,0.0,0.0,0.358586,0.188908,0.0,0.0
4,896,3,22.0,1,1,12.2875,0.336957,3,0.50565,0.793651,0.0,0.0,0.358586,0.742038,0.0,0.0


In [64]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

#clf_kn = KNeighborsClassifier(3)
#clf_dt = DecisionTreeClassifier(max_depth=5)
#clf_svc = SVC(gamma=2, C=1)
clf_mlp = MLPClassifier()
#clf_gp = GaussianProcessClassifier(1.0 * RBF(1.0))
#clf_qd = QuadraticDiscriminantAnalysis()
# clf_ab = AdaBoostClassifier(n_estimators=400, learning_rate=0.03, algorithm="SAMME.R")    
# clf_gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5, max_depth=2)
# clf_lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
#clf_lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')

clf_xb = XGBClassifier()
clf_rf = RandomForestClassifier()
clf_dt = DecisionTreeClassifier()
clfs = [
    ('xgb', clf_xb),#0.8272
    ('rf', clf_rf), #0.8284
    ('mlp', clf_mlp)
]
clf_eb = VotingClassifier(estimators=clfs, voting='soft')
parameters = {
    'xgb__max_depth':[7], 'xgb__min_child_weight':[4], 'xgb__gamma':[0],
    'xgb__subsample':[0.85], 'xgb__colsample_bytree':[0.84],
    'xgb__reg_alpha':[0.00009], 'xgb__learning_rate':[0.01], 
    "rf__n_estimators":[34], "rf__max_depth":[24], "rf__min_samples_leaf":[1],
    'mlp__solver':['adam'], 'mlp__max_iter':[1000], 'mlp__early_stopping':[True], 
    'mlp__hidden_layer_sizes':[(128,64)],'mlp__activation':['logistic']
    #'dt__max_depth':[2,3,4], 'dt__min_samples_split':np.arange(0.0, 1.0, 0.1)
    #'xgb__subsample':[0.7], # 0.8339
    #'rf__max_depth':[1,2,3], 'rf__n_estimators':[35,40,45],
    #'dt__max_depth':[3,4,5],
    #'ab__n_estimators':[300,400,500], 'ab__learning_rate':[0.01,0.02,0.03], 
    #'ab__algorithm':['SAMME','SAMME.R'],
}
clf = GridSearchCV(clf_eb, parameters, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)

score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (score.mean(), score.std(), "eb"))

{'mlp__activation': 'logistic', 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (128, 64), 'mlp__max_iter': 1000, 'mlp__solver': 'adam', 'rf__max_depth': 24, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 34, 'xgb__colsample_bytree': 0.84, 'xgb__gamma': 0, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 7, 'xgb__min_child_weight': 4, 'xgb__reg_alpha': 9e-05, 'xgb__subsample': 0.85}
Accuracy: 0.8485 (+/- 0.0290) [eb]


In [65]:
pred = clf.fit(X_train, y_train).predict(X_test)
fname = 'titanic_submission3.csv'
submissions = pd.concat([df_test['PassengerId'], pd.Series(pred, name="Survived")] ,axis=1)
submissions.to_csv(fname, index=False)

submissions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
