In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style="ticks")
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV,cross_validate
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, StandardScaler, MinMaxScaler, scale,PowerTransformer,RobustScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier,StackingClassifier)
import re
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from pyod.models.knn import KNN

from xgboost import XGBClassifier

In [None]:
train_df=pd.read_csv("titanic_train.csv",index_col="PassengerId")
test_df=pd.read_csv("titanic_test.csv",index_col="PassengerId")
all_df=pd.concat([train_df,test_df])

In [382]:
#Insert a Deck column indicating the deck of the passenger, we take this info from Cabin column
all_df['Cabin'] = all_df['Cabin'].fillna("U0")
all_df['Deck'] = all_df['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
#Insert a Family column, indicating the family group size for each passenger, we will do it by adding the "SibSp" and "Parch" columns
all_df["Family"]=all_df["SibSp"]+all_df["Parch"]
#According to an investigation on the internet, both passenger with missing "embarked" boarded in Southhampton
all_df["Embarked"]=all_df["Embarked"].fillna("S")

In [383]:
#Estimate the age of the missing passengers by grouping by Pclass,embarked and sex and taking the median value
all_df["Age"]=all_df["Age"].fillna(all_df.groupby(["Pclass","Embarked","Sex"])["Age"].transform(lambda x:x.median()))
#Estimate the fare of the missing passengers by grouping by Pclass,embarked and sex and taking the median value
all_df["Fare"]=all_df["Fare"].fillna(all_df.groupby(["Pclass","Embarked","Sex"])["Fare"].transform(lambda x:x.median()))

In [384]:
#From the name of the passenger, we extract the title
all_df['Title'] = all_df['Name'].map(lambda x: re.compile("(\w+\.)").search(x).group())

In [385]:
#Replace some titles to decrease the variation of the samples
title_sub=['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Rev.', 'Dr.', 'Mr.', 'Mrs.',
       'Mr.', 'Mr.', 'Miss.', 'Mr.', 'Mrs.', 'Rare', 'Rare',
       'Rare', 'Rare', 'Rare']

dictionary=dict(zip(all_df["Title"].value_counts().index,title_sub))

all_df["Title"]=all_df["Title"].map(dictionary)

In [386]:
#Drop the columns that wont be use: "NAME","SIBSP","PARCH","CABIN"
all_df=all_df.drop(columns=["Name","SibSp","Parch","Cabin"])

In [387]:
#Divide the data in X and Y
all_X=all_df.drop(columns="Survived")
all_y=all_df["Survived"]

In [388]:
all_X

Unnamed: 0_level_0,Pclass,Sex,Age,Ticket,Fare,Embarked,Deck,Family,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,male,22.00,A/5 21171,7.2500,S,U,1,Mr.
2,1,female,38.00,PC 17599,71.2833,C,C,1,Mrs.
3,3,female,26.00,STON/O2. 3101282,7.9250,S,U,0,Miss.
4,1,female,35.00,113803,53.1000,S,C,1,Mrs.
5,3,male,35.00,373450,8.0500,S,U,0,Mr.
...,...,...,...,...,...,...,...,...,...
1305,3,male,25.00,A.5. 3236,8.0500,S,U,0,Mr.
1306,1,female,39.00,PC 17758,108.9000,C,C,0,Rare
1307,3,male,38.50,SOTON/O.Q. 3101262,7.2500,S,U,0,Mr.
1308,3,male,25.00,359309,8.0500,S,U,0,Mr.


In [389]:
def divide_cat_num_data(X):
    cat_data=[]
    num_data=[]

    for i,cat in enumerate (X.dtypes):
        if cat==object:
            cat_data.append(X.iloc[:,i])
        else:
            num_data.append(X.iloc[:,i])

    cat_data=pd.DataFrame(cat_data).transpose()
    num_data=pd.DataFrame(num_data).transpose()
    return cat_data,num_data

cat_data,num_data=divide_cat_num_data(all_X)

In [390]:
num_data

Unnamed: 0_level_0,Pclass,Age,Fare,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.0,22.00,7.2500,1.0
2,1.0,38.00,71.2833,1.0
3,3.0,26.00,7.9250,0.0
4,1.0,35.00,53.1000,1.0
5,3.0,35.00,8.0500,0.0
...,...,...,...,...
1305,3.0,25.00,8.0500,0.0
1306,1.0,39.00,108.9000,0.0
1307,3.0,38.50,7.2500,0.0
1308,3.0,25.00,8.0500,0.0


#Box Cox Transformation of (highly) skewed features

from scipy.stats import skew,boxcox
from scipy.special import boxcox1p

cats=[]

for i in num_data:
    skewness=skew(num_data[i])
    
    if abs(skewness)>0.75:
        num_data[i],lam = boxcox(num_data[i]+1)
        cats.append(i)
        
print("There were {} features skewed that were corrected, these features are:".format(len(cats)))
print(cats)

In [391]:
#scalerx=PowerTransformer()

#num_data_s=pd.DataFrame(scalerx.fit_transform(num_data),columns=num_data.columns)

In [392]:
cat_data.drop(columns="Ticket",inplace=True)

cat_data_num=pd.get_dummies(cat_data)

In [393]:
cat_data_num

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Deck_G,Deck_T,Deck_U,Title_Dr.,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Rare,Title_Rev.
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1306,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1307,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1308,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [394]:
all_X_final=pd.merge(cat_data_num,num_data,on=None,how="inner", left_index=True, right_index=True)


In [395]:
all_X_final

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Rare,Title_Rev.,Pclass,Age,Fare,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,3.0,22.00,7.2500,1.0
2,1,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1.0,38.00,71.2833,1.0
3,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,3.0,26.00,7.9250,0.0
4,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1.0,35.00,53.1000,1.0
5,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,3.0,35.00,8.0500,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,3.0,25.00,8.0500,0.0
1306,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1.0,39.00,108.9000,0.0
1307,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,3.0,38.50,7.2500,0.0
1308,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,3.0,25.00,8.0500,0.0


In [396]:
X_train=all_X_final.iloc[0:train_df.shape[0],:]
X_test=all_X_final.iloc[-test_df.shape[0]:,:]
y_train=all_y.iloc[0:train_df.shape[0]].astype(int)

In [397]:
# Create 4 objects that represent our models
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
ada = AdaBoostClassifier()
gb = GradientBoostingClassifier()

In [398]:
estimators=[rf,et,ada,gb,svc]

In [None]:
#lets tune the random forest classificator

params_rfc={"n_estimators":np.arange(20,300,10),
            "criterion":["gini","entropy"],
           "min_samples_split":np.arange(2,6),
           "min_samples_leaf":np.arange(1,5)
           }

rf=RandomizedSearchCV(rf,param_distributions=params_rfc,n_jobs=-1,
                      scoring="accuracy")
rf.fit(X_train,y_train)

print(rf.best_estimator_)
print(rf.best_score_)

In [None]:
#lets tune the extra trees classifier
params_et={"n_estimators":np.arange(20,300,10),
            "criterion":["gini","entropy"],
           "min_samples_split":np.arange(2,6),
           "min_samples_leaf":np.arange(1,5)
           }

et=RandomizedSearchCV(et,param_distributions=params_et,n_jobs=-1,
                      scoring="accuracy")
et.fit(X_train,y_train)

print(et.best_estimator_)
print(et.best_score_)

In [None]:
#lets tune the ADA classifier

params_ada={"n_estimators":np.arange(5,120,8),
           "learning_rate":np.arange(0.05,1.05,0.05),
           }

ada=RandomizedSearchCV(ada,param_distributions=params_ada,n_jobs=-1,
                      scoring="accuracy")
et.fit(X_train,y_train)

print(ada.best_estimator_)
print(ada.best_score_)

In [None]:
#lets tune the GB classifier
params_gb={"learning_rate":np.arange(0.01,0.5,0.02),
        "n_estimators":np.arange(50,300,20),
           "min_samples_split":np.arange(2,6),
           "min_samples_leaf":np.arange(1,4)
           }


gb=RandomizedSearchCV(gb,param_distributions=params_gb,n_jobs=-1,
                      scoring="accuracy")
gb.fit(X_train,y_train)


print(gb.best_estimator_)
print(gb.best_score_)

In [399]:
for i in estimators:
    scores=cross_validate(i,X_train,y_train,scoring="accuracy")
    print("The accuracy of {} is {}".format(i,scores["test_score"].mean()))

The accuracy of RandomForestClassifier() is 0.8080911430544221
The accuracy of ExtraTreesClassifier() is 0.7923733601154981
The accuracy of AdaBoostClassifier() is 0.8181972255351202
The accuracy of GradientBoostingClassifier() is 0.8406440273680247
The accuracy of SVC() is 0.6746155294708431


In [400]:
stack=StackingClassifier(estimators=[("rfc",rf),
                                    ("etc",et),
                                    ("ada",ada),
                                    ("gbc",gb)],
                        final_estimator=XGBClassifier())

In [401]:
stack.fit(X_train,y_train)

StackingClassifier(estimators=[('rfc', RandomForestClassifier()),
                               ('etc', ExtraTreesClassifier()),
                               ('ada', AdaBoostClassifier()),
                               ('gbc', GradientBoostingClassifier())],
                   final_estimator=XGBClassifier())

In [402]:
y_pred=stack.predict(X_test)

In [403]:
scores=cross_validate(stack,X_train,y_train,scoring="accuracy")

In [404]:
scores["test_score"].mean()

0.822666499278137

In [405]:
prediction=pd.DataFrame()
prediction['PassengerId'] = X_test.index.values
prediction['Survived'] = y_pred
prediction.to_csv('prediction.csv',index=False)