In [1]:
import pandas as pd
import seaborn as sb
import numpy as np
import os
import sys
import warnings 
# Install a blanket "ignore" filter for warnings. Filters registered later
# (e.g. by libraries at import time) can still take precedence over it.
# NOTE(review): this also hides deprecation/data-conversion warnings — confirm intended.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why cells with two bare expressions show two outputs).
InteractiveShell.ast_node_interactivity = "all"
import logging
# Root logger at INFO, formatted as "timestamp:LEVEL:message".
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)

In [2]:
# Working directory of the kernel. The data paths built in the following cell are
# derived from its parent, so results depend on where the notebook is launched.
cwd=os.getcwd()
# Bare expression: echoes the path as the cell output.
cwd

'C:\\Users\\perma\\Documents\\GitHub\\Kaggle_Titanic\\code'

In [3]:
# Raw CSVs live in the "data" folder that sits next to the working directory.
# Every path component is passed to os.path.join separately so the platform's
# separator is used throughout (no hard-coded "/" embedded in a component).
data_dir=os.path.join(os.path.dirname(cwd),"data")
train_path=os.path.join(data_dir,"train_raw.csv")
test_path=os.path.join(data_dir,"test_raw.csv")

In [4]:
# Read both raw splits (first row is the header) and keep them addressable by
# name so later cells can apply the same transformation to each.
train,test=(pd.read_csv(csv_path,header=0) for csv_path in (train_path,test_path))
train_test={"train":train,"test":test}
# Preview the first three rows of each split.
train.head(3)
test.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [5]:
# Family_Size = SibSp + Parch + 1, added in place to both splits
# (the +1 presumably counts the passenger themself).
for key,data in train_test.items():
    data["Family_Size"]=data["SibSp"].add(data["Parch"]).add(1)

In [6]:
# Stack train on top of test with a fresh 0..n-1 index; train rows come first.
train_and_test=pd.concat(objs=(train,test),axis=0,ignore_index=True)

In [7]:
# Mean of Survived per Ticket, computed on the training split only.
# Result: one-column frame ("Survived") indexed by Ticket.
family_survival=train.groupby("Ticket")[["Survived"]].mean()

In [8]:
# Label each passenger by the mean training survival of their ticket:
#   "high"   - Family_Size > 1, ticket seen in train, mean survival > 0.5
#   "low"    - Family_Size > 1, ticket seen in train, mean survival <= 0.5
#   "median" - everyone else (travelling alone, or ticket absent from train)
# Vectorized replacement for a per-row loop; the resulting labels are identical.
# NOTE(review): for training rows the ticket mean includes the passenger's own
# Survived value, which may leak the target — confirm this is intended.
ticket_rate=train_and_test["Ticket"].map(family_survival["Survived"])
# notna() is the vectorized form of "ticket in family_survival.index".
has_ticket_rate=(train_and_test["Family_Size"]>1)&ticket_rate.notna()
train_and_test["Family_Survival"]="median"
train_and_test.loc[has_ticket_rate&(ticket_rate>0.5),"Family_Survival"]="high"
train_and_test.loc[has_ticket_rate&(ticket_rate<=0.5),"Family_Survival"]="low"

In [9]:
# train rows are the first len(train) rows of the concatenated frame; slice by
# position instead of hard-coding the row count.
train["Family_Survival"]=train_and_test["Family_Survival"].values[:len(train)]

In [10]:
# test rows follow the len(train) training rows in the concatenated frame; slice
# by position instead of hard-coding the offset.
test["Family_Survival"]=train_and_test["Family_Survival"].values[len(train):]

In [11]:
def get_dummies(df,*features):
    """One-hot encode the named columns of ``df``.

    For each name in ``features`` (processed in order) the indicator columns
    from ``pd.get_dummies`` - prefixed with the column name - are joined to
    the frame and the source column is dropped. Progress is logged at INFO.

    Returns a new DataFrame; the frame passed in is not modified. With no
    feature names the input frame itself is returned.
    """
    logging.info("---------------get_dummies-------------------")
    for name in features:
        logging.info("{}:start".format(name))
        encoded=pd.get_dummies(df[name],prefix=name)
        # join() builds a new frame, so the caller's frame stays untouched.
        df=df.join(encoded).drop(columns=name)
        logging.info("{}:done".format(name))
    return df
        

In [12]:
# Swap each split stored in train_test for its one-hot encoded version.
# The module-level train/test names keep pointing at the un-encoded frames.
feature_to_dummies=["Sex","Embarked","Family_Survival"]
for key in train_test:
    data=train_test[key]=get_dummies(train_test[key],*feature_to_dummies)

2018-11-18 11:57:17,899:INFO:---------------get_dummies-------------------
2018-11-18 11:57:17,904:INFO:Sex:start
2018-11-18 11:57:17,909:INFO:Sex:done
2018-11-18 11:57:17,910:INFO:Embarked:start
2018-11-18 11:57:17,915:INFO:Embarked:done
2018-11-18 11:57:17,915:INFO:Family_Survival:start
2018-11-18 11:57:17,921:INFO:Family_Survival:done
2018-11-18 11:57:17,921:INFO:---------------get_dummies-------------------
2018-11-18 11:57:17,922:INFO:Sex:start
2018-11-18 11:57:17,926:INFO:Sex:done
2018-11-18 11:57:17,927:INFO:Embarked:start
2018-11-18 11:57:17,932:INFO:Embarked:done
2018-11-18 11:57:17,933:INFO:Family_Survival:start
2018-11-18 11:57:17,938:INFO:Family_Survival:done


In [13]:
# List the training columns after one-hot encoding (the model features are
# picked from these in the next cell).
train_test["train"].columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Family_Size', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Family_Survival_high',
       'Family_Survival_low', 'Family_Survival_median'],
      dtype='object')

In [14]:
# Model inputs: the numeric columns plus the one-hot indicator columns.
features=[
    'Pclass','Age','SibSp','Parch','Fare','Family_Size',
    'Sex_female','Sex_male',
    'Embarked_C','Embarked_Q','Embarked_S',
    'Family_Survival_high','Family_Survival_low','Family_Survival_median',
]
target=["Survived"]
X_train=train_test["train"][features].values
# Selecting with a list yields an (n, 1) column; flatten it to the 1-D label
# vector that scikit-learn estimators expect for single-output classification.
y=train_test["train"][target].values.ravel()

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator,ClassifierMixin,TransformerMixin,clone
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

In [16]:
# Baseline: mean 5-fold CV accuracy of a single decision tree.
# The tree is seeded so the reported score is reproducible across runs.
dt=DecisionTreeClassifier(random_state=86)
scores=cross_val_score(dt,X=X_train,y=y,cv=5)
np.mean(scores)

0.8057973882424454

In [17]:
class StackingModel(BaseEstimator,ClassifierMixin,TransformerMixin):
    """Stacked ensemble whose meta-model learns from out-of-fold base predictions.

    Parameters
    ----------
    basemodels : list of estimators
        Unfitted estimators; each is cloned and fitted once per fold.
    metamodel : estimator
        Cloned and fitted on the out-of-fold predictions of the base models.
    cv : int
        Number of shuffled KFold splits.
    random_state : int, default=86
        Seed of the KFold shuffle (the default is the previously hard-coded value).

    Attributes set by ``fit``
    -------------------------
    basemodels_ : list of lists
        ``basemodels_[i]`` holds the per-fold fitted clones of ``basemodels[i]``.
    metamodel_ : estimator
        Fitted clone of ``metamodel``.
    classes_ : ndarray
        Sorted unique labels seen in ``y``.

    The fold-vote in ``predict`` treats any prediction > 0 as the positive
    class and emits 0/1, so binary 0/1 labels are assumed.
    """
    def __init__(self,basemodels,metamodel,cv,random_state=86):
        self.basemodels=basemodels
        self.metamodel=metamodel
        self.cv=cv
        self.random_state=random_state
    def fit(self,X,y):
        """Fit per-fold clones of every base model, then the meta-model.

        ``X`` must support positional row indexing (e.g. an ndarray). ``y`` may
        be 1-D or an (n, 1) column; it is flattened before use. Returns ``self``.
        """
        # Flatten so base/meta models always receive the 1-D labels they expect.
        y=np.asarray(y).ravel()
        self.classes_=np.unique(y)
        self.basemodels_=[[] for _ in self.basemodels]
        out_of_fold=np.zeros((X.shape[0],len(self.basemodels)))
        kf=KFold(n_splits=self.cv,shuffle=True,random_state=self.random_state)
        for i,model in enumerate(self.basemodels):
            for train_index,test_index in kf.split(X,y):
                clone_=clone(model)
                clone_.fit(X[train_index],y[train_index])
                # Each row is predicted only by the clone that never saw it.
                out_of_fold[test_index,i]=clone_.predict(X[test_index])
                self.basemodels_[i].append(clone_)
        self.metamodel_=clone(self.metamodel)
        self.metamodel_.fit(out_of_fold,y)
        return self
    def __vote(self,votes_list):
        """Row-wise majority vote over a 2-D array of per-fold predictions.

        Every entry > 0 counts +1, anything else -1; a row yields 1 when the
        total is strictly positive and 0 otherwise (ties go to 0). Returns a
        list of ints, one per row.
        """
        signed=np.where(np.asarray(votes_list)>0,1,-1)
        return (signed.sum(axis=1)>0).astype(int).tolist()
    def predict(self,X):
        """Predict labels: vote the fold clones of each base model, then apply the meta-model."""
        meta_features=np.zeros((X.shape[0],len(self.basemodels_)))
        for i,fold_models in enumerate(self.basemodels_):
            prediction=np.zeros((X.shape[0],len(fold_models)))
            for j,model in enumerate(fold_models):
                prediction[:,j]=model.predict(X)
            meta_features[:,i]=self.__vote(prediction)
        return self.metamodel_.predict(meta_features)

In [18]:
# Mean 5-fold CV accuracy of the stacked ensemble (forest + gradient boosting +
# xgboost as base models, a decision tree as meta-model).
# The scikit-learn estimators are seeded so the score is reproducible.
# NOTE(review): xgb is left at its default seed — confirm it is deterministic
# for the installed xgboost version.
rf=RandomForestClassifier(n_estimators=100,random_state=86)
bg=GradientBoostingClassifier(n_estimators=100,random_state=86)
xg=xgb.XGBClassifier(n_estimators=100)
tree=DecisionTreeClassifier(random_state=86)
stacking=StackingModel([rf,bg,xg],tree,cv=5)
scores=cross_val_score(estimator=stacking,X=X_train,y=y,scoring='accuracy',cv=5)
np.mean(scores)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.8686883441905364

In [19]:
# Mean 10-fold CV score of a voting ensemble over the same three base models
# (VotingClassifier is left at its default voting mode).
# The scikit-learn estimators are seeded so the score is reproducible.
# NOTE(review): xgb is left at its default seed — confirm it is deterministic
# for the installed xgboost version.
rf=RandomForestClassifier(n_estimators=100,random_state=86)
bg=GradientBoostingClassifier(n_estimators=100,random_state=86)
xg=xgb.XGBClassifier(n_estimators=100)
vclf=VotingClassifier([('rf',rf),("bg",bg),("xg",xg)])
scores=cross_val_score(vclf,X=X_train,y=y,cv=10)
np.mean(scores)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.869784076722279

In [21]:
# Class balance of the training labels: count of survivors, then non-survivors.
# DataFrame.sum() reduces per column explicitly, instead of relying on how
# np.sum() forwards axis=None to pandas.
(train[target]==1).sum()
(train[target]==0).sum()

Survived    342
dtype: int64

Survived    549
dtype: int64

In [26]:
# With a single argument the bound is exclusive, so this draws from [0, 1) and
# always yields 0.
# NOTE(review): looks like leftover scratch — confirm before keeping.
np.random.randint(1)

0