In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic training and test splits from the local datasets folder.
train_set = pd.read_csv("./datasets/titanic/train.csv")
test_set = pd.read_csv("./datasets/titanic/test.csv")

In [3]:
# Preview the first rows of the training data.
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Show column dtypes and non-null counts to spot columns with missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
from sklearn.base import TransformerMixin , BaseEstimator

class DataFrameSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that keeps only a chosen set of DataFrame columns.

    Parameters
    ----------
    attributes_names : list of str
        Column labels extracted from the input in ``transform``.
    """

    def __init__(self, attributes_names):
        self.attributes_names = attributes_names

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column labels."""
        selected = X[self.attributes_names]
        return selected
    

# Inspired by stackoverflow.com/questions/25239958
class MostFrequentImputer(TransformerMixin, BaseEstimator):
    """Fill missing values in each DataFrame column with that column's most frequent value.

    Mixins are listed before ``BaseEstimator`` (the order scikit-learn
    recommends), matching ``DataFrameSelector``.

    Attributes
    ----------
    most_frequent_ : pandas.Series
        Set by ``fit``. Indexed by column label; holds the most frequent
        non-missing value of each column, or NaN for a column that has no
        observed values at all.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-missing value of every column of ``X``."""
        modes = []
        for column in X:
            # value_counts() drops NaN and sorts by descending frequency.
            counts = X[column].value_counts()
            # An all-missing column has no mode; store NaN so transform()
            # leaves it untouched instead of fit() raising IndexError.
            modes.append(counts.index[0] if len(counts) else np.nan)
        self.most_frequent_ = pd.Series(modes, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled column by column."""
        return X.fillna(self.most_frequent_)
        

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric features: select the columns, then replace missing values with the column mean.
num_pipeline = Pipeline([
    ('num', DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ('imputer', SimpleImputer(strategy="mean")),
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so the notebook runs on both older and current releases.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)

# Categorical features: select the columns, fill gaps with the most frequent
# value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('cat', DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ('cat_imputer', MostFrequentImputer()),
    ('encoder', dense_encoder),
])

In [7]:
from sklearn.pipeline import FeatureUnion
# Combine the outputs of the numeric and categorical pipelines into one feature matrix.
full_pipeline = FeatureUnion([
    ('num', num_pipeline),
    ('cat', cat_pipeline),
])

In [8]:
# Fit the preprocessing pipeline on the training data and transform it in one step.
train_set_prepared = full_pipeline.fit_transform(train_set)

In [9]:
# Only transform the test set: reuse the imputation values and one-hot
# categories learned from the training data. Re-fitting here would leak test
# information and could produce a column layout that differs from the one the
# models were trained on.
test_set_prepared = full_pipeline.transform(test_set)

In [10]:
# Target vector: the "Survived" column of the training data.
y_train = train_set["Survived"]
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest (200 estimators, fixed seed) on the prepared training features.
forest_clf = RandomForestClassifier(n_estimators=200,random_state=42)
forest_clf.fit(train_set_prepared,y_train)

RandomForestClassifier(n_estimators=200, random_state=42)

In [12]:
from xgboost import XGBClassifier

# Train an XGBoost classifier with its default hyperparameters on the same features.
xgbooss_clf = XGBClassifier()
xgbooss_clf.fit(train_set_prepared,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
def save_titanic_pred(y,test_set,filename="_prediction1.csv"):
    """Build a submission table from predictions and write it to a CSV file.

    Parameters
    ----------
    y : array-like
        Predicted ``Survived`` labels, one per row of ``test_set``.
    test_set : pandas.DataFrame
        Test data; its ``PassengerId`` column becomes the index of the result.
    filename : str, default "_prediction1.csv"
        Destination CSV path. Pass a distinct name per model so that
        successive calls do not overwrite each other's output.

    Returns
    -------
    pandas.DataFrame
        One ``Survived`` column indexed by ``PassengerId``.
    """
    # Build the named index directly instead of renaming it afterwards.
    passenger_ids = pd.Index(test_set["PassengerId"], name="PassengerId")
    prediction = pd.DataFrame(y, columns=["Survived"], index=passenger_ids)
    prediction.to_csv(filename)
    return prediction

In [14]:
# Predict survival on the prepared test features with the random forest.
y_test = forest_clf.predict(test_set_prepared)

In [15]:
# Predict survival on the prepared test features with the XGBoost model.
y_test_xgboost = xgbooss_clf.predict(test_set_prepared)

In [16]:
# Save the random-forest predictions. This call writes to "_prediction1.csv",
# the same file every such call uses, so a later save overwrites it.
save_titanic_pred(y_test,test_set)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,1
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [17]:
# Save the XGBoost predictions. This writes to "_prediction1.csv" again and
# therefore replaces the file written for the random forest.
save_titanic_pred(y_test_xgboost,test_set)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0



# Voting Classifier

In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

# Soft-voting ensemble of a random forest, an XGBoost model and an SVM.
forest_clf_vtg=RandomForestClassifier(n_estimators=100, random_state=0,criterion='entropy')
xgboost_clf_vtg=XGBClassifier()
# probability=True is required so the SVC exposes predict_proba for soft voting.
# Its probability estimates involve random shuffling, so seed it like the
# forest to keep the ensemble reproducible across runs.
svc_clf = SVC(probability=True, random_state=0)

voting_clf= VotingClassifier(
    estimators=[('fc',forest_clf_vtg),('xg', xgboost_clf_vtg),('svc',svc_clf)],
    voting='soft')
voting_clf.fit(train_set_prepared,y_train)



VotingClassifier(estimators=[('fc',
                              RandomForestClassifier(criterion='entropy',
                                                     random_state=0)),
                             ('xg',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                           

In [22]:
# Predict with the voting ensemble and save the result.
# NOTE(review): the AttributeError recorded below ("probability=False") conflicts
# with the SVC(probability=True) used to build voting_clf; presumably the output
# comes from a stale kernel state (execution counts jump from 20 to 22).
# Re-run the notebook top to bottom to confirm.
y_vtg_pred = voting_clf.predict(test_set_prepared) 
save_titanic_pred(y_vtg_pred , test_set)

AttributeError: predict_proba is not available when  probability=False