In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the Titanic splits from the local datasets folder.
# The two reads are independent of each other, so their order does not matter.
train_set = pd.read_csv("./datasets/titanic/train.csv")
test_set = pd.read_csv("./datasets/titanic/test.csv")

In [3]:
# Preview the first rows of the training frame.
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Column dtypes and non-null counts; used to spot which columns have missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
from sklearn.base import TransformerMixin , BaseEstimator

class DataFrameSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that keeps only the requested columns of its input.

    ``attributes_names`` is whatever key ``X[...]`` accepts; in this notebook
    it is a list of DataFrame column names.
    """

    def __init__(self, attributes_names):
        self.attributes_names = attributes_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the stored column names (``y`` is ignored)."""
        selected = X[self.attributes_names]
        return selected
    

# Inspired by stackoverflow.com/questions/25239958
# Mixin is listed before BaseEstimator, matching DataFrameSelector and the
# inheritance order scikit-learn recommends (mixins left, BaseEstimator right).
class MostFrequentImputer(TransformerMixin, BaseEstimator):
    """Fill missing values in each column with that column's most frequent value.

    Both ``fit`` and ``transform`` expect a pandas DataFrame.
    """

    def fit(self, X, y=None):
        """Record the most frequent value of every column of ``X``.

        The values are stored in ``most_frequent_``, a Series indexed by the
        column names of ``X``. ``y`` is ignored. Returns ``self``.
        """
        # value_counts() orders by descending count and skips NaN by default,
        # so the first index entry is the most common non-missing value.
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries filled column-wise from ``most_frequent_``."""
        return X.fillna(self.most_frequent_)
        

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: pick the numeric columns, then fill gaps with the column mean.
numeric_steps = [
    ('num', DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ('imputer', SimpleImputer(strategy="mean")),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: pick the categorical columns, fill gaps with the most
# frequent value, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
categorical_steps = [
    ('cat', DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ('cat_imputer', MostFrequentImputer()),
    ('encoder', OneHotEncoder(sparse=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [7]:
from sklearn.pipeline import FeatureUnion
# Apply both branches to the same input and concatenate their outputs,
# numeric features first, then the one-hot encoded categorical ones.
pipeline_branches = [('num', num_pipeline), ('cat', cat_pipeline)]
full_pipeline = FeatureUnion(pipeline_branches)

In [8]:
# Fit the imputers and the encoder on the training frame and build its feature matrix.
train_set_prepared = full_pipeline.fit_transform(train_set)

In [9]:
# Only transform here: the pipeline was already fitted on the training set.
# Refitting on the test set would learn different imputation values and one-hot
# categories (leaking test statistics and risking a column mismatch with the
# matrix the models were trained on).
test_set_prepared = full_pipeline.transform(test_set)

In [10]:
# Target vector: the Survived column of the training frame.
y_train = train_set["Survived"]
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# from sklearn.ensemble import RandomForestClassifier

# forest_clf = RandomForestClassifier(n_estimators=200,random_state=42)
# forest_clf.fit(train_set_prepared,y_train)

In [12]:
# from xgboost import XGBClassifier

# xgbooss_clf = XGBClassifier()
# xgbooss_clf.fit(train_set_prepared,y_train)

In [13]:
def save_titanic_pred(y,test_set,path="_prediction1.csv"):
    """Write predictions to a CSV keyed by PassengerId and return them.

    Parameters
    ----------
    y : array-like
        Predicted labels, one per row of ``test_set`` and in the same order.
    test_set : pandas.DataFrame
        Frame the predictions were made for; only ``PassengerId`` is read.
    path : str, optional
        Destination CSV file. Defaults to ``"_prediction1.csv"``, the file name
        that used to be hard-coded. Pass a distinct name per model so one
        model's predictions do not overwrite another's.

    Returns
    -------
    pandas.DataFrame
        A single ``Survived`` column indexed by ``PassengerId``.
    """
    # Build the named index up front instead of renaming it after construction.
    passenger_ids = pd.Index(test_set["PassengerId"], name="PassengerId")
    prediction = pd.DataFrame(y, columns=["Survived"], index=passenger_ids)
    prediction.to_csv(path)
    return prediction

In [14]:
# y_test = forest_clf.predict(test_set_prepared)

In [15]:
# y_test_xgboost = xgbooss_clf.predict(test_set_prepared)

In [16]:
# save_titanic_pred(y_test,test_set)

In [17]:
# save_titanic_pred(y_test_xgboost,test_set)


# Voting Classifier

In [18]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.svm import SVC

# forest_clf_vtg=RandomForestClassifier(n_estimators=100, random_state=0,criterion='entropy')
# xgboost_clf_vtg=XGBClassifier()
# svc_clf = SVC(probability=True)

# voting_clf= VotingClassifier(
#     estimators=[('fc',forest_clf_vtg),('xg', xgboost_clf_vtg),('svc',svc_clf)],
#     voting='soft')
# voting_clf.fit(train_set_prepared,y_train)



In [19]:
# y_vtg_pred = voting_clf.predict(test_set_prepared) 
# save_titanic_pred(y_vtg_pred , test_set)

# Neural Network

In [20]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training matrix, then apply them to it.
# The fitted scaler is kept in `std` so the test matrix can be scaled the same way.
std = StandardScaler().fit(train_set_prepared)
train_set_prepared_std1 = std.transform(train_set_prepared)
train_set_prepared_std1

array([[-5.92480600e-01,  4.32793366e-01, -4.73673609e-01, ...,
        -4.82042680e-01, -3.07562343e-01,  6.15838425e-01],
       [ 6.38789012e-01,  4.32793366e-01, -4.73673609e-01, ...,
         2.07450510e+00, -3.07562343e-01, -1.62380254e+00],
       [-2.84663197e-01, -4.74545196e-01, -4.73673609e-01, ...,
        -4.82042680e-01, -3.07562343e-01,  6.15838425e-01],
       ...,
       [ 4.37434839e-15,  4.32793366e-01,  2.00893337e+00, ...,
        -4.82042680e-01, -3.07562343e-01,  6.15838425e-01],
       [-2.84663197e-01, -4.74545196e-01, -4.73673609e-01, ...,
         2.07450510e+00, -3.07562343e-01, -1.62380254e+00],
       [ 1.77062908e-01, -4.74545196e-01, -4.73673609e-01, ...,
        -4.82042680e-01,  3.25137334e+00, -1.62380254e+00]])

In [21]:
import tensorflow as tf

In [22]:
# Start from an empty sequential Keras model; layers are appended to it afterwards.
ann = tf.keras.models.Sequential()

In [23]:
# Hidden stack: four fully connected ReLU layers, halving in width each time.
for hidden_units in (128, 64, 32, 16):
    ann.add(tf.keras.layers.Dense(units=hidden_units, activation="relu"))

# Output layer: a single sigmoid unit, i.e. one value in (0, 1) per passenger.
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [24]:
# Binary-classification setup: cross-entropy loss on the sigmoid output,
# Adam optimizer, accuracy reported during training.
ann.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [25]:
# Train on the standardized training matrix for 200 epochs in mini-batches of 32.
# NOTE(review): no validation data is passed, so over-fitting cannot be seen in the log.
ann.fit(
    x=train_set_prepared_std1,
    y=y_train,
    epochs=200,
    batch_size=32,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x18ed3db0fa0>

In [28]:
def boolstr_to_floatstr(v):
    """Map the strings 'True' / 'False' to '1' / '0'; return anything else unchanged."""
    # Compared in the same order as before: 'True' first, then 'False'.
    for literal, digit in (('True', '1'), ('False', '0')):
        if v == literal:
            return digit
    return v

In [30]:
# Scale the test matrix with the scaler fitted on the training data,
# then run the network on the scaled matrix.
test_set_prepared_std = std.transform(test_set_prepared)
ann_predict1 = ann.predict(test_set_prepared_std)

In [31]:
# Threshold the network outputs at 0.5 to get a boolean survived / not-survived array.
ann_predict1 = (ann_predict1 > 0.5)
# The thresholded array holds booleans, never the strings 'True' / 'False', so
# mapping boolstr_to_floatstr over it returned every element unchanged.
# Casting the booleans straight to int gives the same 0/1 labels.
new_data = ann_predict1.astype(int)

In [32]:
# Display the 0/1 predictions.
new_data

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
    

In [33]:
# Write the network's predictions to CSV; the returned frame is displayed as the cell output.
save_titanic_pred(new_data,test_set)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0
