In [1]:
#Imports
#!pip install scikit-learn==0.21.2
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Show every column and row when a frame is displayed. The fully-qualified
# option names are used because abbreviated keys are pattern-matched, and the
# bare 'max_columns' pattern is ambiguous (OptionError) on newer pandas.
pd.set_option('display.max_columns', None, 'display.max_rows', None)
# Seed both NumPy and TensorFlow with the same fixed value.
from numpy.random import seed
seed(1002)
tf.random.set_seed(1002)

Using TensorFlow backend.


In [2]:
# Global configuration shared by the cells below.
train_path = "../input/titanic/train.csv"
test_path = "../input/titanic/test.csv"

# Rare honorifics, listed under the coarser title each one is folded into.
_titles_by_group = {
    'Other': ['Col', 'Major', 'Capt', 'Dr', 'Rev'],
    'Miss': ['Ms', 'Mlle'],
    'Mrs': ['Mme'],
    'Royal': ['Sir', 'Jonkheer', 'Countess', 'Lady', 'Dona', 'Don'],
}
# Flattened lookup: raw title -> grouped title.
mapping = {raw_title: group
           for group, raw_titles in _titles_by_group.items()
           for raw_title in raw_titles}

# Numeric columns that prepare_data standardises.
continuous = ['Age', 'Fare', 'Parch', 'SibSp', 'Family_Size', "Family_Survival"]

In [3]:
def _propagate_group_survival(df, group_keys, only_unresolved=False):
    """Update ``Family_Survival`` in place from the outcomes of fellow group members.

    Rows of ``df`` are grouped by ``group_keys``. For every row in a group with
    more than one member, the ``Survived`` values of the *other* members are
    inspected: if any of them is 1 the row gets ``Family_Survival = 1``,
    otherwise if any of them is 0 it gets ``Family_Survival = 0``. Rows whose
    companions all have a missing ``Survived`` are left untouched.

    ``df`` must have a unique index: ``drop(ind)`` and ``df.loc[ind, ...]``
    below address exactly one passenger only under that condition.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Survived`` and ``Family_Survival`` plus the
        grouping columns. Modified in place.
    group_keys : str or list of str
        Column(s) that define a group.
    only_unresolved : bool, default False
        When True, only rows whose current ``Family_Survival`` is 0 or 0.5 are
        reconsidered.
    """
    for _, members in df.groupby(group_keys):
        if len(members) == 1:
            continue
        for ind, row in members.iterrows():
            if only_unresolved and row['Family_Survival'] not in (0, 0.5):
                continue
            # max()/min() skip NaN, so only companions with a known outcome count.
            others = members.drop(ind)['Survived']
            if others.max() == 1.0:
                df.loc[ind, 'Family_Survival'] = 1
            elif others.min() == 0.0:
                df.loc[ind, 'Family_Survival'] = 0


def prepare_data(train_path,test_path):
    """Load the two CSV files and build the model-ready feature frames.

    Train and test rows are concatenated so that imputation, the
    ``Family_Survival`` feature, one-hot encoding and standardisation are all
    computed over the combined data, then split again on whether ``Survived``
    is present. Relies on the module-level ``mapping`` (title grouping) and
    ``continuous`` (columns to standardise).

    Parameters
    ----------
    train_path : str
        Path of the CSV holding the labelled rows.
    test_path : str
        Path of the CSV holding the rows to predict.

    Returns
    -------
    x_train : pandas.DataFrame
        Features of the rows whose ``Survived`` is not null.
    y_train : pandas.Series
        ``Survived`` of those rows, cast to int.
    x_test : pandas.DataFrame
        Features of the rows whose ``Survived`` is null, indexed from 0.
    submit : pandas.DataFrame
        ``PassengerId`` and (empty) ``Survived`` of the test rows, indexed
        from 0; an independent frame that is safe to assign into.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # ignore_index gives every passenger a unique row label. Without it the
    # two files' labels overlap, and dropping "this row" from a group could
    # drop a second, unrelated passenger as well.
    df = pd.concat([train, test], axis = 0, sort = False, ignore_index = True)

    # Title: the word directly before the first '.' in the name, then grouped.
    df["Title"] = df["Name"].str.extract(r"([a-zA-Z]+)\.", expand = False)
    df["Title"] = df["Title"].replace(mapping)

    # Missing ages take the median age of passengers sharing the same title.
    # Plain assignment is used instead of a chained inplace fillna, which
    # does not write back to the frame under pandas Copy-on-Write.
    title_median_age = df.groupby("Title")["Age"].transform("median")
    df["Age"] = df["Age"].fillna(title_median_age)

    is_unlabelled = df["Survived"].isna()
    submit = df.loc[is_unlabelled, ["PassengerId", "Survived"]].reset_index(drop = True)

    # Missing fares take the median fare of third-class passengers.
    df["Fare"] = df["Fare"].fillna(df.loc[df["Pclass"] == 3, "Fare"].median())

    df['Family_Size'] = df['Parch'] + df['SibSp'] + 1
    df['FsizeD'] = 'Alone'
    df.loc[df['Family_Size'] > 1, 'FsizeD'] = 'Small'
    df.loc[df['Family_Size'] > 4, 'FsizeD'] = 'Big'

    # Family Survival (https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83)
    df['Last_Name'] = df['Name'].apply(lambda x: str.split(x, ",")[0])
    DEFAULT_SURVIVAL_VALUE = 0.5
    df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
    # First pass: same surname and same fare are treated as a family.
    _propagate_group_survival(df, ['Last_Name', 'Fare'])
    # Second pass: shared ticket, only for rows not already marked as 1.
    _propagate_group_survival(df, 'Ticket', only_unresolved = True)

    df['Embarked'] = df['Embarked'].bfill()
    df['Sex'] = df['Sex'].astype('category').cat.codes
    df = df.drop(columns = ['Cabin', 'Name', 'Ticket', 'PassengerId', 'Last_Name'])
    # dtype is pinned so the dummies stay numeric (newer pandas defaults to bool).
    df = pd.get_dummies(df, columns = ["Embarked", "Pclass", "Title", "FsizeD"], dtype = np.uint8)

    # Each continuous column is standardised on its own, over train + test rows.
    scaler = StandardScaler()
    for var in continuous:
        column = df[var].astype('float64').values.reshape(-1, 1)
        df[var] = scaler.fit_transform(column).ravel()

    is_labelled = df["Survived"].notna()
    x_train = df.loc[is_labelled].drop(columns = "Survived")
    y_train = df.loc[is_labelled, "Survived"].astype(int)
    x_test = df.loc[~is_labelled].drop(columns = "Survived").reset_index(drop = True)
    return x_train, y_train, x_test, submit

In [4]:
# Build the feature frames and the submission skeleton from the two CSV files.
x_train, y_train, x_test, submit = prepare_data(train_path = train_path,
                                                test_path = test_path)

In [5]:
# Candidate values for each hyper-parameter explored by the grid search.
layers = [[8], [16], [8, 4], [16, 8], [24, 16, 8], [24, 8]]   # hidden-layer widths
activation = ["relu", "linear", "tanh"]
optimizer = ["SGD", "RMSprop", "Adam"]
dropout = [0.0, 0.2]
batch_size = [32, 64, 128]
epochs = [50, 75]

# Keys are the keyword names of create_model (lyr/act/opt/dr) plus the
# fit-time settings batch_size and epochs.
param_grid = {
    "batch_size": batch_size,
    "epochs": epochs,
    "lyr": layers,
    "act": activation,
    "opt": optimizer,
    "dr": dropout,
}

In [6]:
def create_model(lyr = [13,8], act = "relu", opt = "adam", dr = 0.2, input_dim = 22):
    """Build and compile a feed-forward binary classifier.

    Parameters
    ----------
    lyr : sequence of int
        Width of each hidden layer, in order. Must hold at least one entry.
    act : str
        Activation used by every hidden layer.
    opt : str
        Optimizer passed to ``compile``.
    dr : float
        Dropout rate. Dropout is applied once, after the first hidden layer.
    input_dim : int, default 22
        Number of input features. The default is the width this notebook
        previously hard-coded; pass ``x_train.shape[1]`` if the feature set
        changes.

    Returns
    -------
    The compiled ``Sequential`` model: the hidden stack followed by a single
    sigmoid unit, using binary cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(lyr[0], input_dim = input_dim, activation = act))
    model.add(Dropout(dr))
    for width in lyr[1:]:
        model.add(Dense(width, activation = act))
    model.add(Dense(1, activation = "sigmoid"))
    model.compile(loss = "binary_crossentropy", optimizer = opt, metrics = ["accuracy"])
    return model

In [7]:
def search():
    """Grid-search ``param_grid`` over ``create_model`` with 2-fold CV.

    Fits on the module-level ``x_train`` / ``y_train`` using all available
    cores and returns whatever ``GridSearchCV.fit`` returns.
    """
    classifier = KerasClassifier(build_fn = create_model, verbose = 1)
    grid_search = GridSearchCV(
        estimator = classifier,
        param_grid = param_grid,
        n_jobs = -1,
        cv = 2,
        verbose=2,
    )
    return grid_search.fit(x_train, y_train)

**Now that we have done all this,**
<br>
<br>all we have to now do is just:
<br>call the search method and store the result.
<br>use the result to see the best parameter
<br>
<br>**BUT**
<br>
<br>The current kernel on Kaggle uses scikit-learn version 0.23.0
<br>This version of scikit-learn has a bug in GridSearchCV,
<br>and the only way to fix this is to roll back to a more stable release where
<br>GridSearchCV works (I chose v0.21.2)
<br>
<br>**BUT**
<br>
<br>To roll back to this version, I need to use conda inside Kaggle, and that is also broken right now
<br>
<br>So it has become a rabbit hole of trying to fix GridSearchCV -> scikit-learn -> conda installs,
<br>on which I have already spent many hours
<br>
<br>**THEREFORE**
<br>
<br>I ran the GridSearchCV locally and stored its results in a dataframe
<br>
<br>Below is the output of the code that I ran locally:
<br>
> ```#codes to be run if GridSearchCV() is running properly
> search_result = search()
> view = pd.DataFrame(search_result.cv_results_)
> view.to_csv('gridsearch_cv_results.csv', index=False)
> search_result.best_params_```
> 
---
> search_result = search()
<br>
```Fitting 2 folds for each of 648 candidates, totalling 1296 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks        | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 158 tasks        | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 361 tasks        | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 644 tasks        | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 1009 tasks       | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed: 20.1min finished```

> search_result
<br>
```GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x7f79f41f2550>,
             iid='warn', n_jobs=-1,
             param_grid={'act': ['relu', 'linear', 'tanh'],
                         'batch_size': [32, 64, 128], 'dr': [0.0, 0.2],
                         'epochs': [50, 75],
                         'lyr': [[8], [16], [8, 4], [16, 8], [24, 16, 8],
                                 [24, 8]],
                         'opt': ['SGD', 'RMSprop', 'Adam']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)```

> search_result.best_params_
<br>
```{'act': 'tanh',
 'batch_size': 32,
 'dr': 0.0,
 'epochs': 75,
 'lyr': [16, 8],
 'opt': 'RMSprop'}```

---

**Some different samples of batch size and their corresponding accuracy**
<br>batch 32 : 85.07%
<br>batch 16 : 85.41%
<br>batch 1 : 85.63%
<br>other 
<br>#Results: 84.06% (1.45%) <13,8,1>
<br>#Results: 84.29% (1.87%) <13,8,1> with dr 0.2
<br>#Results: 84.40% (1.72%) <13,8,1> with dr 0.2 epochs = 100 (chosen)
<br>#Results: 84.17% (0.90%) <9,9,5> with dr 0.2 epochs = 100

---

In [8]:
# Estimate accuracy of the default create_model network with 5-fold
# stratified cross-validation.
estimator = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 10, verbose = 0)
# random_state is omitted: it has no effect when shuffle is False, and newer
# scikit-learn releases reject that combination with a ValueError. The folds
# are the same consecutive, unshuffled splits as before.
kfold = StratifiedKFold(n_splits = 5, shuffle = False)
results = cross_val_score(estimator, x_train, y_train, cv = kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))



Results: 82.94% (2.32%)


In [9]:
# Refit the estimator on the full training set before predicting.
estimator.fit(
    x_train,
    y_train,
    epochs = 100,
    batch_size = 10,
)

<tensorflow.python.keras.callbacks.History at 0x7fc66c25dc90>

The Ultimate oneliner

---
```submit["Survived"] = [int(np.round(best_nn.predict(x_test.loc[x].to_numpy().reshape(1,18)),0)) for x in range(0,submit["Survived"].size)]```

---

In [10]:
# Predict the test rows, convert each prediction to a plain int label,
# write the submission file and preview it.
predicted = np.asarray(estimator.predict(x_test)).ravel()
submit["Survived"] = [int(np.round(label, 0)) for label in predicted]
submit.to_csv('predictions.csv', index=False)
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
