In [146]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Load the training and test tables, both indexed by their PassengerId column.
train, test = (
    pd.read_csv(csv_path, index_col=['PassengerId'])
    for csv_path in ('train.csv', 'test.csv')
)

def drop_columns(df):
    """Return a new frame without the 'Cabin', 'Name' and 'Ticket' columns.

    The input frame is left untouched. A KeyError is raised by pandas if any
    of the three columns is absent.
    """
    unwanted = ['Cabin', 'Name', 'Ticket']
    return df.drop(columns=unwanted)
# Strip the unused text columns from both splits.
train = drop_columns(train)
test = drop_columns(test)

def dummy_encoding(df):
    """One-hot encode the 'Sex' and 'Embarked' columns.

    Missing 'Embarked' values are first replaced by the most frequent value
    of that column. The two source columns are then removed and replaced by
    integer (0/1) indicator columns appended at the end of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Sex' and 'Embarked' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    for_encoding = ['Sex', 'Embarked']
    # Work on a copy so the imputation below does not leak into the caller's frame.
    df = df.copy()

    # Series.mode() returns a Series indexed 0..k-1. Passing that Series to
    # fillna() aligns on index labels, so rows whose label is not in 0..k-1
    # are never filled. Extract the scalar instead.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:  # empty when the column is entirely missing
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    dummies = pd.get_dummies(df[for_encoding], dtype=int)
    return pd.concat([df.drop(columns=for_encoding), dummies], axis=1)
# One-hot encode the categorical columns of both splits.
train = dummy_encoding(train)
test = dummy_encoding(test)

def impute_mean(df):
    """Fill missing 'Age' and 'Fare' values with each column's median.

    Despite the function name, the statistic used is the median of the frame
    passed in, not the mean. The frame is modified in place and returned.
    """
    for column in ('Age', 'Fare'):
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)
    return df
# Impute missing ages and fares; each split uses its own medians.
train = impute_mean(train)
test = impute_mean(test)

def categorize_age(df):
    """Add an 'Age_cat' column that buckets 'Age' into four coded bands.

    Bands are left-closed / right-open: [0, 13) -> 0, [13, 18) -> 1,
    [18, 65) -> 2 and [65, inf) -> 3. The frame is modified in place and
    returned.
    """
    age_edges = [0, 13, 18, 65, np.inf]
    age_codes = [0, 1, 2, 3]
    df['Age_cat'] = pd.cut(df['Age'], bins=age_edges, labels=age_codes, right=False)
    return df
# Add the age-band feature to both splits.
train = categorize_age(train)
test = categorize_age(test)

def family_matters(df):
    """Add family- and fare-derived features to ``df`` in place.

    New columns:
      * 'Family_size' - SibSp + Parch + 1 (the passenger themself).
      * 'Alone'       - 1 when Family_size is exactly 1, else 0.
      * 'Avg_fare'    - Fare divided by Family_size.
      * 'Poor'        - 1 when Fare is at most 8, else 0.

    Returns the same frame that was passed in.
    """
    relatives = df['SibSp'] + df['Parch']
    df['Family_size'] = relatives + 1
    df['Alone'] = df['Family_size'].eq(1).astype(int)
    df['Avg_fare'] = df['Fare'].div(df['Family_size'])
    df['Poor'] = df['Fare'].le(8).astype(int)
    return df
# Add the family-size and fare features to both splits.
train = family_matters(train)
test = family_matters(test)

def _survival_rate(frame, by, **groupby_kwargs):
    """Return ``{group: mean of 'Survived' rounded to 4 decimals}`` for ``frame`` grouped by ``by``."""
    return frame.groupby(by, **groupby_kwargs)['Survived'].mean().round(4).to_dict()


def survival_proba(df):
    """Add survival-rate features to ``df`` in place and return it.

    All rates are computed from the module-level ``train`` frame (not from
    ``df``), so ``train`` must already contain 'Survived', 'Pclass',
    'Sex_female', 'Sex_male' and 'Age_cat'.

    New columns:
      * 'Survival_proba_class'     - survival rate of the row's Pclass.
      * 'Survival_proba_females'   - survival rate of females in the row's
                                     Pclass; 0.0 for rows with Sex_female != 1.
      * 'Survival_proba_males'     - same for males / Sex_male.
      * 'Survival_proba_age_class' - survival rate of the row's Age_cat.

    NOTE(review): the rates are built from every row of ``train``, including
    rows that a later train/validation split holds out, so validation scores
    that use these features are likely optimistic.
    """
    df['Survival_proba_class'] = df['Pclass'].map(_survival_rate(train, 'Pclass'))

    # Taking the mean of 'Survived' inside each sex subset gives 0.0 (rather
    # than NaN) for a class in which nobody of that sex survived.
    sex_features = (
        ('Sex_female', 'Survival_proba_females'),
        ('Sex_male', 'Survival_proba_males'),
    )
    for flag, feature in sex_features:
        rates = _survival_rate(train[train[flag] == 1], 'Pclass')
        in_group = df[flag] == 1
        df[feature] = 0.0
        df.loc[in_group, feature] = df.loc[in_group, 'Pclass'].map(rates)

    age_rates = _survival_rate(train, 'Age_cat', observed=False)
    # Mapping a categorical column can itself return a categorical; cast so
    # the feature is always a plain float column.
    df['Survival_proba_age_class'] = df['Age_cat'].map(age_rates).astype(float)

    return df
# Attach the survival-rate features (derived from `train`) to both splits.
train = survival_proba(train)
test = survival_proba(test)

# Seed NumPy's global random number generator for the cells that follow.
np.random.seed(1)
# Preview the engineered training frame.
display(train.head(3))

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_cat,Family_size,Alone,Avg_fare,Poor,Survival_proba_class,Survival_proba_females,Survival_proba_males,Survival_proba_age_class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,22.0,1,0,7.25,0,1,0,0,1,2,2,0,3.625,1,0.2424,0.0,0.1354,0.3651
2,1,1,38.0,1,0,71.2833,1,0,1,0,0,2,2,0,35.64165,0,0.6296,0.9681,0.0,0.3651
3,1,3,26.0,0,0,7.925,1,0,0,0,1,2,1,1,7.925,1,0.2424,0.5,0.0,0.3651


In [147]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Separate the target from the features, remembering the feature order so the
# unlabeled test frame can be aligned to exactly the same columns.
feature_columns = train.columns.drop('Survived')
X = train[feature_columns].values
y = train['Survived'].values

# Fit ONE scaler on the labelled features and reuse it for the test frame.
# Fitting a second scaler on `test` would map the two sets with different
# min/max values, so the same raw value would reach the model as different
# inputs. Test values outside the training range may fall outside [-1, 1].
# NOTE(review): the scaler is fitted before the train/validation split below,
# so the validation rows influence the scaling parameters.
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(X)
test_scaled = scaler.transform(test[feature_columns].values)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, shuffle=True
)

In [148]:
#CLASSIFIER SELECTION

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate models, all with library defaults except where noted.
classifiers = [
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    AdaBoostClassifier(algorithm='SAMME'),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    GaussianProcessClassifier(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1000),
    DecisionTreeClassifier(),
]

# Fit every candidate on the training split, report its accuracy on the
# held-out split, and remember the first model with the highest score.
print('--- CLASSIFIER SCORE ---')
best_score, best_clf = -1, None
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = round(clf.score(X_test, y_test), 4)
    print(f'{str(clf)}: {score}')
    if score > best_score:
        best_score, best_clf = score, clf
print()
print('--- OUTCOME ---')
print(f'Best classifier: {best_clf}\nBest score:{best_score}')

--- CLASSIFIER SCORE ---
LogisticRegression(): 0.7836
SGDClassifier(): 0.7649
LinearSVC(): 0.7761
AdaBoostClassifier(algorithm='SAMME'): 0.75
RandomForestClassifier(): 0.7649




GradientBoostingClassifier(): 0.7873
GaussianProcessClassifier(): 0.7649
KNeighborsClassifier(): 0.7799
MLPClassifier(max_iter=1000): 0.7761
DecisionTreeClassifier(): 0.7425

--- OUTCOME ---
Best classifier: GradientBoostingClassifier()
Best score:0.7873


#LOGISTIC REGRESSION GRID SEARCH

from sklearn.model_selection import KFold, GridSearchCV


def _run_grid_search(estimator, param_grid, cv, n_jobs=None):
    """Grid-search ``estimator`` on the training split and report the result.

    Reads the module-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for the printed score of the refitted best model.

    Parameters
    ----------
    estimator : scikit-learn estimator to tune.
    param_grid : dict passed to GridSearchCV as ``param_grid``.
    cv : cross-validation splitter passed to GridSearchCV.
    n_jobs : forwarded to GridSearchCV (None keeps its default).

    Returns
    -------
    The fitted GridSearchCV object.
    """
    search = GridSearchCV(estimator, param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    search.fit(X_train, y_train)
    print(' --- OUTCOME ---')
    print(f'Best params: {search.best_params_}\nScore: {search.score(X_test, y_test)}')
    return search


# One shuffled, seeded K-fold splitter shared by all three searches.
cv = KFold(shuffle= True, random_state= 1)
logistic_param_grid = {'penalty' : ['l1','l2'], 
                       'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
                       'C': np.arange(0.1, 1.01, 0.2), 'solver': ['liblinear'], 'multi_class': ['ovr'], 
                       'max_iter' : [100, 200, 500, 1000]}

logreg = LogisticRegression(random_state= 1)
logreg_cv = _run_grid_search(logreg, logistic_param_grid, cv)

#ADA BOOST GRID SEARCH 

ada_param_grid = {'n_estimators': np.arange(51, 151, 50), 
                  'learning_rate': np.arange(0.1, 1.1, 0.3), 
                  'algorithm': ['SAMME']}

ada = AdaBoostClassifier(random_state= 1)
ada_cv = _run_grid_search(ada, ada_param_grid, cv, n_jobs= -1)

#MLP CLASSIFIER GRID SEARCH 

mlp_param_grid = {'hidden_layer_sizes': [(100,), (200,)],
                  'activation': ['relu', 'tanh', 'identity','logistic'],
                  'solver': ['adam'],
                  'alpha': [0.0001, 0.001],
                  'learning_rate': ['constant', 'adaptive']}

mlp = MLPClassifier(random_state= 1, max_iter= 2000)
mlp_cv = _run_grid_search(mlp, mlp_param_grid, cv)

In [149]:
from sklearn.ensemble import VotingClassifier

# Re-create the three tuned models with explicit hyper-parameters.
# random_state=1 is set on every estimator: the grid searches tuned AdaBoost
# and the MLP under that seed, and without it the ensemble score changes from
# run to run.
best_logreg = LogisticRegression(C= 0.9, max_iter= 100, multi_class= 'ovr', 
                                  penalty = 'l2', solver= 'liblinear', tol = 1e-05, random_state= 1)
best_ada = AdaBoostClassifier(algorithm= 'SAMME', learning_rate= 0.7, n_estimators= 51,
                              random_state= 1)
best_mlp = MLPClassifier(activation= 'tanh', alpha= 0.001, hidden_layer_sizes= (200,),
                         learning_rate= 'constant', solver = 'adam', max_iter= 2000,
                         random_state= 1)

classifier_list = [('logreg', best_logreg), 
                   ('ada', best_ada),
                   ('mlp', best_mlp)]
# Soft voting averages the predicted class probabilities of the three models.
voting = VotingClassifier(estimators= classifier_list, voting = 'soft')
voting.fit(X_train, y_train)
# Accuracy on the held-out split (displayed as the cell output).
voting.score(X_test, y_test)

0.7798507462686567

In [151]:
# Refit the soft-voting ensemble on all labelled rows (X, y) and write its
# predictions for the unlabeled test set to CSV.
final_voter = VotingClassifier(estimators=classifier_list, voting='soft')
final_voter.fit(X, y)
# This is accuracy on the same rows the model was fitted on, not a hold-out score.
print(final_voter.score(X, y))

final_voter_pred = final_voter.predict(test_scaled)
final_voter_df = pd.DataFrame({'Survived': final_voter_pred}, index=test.index)
final_voter_df.to_csv('voter_classifier_predictions_v1.csv')

0.8428731762065096


# Stand-alone logistic regression fitted on all labelled rows, with the same
# hyper-parameters as the ensemble member.
final_logreg = LogisticRegression(
    penalty='l2',
    C=0.9,
    solver='liblinear',
    multi_class='ovr',
    tol=1e-05,
    max_iter=100,
    random_state=1,
).fit(X, y)

logreg_pred = final_logreg.predict(test_scaled)
logreg_df = pd.DataFrame({'Survived': logreg_pred}, index=test.index)
# Saving is disabled; uncomment to write the predictions to disk.
#logreg_df.to_csv('logreg_prediction.csv')

import tensorflow as tf
import tensorflow.keras.layers as tfl

# Seed TensorFlow so weight initialisation and shuffling are repeatable.
tf.random.set_seed(1)

input_shape = (X_train.shape[1],)

# Small fully connected binary classifier: 32 -> 16 -> 1 (sigmoid).
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument.
model = tf.keras.Sequential([ 
    tf.keras.Input(shape = input_shape),
    tfl.Dense(32, activation = 'relu'), 
    tfl.Dense(16, activation = 'relu'), 
    tfl.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = 'adam', 
              loss = tf.keras.losses.BinaryCrossentropy(), 
              metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.FalseNegatives()])

# EarlyStopping monitors 'val_loss' by default, which only exists when fit()
# is given validation data; without it the callback never stops training.
# Hold out 20% of the training split so the callback has a metric to watch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 10)
model.fit(X_train, y_train, epochs = 150, verbose = 0,
          validation_split = 0.2, callbacks = [early_stopping])

# predict() returns one sigmoid probability per row with shape (n, 1);
# threshold at 0.5 and flatten to match the 1-D label vector.
pred_nn = model.predict(X_test)
pred_nn = (pred_nn >= 0.5).astype(int).ravel()
accuracy_nn = accuracy_score(y_test, pred_nn)
print('Accuracy NN:', accuracy_nn)