In [16]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

train = pd.read_csv('train.csv', index_col= ['PassengerId'])
test = pd.read_csv('test.csv', index_col = ['PassengerId'])

def drop_columns(df):
    for_dropping = ['Cabin', 'Name', 'Ticket']
    return df.drop(for_dropping, axis=1)
train, test = drop_columns(train), drop_columns(test)

def dummy_encoding(df): 
    for_encoding = ['Sex','Embarked']
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode())
    dummies = pd.get_dummies(df[for_encoding], dtype = int)
    df = df.drop(for_encoding, axis = 1)
    df = pd.concat([df, dummies], axis = 1)
    return df
train, test = dummy_encoding(train), dummy_encoding(test)

def impute_mean(df): 
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    return df
train, test = impute_mean(train), impute_mean(test)

def categorize_age(df):
    bins, labels = [0, 10, 18, 65, np.inf], [0, 1, 2, 3]
    df['Age_cat'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
    return df
train, test = categorize_age(train), categorize_age(test)

def family_matters(df): 
    df['Family_size'] = df['SibSp'] + df['Parch'] + 1
    df['Alone'] = (df['Family_size'] == 1).astype(int)
    df['Avg_fare'] = df['Fare'] / df['Family_size']
    df['Poor'] = (df['Fare'] <= 8).astype(int)
    return df
train, test = family_matters(train), family_matters(test)

display(train.head(3))
np.random.seed(1)

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_cat,Family_size,Alone,Avg_fare,Poor
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,22.0,1,0,7.25,0,1,0,0,1,2,2,0,3.625,1
2,1,1,38.0,1,0,71.2833,1,0,1,0,0,2,2,0,35.64165,0
3,1,3,26.0,0,0,7.925,1,0,0,0,1,2,1,1,7.925,1


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

X = train.drop('Survived', axis = 1).values
y = train['Survived'].values

X = MinMaxScaler(feature_range = (-1, 1)).fit_transform(X)
test_scaled = MinMaxScaler(feature_range= (-1, 1)).fit_transform(test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 1, shuffle = True)

In [18]:
#CLASSIFIER SELECTION

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

classifiers = [LogisticRegression(), AdaBoostClassifier(algorithm= 'SAMME'), RandomForestClassifier(), GaussianProcessClassifier(),
               KNeighborsClassifier(), MLPClassifier(max_iter= 1000), DecisionTreeClassifier()]

print('--- CLASSIFIER SCORE ---')
best_score = -1
best_clf = None
for clf in classifiers: 
    clf.fit(X_train, y_train)
    score = round(clf.score(X_test, y_test), 4)
    print(f'{str(clf)}: {score}')
    if score > best_score: 
        best_score = score
        best_clf = clf
print()
print('--- OUTCOME ---')
print(f'Best classifier: {best_clf}\nBest score:{best_score}')

--- CLASSIFIER SCORE ---
LogisticRegression(): 0.7873
AdaBoostClassifier(algorithm='SAMME'): 0.7799
RandomForestClassifier(): 0.7649
GaussianProcessClassifier(): 0.7649
KNeighborsClassifier(): 0.7836
MLPClassifier(max_iter=1000): 0.7799
DecisionTreeClassifier(): 0.7537

--- OUTCOME ---
Best classifier: LogisticRegression()
Best score:0.7873


In [19]:
#GRID SEARCH LOGISTIC REGRESSION 
from 

SyntaxError: invalid syntax (2673750642.py, line 2)

import tensorflow as tf
import tensorflow.keras.layers as tfl

input_shape = (X_train.shape[1],)

model = tf.keras.Sequential([ 
    tfl.Dense(32, activation = 'relu', input_shape = input_shape), 
    tfl.Dense(16, activation = 'relu'), 
    tfl.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = 'adam', 
              loss = tf.keras.losses.BinaryCrossentropy(), 
              metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.FalseNegatives()])
model.fit(X_train, y_train, epochs = 150, verbose = 0, callbacks = tf.keras.callbacks.EarlyStopping(patience = 10))
pred_nn = model.predict(X_test)
pred_nn = (pred_nn >= 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, pred_nn)
print('Accuracy NN:', accuracy_nn)