In [6]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Read both CSV files with the same options; 'PassengerId' becomes the row index.
train, test = (
    pd.read_csv(csv_path, index_col=['PassengerId'])
    for csv_path in ('train.csv', 'test.csv')
)

def drop_columns(df):
    """Return a new frame without the 'Cabin', 'Name' and 'Ticket' columns.

    The input frame itself is not modified. A KeyError is raised by pandas
    if any of the three columns is absent.
    """
    unwanted = ['Cabin', 'Name', 'Ticket']
    return df.drop(columns=unwanted)
# Prune the unused columns from both splits; both results are computed before rebinding.
train, test = [drop_columns(split) for split in (train, test)]

def dummy_encoding(df): 
    """One-hot encode the 'Sex' and 'Embarked' columns.

    Missing 'Embarked' values are first replaced by the most frequent value of
    that column. The two source columns are then removed and their integer
    dummy columns are appended on the right.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Sex' and 'Embarked' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Sex'/'Embarked' replaced by their dummy columns.
    """
    for_encoding = ['Sex', 'Embarked']
    # Work on a copy of the two columns so the caller's frame stays untouched.
    categorical = df[for_encoding].copy()

    # Series.mode() returns a Series indexed 0..k-1. Passing that Series to
    # fillna() aligns on the index, so rows whose index label is not in 0..k-1
    # were never filled. Fill with the scalar mode instead.
    embarked_modes = categorical['Embarked'].mode()
    if not embarked_modes.empty:  # empty when every value is missing
        categorical['Embarked'] = categorical['Embarked'].fillna(embarked_modes.iloc[0])

    dummies = pd.get_dummies(categorical, dtype=int)
    return pd.concat([df.drop(for_encoding, axis=1), dummies], axis=1)
# One-hot encode both splits; both results are computed before rebinding.
train, test = [dummy_encoding(split) for split in (train, test)]

def impute_mean(df): 
    """Fill missing 'Age' and 'Fare' values with the column median.

    Note that, despite the function name, the fill value is the median and
    not the mean. The median is taken from ``df`` itself. The frame is
    modified in place and the same object is returned.
    """
    for column in ('Age', 'Fare'):
        fill_value = df[column].median()
        df[column] = df[column].fillna(fill_value)
    return df
# Impute each split from its own medians; both calls finish before rebinding.
train, test = [impute_mean(split) for split in (train, test)]

def categorize_age(df):
    """Add an 'Age_cat' column that bins 'Age' into four labelled intervals.

    Intervals are closed on the left: [0, 13) -> 0, [13, 18) -> 1,
    [18, 65) -> 2 and [65, inf) -> 3. The frame is modified in place and
    the same object is returned.
    """
    age_edges = [0, 13, 18, 65, np.inf]
    age_codes = [0, 1, 2, 3]
    df['Age_cat'] = pd.cut(
        df['Age'],
        bins=age_edges,
        labels=age_codes,
        right=False,  # left-closed bins, so an age of exactly 13 falls in bin 1
    )
    return df
# Add the age bins to both splits; both calls finish before rebinding.
train, test = [categorize_age(split) for split in (train, test)]

def family_matters(df): 
    """Add family-size and fare-derived features to ``df``.

    New columns (added in this order):
      * 'Family_size' - 'SibSp' + 'Parch' + 1
      * 'Alone'       - 1 when 'Family_size' equals 1, else 0
      * 'Avg_fare'    - 'Fare' divided by 'Family_size'
      * 'Poor'        - 1 when 'Fare' is at most 8, else 0

    The frame is modified in place and the same object is returned.
    """
    relatives = df['SibSp'] + df['Parch']
    df['Family_size'] = relatives + 1
    df['Alone'] = df['Family_size'].eq(1).astype(int)
    df['Avg_fare'] = df['Fare'].div(df['Family_size'])
    df['Poor'] = df['Fare'].le(8).astype(int)
    return df
# Add the family/fare features to both splits; both calls finish before rebinding.
train, test = [family_matters(split) for split in (train, test)]

def survival_proba(df, reference=None): 
    """Add survival-rate features, computed on a labelled frame, to ``df``.

    Every rate is the share of rows with ``Survived == 1`` inside a group of
    ``reference``, rounded to 4 decimals, and is looked up for each row of
    ``df``. New columns (added in this order):

      * 'Survival_proba_class'     - rate of the row's 'Pclass'
      * 'Survival_proba_females'   - rate among females of the row's 'Pclass'
                                     for rows with 'Sex_female' == 1, else 0.0
      * 'Survival_proba_males'     - rate among males of the row's 'Pclass'
                                     for rows with 'Sex_male' == 1, else 0.0
      * 'Survival_proba_age_class' - rate of the row's 'Age_cat'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; needs 'Pclass', 'Sex_female', 'Sex_male' and
        'Age_cat'. It is modified in place.
    reference : pandas.DataFrame, optional
        Labelled frame with the same columns plus a 0/1 'Survived' column.
        Defaults to the notebook-level ``train`` frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four columns added. A group that is
        absent from ``reference`` yields NaN for the affected rows.
    """
    if reference is None:
        # Keep the original behaviour: rates come from the global training frame.
        reference = train

    def _survival_rate(frame, key):
        # For a 0/1 target the group mean is survivors / group size. Unlike
        # dividing two separate counts, a group with no survivors gives 0.0
        # rather than NaN.
        return frame.groupby(key, observed=False)['Survived'].mean().round(4).to_dict()

    # Build every lookup table before touching df, since df may be the
    # reference frame itself.
    class_rates = _survival_rate(reference, 'Pclass')
    female_rates = _survival_rate(reference[reference['Sex_female'] == 1], 'Pclass')
    male_rates = _survival_rate(reference[reference['Sex_male'] == 1], 'Pclass')
    age_rates = _survival_rate(reference, 'Age_cat')

    df['Survival_proba_class'] = df['Pclass'].map(class_rates)

    sex_features = (
        ('Sex_female', 'Survival_proba_females', female_rates),
        ('Sex_male', 'Survival_proba_males', male_rates),
    )
    for sex_flag, column, rates in sex_features:
        is_member = df[sex_flag] == 1
        df[column] = 0.0  # rows of the other sex keep 0.0
        df.loc[is_member, column] = df.loc[is_member, 'Pclass'].map(rates)

    # Mapping a categorical column can give a categorical result; force plain
    # floats so the feature has a numeric dtype like the other rate columns.
    df['Survival_proba_age_class'] = df['Age_cat'].map(age_rates).astype(float)

    return df
# Add the survival-rate features to both splits; both calls finish before rebinding.
train, test = [survival_proba(split) for split in (train, test)]

# Show the first three engineered training rows, then seed NumPy's global RNG.
display(train.iloc[:3])
np.random.seed(seed=1)

AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Feature matrix and target from the labelled data.
feature_columns = train.columns.drop('Survived')
X = train[feature_columns].values
y = train['Survived'].values

# Fit ONE scaler on the labelled features and reuse it for the unlabelled set.
# Fitting a second scaler on `test` would map each feature with that set's own
# min/max, so the models would see test values on a different scale than the
# one they were trained on.
# NOTE(review): the scaler is still fitted before the hold-out split, so the
# hold-out rows influence the min/max used during validation.
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(X)
# Select the columns by name so both matrices share the same feature order.
test_scaled = scaler.transform(test[feature_columns].values)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, shuffle=True
)

In [None]:
#CLASSIFIER SELECTION

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate estimators compared with (mostly) default hyper-parameters.
classifiers = [
    LogisticRegression(random_state=1),
    SGDClassifier(random_state=1),
    LinearSVC(dual='auto', random_state=1),
    AdaBoostClassifier(algorithm='SAMME', random_state=1),
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1),
    GaussianProcessClassifier(random_state=1),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1000, random_state=1),
    DecisionTreeClassifier(random_state=1),
]

print('--- CLASSIFIER SCORE ---')
# Fit each candidate on the training split and store its rounded hold-out
# score under the estimator's string representation.
classifier_dict = {}
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = round(clf.score(X_test, y_test), 4)
    classifier_dict[str(clf)] = score

# One row per candidate, best score first.
score_df = pd.DataFrame(
    {'Classifier': list(classifier_dict.keys()),
     'Score': list(classifier_dict.values())}
).sort_values(by='Score', ascending=False)
score_df

--- CLASSIFIER SCORE ---


Unnamed: 0,Classifier,Score
5,GradientBoostingClassifier(random_state=1),0.7873
0,LogisticRegression(random_state=1),0.7836
7,KNeighborsClassifier(),0.7799
8,"MLPClassifier(max_iter=1000, random_state=1)",0.7799
2,"LinearSVC(dual='auto', random_state=1)",0.7761
1,SGDClassifier(random_state=1),0.7649
4,RandomForestClassifier(random_state=1),0.7649
6,GaussianProcessClassifier(random_state=1),0.7649
3,"AdaBoostClassifier(algorithm='SAMME', random_s...",0.75
9,DecisionTreeClassifier(random_state=1),0.7388


#LOGISTIC REGRESSION GRID SEARCH

from sklearn.model_selection import KFold, GridSearchCV

# Shuffled, seeded K-fold splitter shared by the grid searches in this notebook.
cv = KFold(shuffle=True, random_state=1)

# 'multi_class' is intentionally not part of the grid: the target is binary and
# the solver is liblinear, so 'ovr' was a no-op, and the parameter is
# deprecated in recent scikit-learn releases.
logistic_param_grid = {'penalty': ['l1', 'l2'],
                       'tol': [0.00001, 0.0001, 0.001, 0.01, 0.1],
                       'C': np.arange(0.1, 1.01, 0.2),
                       'solver': ['liblinear'],
                       'max_iter': [100, 200, 500, 1000]}

logreg = LogisticRegression(random_state=1)
logreg_cv = GridSearchCV(logreg, param_grid=logistic_param_grid, cv=cv)
logreg_cv.fit(X_train, y_train)

# Report the winning combination and its score on the hold-out split.
print(' --- OUTCOME ---')
print(f'Best params: {logreg_cv.best_params_}\nScore: {logreg_cv.score(X_test, y_test)}')

#GRADIENT BOOSTING GRID SEARCH

# The learning-rate grid starts at 0.1 rather than 0.0. A zero learning rate
# gives every boosting stage zero weight, so that candidate can never improve
# on the initial prediction (and some scikit-learn versions reject it).
gradient_param_grid = {'loss': ['log_loss', 'exponential'],
                       'learning_rate': np.arange(0.1, 1.01, 0.1),
                       'n_estimators': np.arange(50, 250, 50),
                       'criterion': ['friedman_mse', 'squared_error']}

grad = GradientBoostingClassifier(random_state=1)
grad_cv = GridSearchCV(grad, param_grid=gradient_param_grid, cv=cv)
grad_cv.fit(X_train, y_train)

# Report the winning combination and its score on the hold-out split.
print(' --- OUTCOME ---')
print(f'Best params: {grad_cv.best_params_}\nScore: {grad_cv.score(X_test, y_test)}')

#MLP CLASSIFIER GRID SEARCH 

# Search space for the multi-layer perceptron: one hidden layer of varying
# width, four activations, two L2 strengths and two learning-rate schedules.
mlp_param_grid = dict(
    hidden_layer_sizes=[(100,), (200,), (300,)],
    activation=['relu', 'tanh', 'identity', 'logistic'],
    solver=['adam'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)

mlp = MLPClassifier(random_state=1, max_iter=2000)
mlp_cv = GridSearchCV(estimator=mlp, param_grid=mlp_param_grid, cv=cv)
mlp_cv.fit(X_train, y_train)

# Report the winning combination and its score on the hold-out split.
print(' --- OUTCOME ---')
print(f'Best params: {mlp_cv.best_params_}\nScore: {mlp_cv.score(X_test, y_test)}')

In [None]:
from sklearn.ensemble import VotingClassifier

# Estimators configured with hand-copied hyper-parameters.
# `multi_class='ovr'` is no longer passed: for a binary target with the
# liblinear solver it has no effect, and the argument is deprecated in recent
# scikit-learn releases.
best_logreg = LogisticRegression(C=0.7, max_iter=100, penalty='l2',
                                 solver='liblinear', tol=0.1, random_state=1)
best_grad = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1,
                                       loss='log_loss', n_estimators=50, random_state=1)
best_mlp = MLPClassifier(activation='tanh', alpha=0.0001, hidden_layer_sizes=(200,),
                         max_iter=2000, learning_rate='constant', solver='adam',
                         random_state=1)
classifier_list = [('logreg', best_logreg),
                   ('grad', best_grad),
                   ('mlp', best_mlp)]

# Soft voting combines the three models' predicted class probabilities.
voting = VotingClassifier(estimators=classifier_list, voting='soft')
voting.fit(X_train, y_train)
voting.score(X_test, y_test)

0.7873134328358209

In [None]:
# Refit the soft-voting ensemble on every labelled row.
final_voter = VotingClassifier(voting='soft', estimators=classifier_list)
final_voter.fit(X, y)
# This score is computed on the very rows the model was fitted on.
print(final_voter.score(X, y))

# Predict the unlabelled set and write the result, indexed like `test`.
final_voter_pred = final_voter.predict(test_scaled)
final_voter_df = pd.DataFrame({'Survived': final_voter_pred}, index=test.index)
final_voter_df.to_csv('voter_classifier_predictions_v3.csv')

0.8451178451178452


import tensorflow as tf
import tensorflow.keras.layers as tfl

input_shape = (X_train.shape[1],)
NN_EPOCHS = 250  # training budget used for both the hold-out run and the final fit


def build_nn():
    """Return a freshly initialised, compiled 32-16-1 dense binary classifier."""
    net = tf.keras.Sequential([
        tfl.Dense(32, activation='relu', input_shape=input_shape),
        tfl.Dense(16, activation='relu'),
        tfl.Dense(1, activation='sigmoid')
    ])
    net.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.FalseNegatives()])
    return net


# Hold-out evaluation: train on the training split, score on the held-out rows.
model = build_nn()
model.fit(X_train, y_train, epochs=NN_EPOCHS, verbose=0)
pred_nn = model.predict(X_test)
pred_nn = (pred_nn >= 0.5).astype(int)  # threshold the sigmoid output at 0.5
accuracy_nn = accuracy_score(y_test, pred_nn)
print('Accuracy NN:', accuracy_nn)

# Final model: start from a fresh network and train on all labelled rows with
# the same epoch budget. Calling fit(X, y) on the already-trained network would
# only add a single default epoch on top of the hold-out model.
model = build_nn()
model.fit(X, y, epochs=NN_EPOCHS, verbose=0)
pred_nn = model.predict(test_scaled)
pred_nn = (pred_nn >= 0.5).astype(int)
nn_df = pd.DataFrame(pred_nn, index=test.index, columns=['Survived'])
nn_df.to_csv('neural_network_v1.csv')