In [None]:
import os
print(os.listdir("../input"))

import warnings

import lightgbm as lgbm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler


In [None]:

# Load the test and training sets from the input directory.
test = pd.read_csv('../input/test.csv')
train = pd.read_csv('../input/train.csv')

# Remember how many rows belong to the training set so the combined frame
# can be split back apart once preprocessing is done.
len_train = len(train)

# record survived and drop it to make train/test headers match
survived = train['Survived']
train = train.drop(columns=['Survived'])

# Stack test underneath train so every preprocessing step is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same fresh 0..n-1 index with ignore_index=True.
train = pd.concat([train, test], ignore_index=True, sort=False)

# extract titles from names: the run of letters directly before a period.
# The pattern is a raw string so `\.` reaches the regex engine as-is instead of
# being an invalid string escape; expand=False yields a Series for the single
# capture group, which is what a column assignment expects.
train['titles'] = train['Name'].str.extract(r'([a-zA-Z]+)\.', expand=False)
# this mapping from https://www.kaggle.com/vincentlugat/titanic-data-analysis-lgbm-0-82296
misc_titles = {'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs',
              'Major':'Misc', 'Col':'Misc', 'Dr':'Misc', 'Rev':'Misc', 'Capt':'Misc',
              'Jonkheer':'Royal', 'Sir':'Royal', 'Lady':'Royal', 'Don':'Royal', 'Countess':'Royal', 'Dona':'Royal'}
# Collapse the rare titles into the groups above; unmapped titles are left unchanged.
train['titles'] = train['titles'].replace(to_replace=misc_titles)
# One-hot encode the grouped titles and attach the indicator columns to the frame.
train = train.join(pd.get_dummies(train['titles']))

# make sex numerical
train['Sex'] = train['Sex'].replace(to_replace = {'male':1, 'female':0})


def _fill_with_group_median(values):
    """Return `values` with its missing entries replaced by the median of `values`."""
    return values.fillna(values.median())


# look at age distributions in each social class and gender and assign any null ages
# print(train['Age'].hist(by=[train['Sex'], train['Pclass']], xlabelsize = 10, figsize=(8,10)))
train['Age'] = train.groupby(['Sex', 'Pclass'])['Age'].transform(_fill_with_group_median)

# replace any nan fares with median of fare grouped by sex and pclass.
# This must select the 'Fare' column: selecting 'Age' here (as the code did
# before) overwrote every fare with the passenger's age.
train['Fare'] = train.groupby(['Sex', 'Pclass'])['Fare'].transform(_fill_with_group_median)

# Keep only the first character of each cabin value (associated with placement
# on ship); entries whose string form is 'nan' become 'U' (unassigned).
cabins = [
    'U' if str(cabin) == 'nan' else str(cabin).strip()[0]
    for cabin in train['Cabin']
]
train['Cabin'] = cabins

# Swap each letter for an integer code: A..G -> 1..7, T -> 8, U -> 9.
cabin_codes = dict(zip('ABCDEFGTU', range(1, 10)))
train['Cabin'] = train['Cabin'].replace(to_replace=cabin_codes)

# Embarked has 2 missing values; fill them with 'S', the large majority value,
# and then turn the letters into integer codes in a single chained step.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
train['Embarked'] = (
    train['Embarked']
    .fillna('S')
    .replace(to_replace=embarked_codes)
)

# Derived indicator features:
#   is_child - 1 when Age is below 18
#   is_alone - 1 when SibSp + Parch is zero
train['is_child'] = train['Age'].lt(18).astype(int)
train['is_alone'] = train['SibSp'].add(train['Parch']).eq(0).astype(int)

# The raw text columns and the intermediate title column are not used as features.
train = train.drop(['Name', 'Ticket', 'titles'], axis=1)

# Rescale the two continuous columns with a min-max scaler fitted on them.
min_max_scaler = MinMaxScaler()
scaled_columns = ['Fare', 'Age']
train[scaled_columns] = min_max_scaler.fit_transform(train[scaled_columns])

# Report whether any missing value is left in the frame (expected: False).
print(train.isna().to_numpy().any())

# Undo the earlier stacking: rows from len_train onward are the test set,
# the rows before it are the training set.
test_rows = train.iloc[len_train:]
# Keep the test passenger ids aside before the id column is removed.
ids = test_rows['PassengerId']
test = test_rows.drop(columns=['PassengerId'])

# PassengerId is dropped from the training rows as well.
train = train.iloc[:len_train].drop(columns=['PassengerId'])


In [None]:
# Fit a default random forest on the training features to get a rough
# ranking of feature importances.
x = train
y = survived
model = RandomForestClassifier(random_state = 42)
model.fit(x, y)

# Pair every importance with its column name and order the pairs from most
# to least important (ties fall back to the column name).
ranked = sorted(zip(list(model.feature_importances_), x.columns), reverse=True)
sort = [column for _, column in ranked]
y_vis = [importance for importance, _ in ranked]

# Bar chart of at most the first 32 ranked features, labels turned vertical.
plt.bar(sort[:32], y_vis[:32], align='center', alpha=0.5)
plt.xticks(rotation=90)

In [None]:
# Exhaustive hyper-parameter search for the random forest.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 80, 100],
    max_features=[5, 10, 16],
    min_samples_leaf=[1, 4, 7],
    n_estimators=[10, 100, 200],
    random_state=[16, 42, 73],
)

rf = RandomForestClassifier()

# 10-fold cross-validated grid search using every available core.
grid_search = GridSearchCV(rf, param_grid, cv=10, n_jobs=-1, verbose=2)

# fit() returns the search object itself, so the winning estimator can be
# read straight off the call.
model_rf = grid_search.fit(x, y).best_estimator_

In [None]:
# optimize logistic regression
# NOTE: 'warn' is not a real solver. It was a placeholder default in
# scikit-learn 0.20/0.21 that resolved to 'liblinear' and has since been
# removed, so passing it makes those candidate fits fail. 'liblinear' is named
# explicitly instead; like 'saga' it supports both the l1 and l2 penalties.
param_grid = {
    'penalty': ['l2', 'l1'],
    'C': [0.5, 1.0, 1.5],
    'tol': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
    'random_state' : [15, 33, 42, 72],
    'solver' : ['saga', 'liblinear'],
    'max_iter' : [50, 100, 150, 200]
}

lr = LogisticRegression()

# complete grid search (10-fold CV over every combination, all cores)
grid_search = GridSearchCV(estimator = lr, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 2)
# fit and get best
grid_search.fit(x, y)
model_lr = grid_search.best_estimator_


In [None]:
# Exhaustive hyper-parameter search for the k-nearest-neighbours classifier.
param_grid = dict(
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    n_neighbors=[1, 3, 5, 7],
    leaf_size=[5, 15, 30, 45],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
)

knn = KNeighborsClassifier()

# 10-fold cross-validated grid search using every available core.
grid_search = GridSearchCV(knn, param_grid, cv=10, n_jobs=-1, verbose=2)

# fit() returns the search object itself, so the winning estimator can be
# read straight off the call.
model_knn = grid_search.fit(x, y).best_estimator_


In [None]:
# stack and find accuracies of individual and overall
# Silence all warnings for the rest of the session so the score printout
# below stays readable.
warnings.simplefilter('ignore')

RANDOM_SEED = 42

# LightGBM is used with its default hyper-parameters (only the seed is fixed).
lgbm_clf = lgbm.LGBMClassifier(random_state=RANDOM_SEED)

# Stack the three tuned models plus LightGBM; a logistic regression is the
# meta-classifier that combines their outputs.
sclf = StackingCVClassifier(classifiers=[model_knn, model_rf, model_lr, lgbm_clf],
                            meta_classifier=LogisticRegression(random_state=RANDOM_SEED),
                            random_state=RANDOM_SEED)

# Report 10-fold cross-validated accuracy (mean and standard deviation) for
# each base model and for the stack.
for clf, label in zip([model_knn, model_rf, model_lr, lgbm_clf, sclf], 
                      [ 
                        'KNN',
                        'Random Forest', 
                        'Logistic Regression',
                        'LGBM',
                        'StackingClassifier' ]):

    # `model_selection` was never imported in this notebook, so the original
    # `model_selection.cross_val_score(...)` call raised NameError; the
    # function is now imported directly by name.
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))
    

In [None]:
# Train the stacked model on all training rows, predict the test rows, and
# write the predictions out as the submission file.
sclf.fit(x, y)
pred = sclf.predict(test)

# Build the two-column frame in one step; the row index comes from `ids`
# and is not written to the CSV.
df_results = pd.DataFrame({'PassengerId': ids, 'Survived': pred})

df_results.to_csv('titanic_submission.csv', index=False)