In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
import matplotlib.pyplot as ply
import seaborn as sns
%matplotlib inline

training_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")
training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
# Store labels
train_titanic_labels = training_set['Survived']

In [50]:
# Explore the titles generated
def them_titles(datain):
    names = datain['Name'].copy().str.strip().str.split(',')
    titles = names.str.get(1).str.strip().str.split('.').str.get(0)
    return titles

In [51]:
# This class will create a dataset containing all the features, and generate new ones. 
class OptimusPrime(BaseEstimator, TransformerMixin):
    def __init__(self):
        self = self
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Generate the titles and merge with.
        X['Title'] = them_titles(X)
    
        # Flag nobility. I don't think 'Mlle' denotes mobility, but I'll place them here.
        conditions = ['Dona', 'Mme','Mlle', 'Lady','the Countess','Don','Jonkheer']
        X['Noble'] = np.isin(X['Title'],conditions)
        X['Noble'] = np.where(X['Noble']==True,1,0)

        # Find all the reverends. All of them were male, and they all died.
        X['Reverend'] = np.where(X['Title'] == 'Rev',1,0)

        # Identify all military personel.
        conditions = ['Col','Major']
        X['Military'] = np.isin(X['Title'], conditions)
        X['Military'] = np.where(X['Military']==True,1,0)

        # Flag all the doctors.
        X['Doctor'] = np.where(X['Title'] == 'Dr',1,0)

        # I think this is noise. I'll probably delete it.
        X['Captain'] = np.where(X['Title'] == 'Capt',1,0)
    
        # Parse the tickets.
        alpha_prefix = X['Ticket'].str.strip().str.split(' ').str.get(0)
        alpha_num = X['Ticket'].str.strip().str.split(' ').str.get(1)
        #X['Alpha_ticket'] = np.where(X['Ticket'].str.lower().str.isdigit()==False,X['Ticket'],np.NaN)
        #X['Alpha_ticket_prefix'] = np.where(X['Ticket'].str.lower().str.isdigit()==False,alpha_prefix,np.NaN)
        #X['Alpha_ticket_num'] = np.where(X['Ticket'].str.lower().str.isdigit()==False,alpha_num,np.NaN)
        #X['Numeric_ticket'] = np.where(X['Ticket'].str.lower().str.isdigit()==True,X['Ticket'],np.NaN)
        X['Ticket_number_length'] = X['Ticket'].str.extract('(\d+$)')
        X['Ticket_number_length'] = np.where(X['Ticket_number_length'].isnull() == True, '0',X['Ticket_number_length'])
        X['Ticket_number_length'] = X['Ticket_number_length'].str.len().astype(str)
        return X

In [52]:
# Create the full training set with generated features.
autobots = OptimusPrime()
training_set_prelim = autobots.transform(training_set)

In [53]:
# Get the categorical data ready for encoding.
processing = training_set_prelim.copy()

drop_var = ['Name', 'Ticket','Cabin','Embarked','Title','Captain']
processing = processing.drop(drop_var, axis=1)

category_features = processing.select_dtypes(include='object')

numeric_features = processing.select_dtypes(exclude='object')

In [54]:
# Turn the pandas dataframe into numpy arrays. You have to choose which features are converted.
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [55]:
# Create a class to one-hot encode the categorical features using pandas.
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = pd.get_dummies(data=X, columns=self.attribute_names)
        return X

In [56]:
# Features to encode.
cat_attribs = ['Ticket_number_length','Pclass','Sex']

In [57]:
# Numeric features to convert into numpy arrays.
num_attribs = ['Age','Fare','SibSp','Parch']

# Categorical features to convert to numpy arrays.
cat_attribs_sel = ['Noble','Reverend','Military','Doctor','Ticket_number_length_1','Ticket_number_length_3',
                   'Ticket_number_length_4','Ticket_number_length_5','Ticket_number_length_6','Ticket_number_length_7',
                   'Pclass_1','Pclass_2','Pclass_3','Sex_male','Sex_female']

In [58]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('minmax_scaler', MinMaxScaler()),
])

cat_pipeline = Pipeline([
    ('autobots', OptimusPrime()),
    ('cat_encoder', OneHotEncoder(cat_attribs)),
    ('selector', DataFrameSelector(cat_attribs_sel)),
])



In [59]:
# Create final training set
full_pipeline = FeatureUnion([
    ('num_pipe', num_pipeline),
    ('cat_pipe', cat_pipeline),
])

titanic_tr_cleaned= full_pipeline.fit_transform(training_set.copy())

In [60]:
test_data = full_pipeline.transform(test_set)

In [61]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import regularizers
import tensorflow as tf

# Config to turn on JIT compilation
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
tf.contrib.data.prefetch_to_device('device')

# Build the model
def ML_Model():
    model = Sequential()
    model.add(Dense(128, input_dim=19, activation='relu')) 
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics = ['accuracy'])
    return model

In [62]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Builds/compiles the NN model, but does NOT fit 
nn_estimator = KerasClassifier(build_fn=ML_Model, epochs=200, batch_size=128, verbose=0)

# Fit the NN model you just built using the KerasClassifier API
nn_estimator.fit(titanic_tr_cleaned, train_titanic_labels)

# Generate stratified cv sets
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

# Cross validate and score the model against the cv set
results = cross_val_score(nn_estimator, titanic_tr_cleaned, train_titanic_labels, cv=kfold)

print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 81.48% (2.65%)


In [21]:
# Create a function that will create dataframe for submission
# Note: This function assumes that you will be using the Scikit prediction method
def kaggle_submission(estimator,datain):
    predictions = estimator.predict(datain)
    predictions = pd.DataFrame(predictions, columns=['Survived'])
    submission = pd.DataFrame(test_set['PassengerId'])
    submission['Survived'] = predictions['Survived']
    return submission

In [22]:
nn_predictions = kaggle_submission(nn_estimator, test_data)
nn_predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [23]:
nn_predictions.to_csv('submition.csv', index=False)

In [27]:
from sklearn.svm import SVC

clf = SVC(C=3, kernel='rbf')
clf.fit(titanic_tr_cleaned, train_titanic_labels)
results = cross_val_score(clf, titanic_tr_cleaned, train_titanic_labels)
score = clf.score(titanic_tr_cleaned, train_titanic_labels)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 79.01% (1.36%)




In [18]:
svm_predictions = kaggle_submission(clf, titanic_test)
svm_predictions.to_csv('submition.csv', index=False)

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

rf_clf.fit(titanic_tr_cleaned, train_titanic_labels)

# Generate stratified cv sets
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

# Cross validate and score the model against the cv set
results = cross_val_score(rf_clf, titanic_tr_cleaned, train_titanic_labels, cv=kfold)

print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 81.03% (1.93%)


In [641]:
titanic_tr_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 26 columns):
Age                            891 non-null float64
SibSp                          891 non-null float64
Parch                          891 non-null float64
Fare                           891 non-null float64
Noble                          891 non-null float64
Reverend                       891 non-null float64
Military                       891 non-null float64
Doctor                         891 non-null float64
Pclass_1                       891 non-null int32
Pclass_2                       891 non-null int32
Pclass_3                       891 non-null int32
FamilySize                     891 non-null float64
Ticket_alpha_num_length_1.0    891 non-null uint8
Ticket_alpha_num_length_2.0    891 non-null uint8
Ticket_alpha_num_length_3.0    891 non-null uint8
Ticket_alpha_num_length_4.0    891 non-null uint8
Ticket_alpha_num_length_5.0    891 non-null uint8
Ticket_alpha_num_length_6.0

In [29]:
rf_clf.feature_importances_

array([0.23667004, 0.23662801, 0.04724375, 0.03887668, 0.00117939,
       0.00093096, 0.00357035, 0.00173428, 0.00281199, 0.00500092,
       0.01286642, 0.02222884, 0.01161115, 0.00523915, 0.02288152,
       0.0133678 , 0.04943343, 0.17344348, 0.11428183])

In [33]:
rf_predictions = kaggle_submission(rf_clf, test_data)
rf_predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [23]:
rf_predictions.to_csv('submition.csv', index=False)

In [34]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=250, class_weight='balanced', n_jobs=-1),n_estimators=150)

ada_clf.fit(titanic_tr_cleaned, train_titanic_labels)

# Generate stratified cv sets
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

# Cross validate and score the model against the cv set
results = cross_val_score(ada_clf, titanic_tr_cleaned, train_titanic_labels, cv=kfold)

print("Results: {0:.2f}% ({0:.2f}%)".format(results.mean()*100, results.std()*100))

Results: 78.79% (1.80%)


In [72]:
from sklearn.model_selection import GridSearchCV

parameters = {"n_estimators": [280,285,287]}

Grid_RF_clf = RandomForestClassifier(bootstrap=True, criterion="gini", n_jobs=-1)

Grid_RF_clf.fit(titanic_tr_cleaned, train_titanic_labels)

Grid_RF_clf_CV = GridSearchCV(Grid_RF_clf, parameters,n_jobs=-1)
Grid_RF_clf_CV.fit(titanic_tr_cleaned , train_titanic_labels)

# Generate stratified cv sets
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

# Cross validate and score the model against the cv set
results = cross_val_score(Grid_RF_clf_CV, titanic_tr_cleaned, train_titanic_labels, cv=kfold)

print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 81.37% (3.09%)


In [26]:
Grid_RF_clf_CV.best_estimator_

print(Grid_RF_clf_CV.best_score_)

0.7991021324354658


In [27]:
parameters = {"n_estimators": [30,35,40,45,55,60,65,100,250,300],"learning_rate":[1,.1,.01,.001,.0001]}

ada_clf_GSCV = AdaBoostClassifier(
    RandomForestClassifier(bootstrap=True,random_state=seed,class_weight="balanced_subsample"))

ada_clf_GSCV.fit(titanic_tr_cleaned , train_titanic_labels)

ada_GSCV = GridSearchCV(ada_clf_GSCV, param_grid=parameters,n_jobs=-1)

ada_GSCV.fit(titanic_tr_cleaned , train_titanic_labels)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity...      verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [30, 35, 40, 45, 55, 60, 65, 100, 250, 300], 'learning_rate': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
print(ada_GSCV.best_score_)

0.8069584736251403


In [29]:
ada_GSCV_best = ada_GSCV.best_estimator_.fit(titanic_tr_cleaned, train_titanic_labels)
ada_GSCV_best_predictions = kaggle_submission(ada_GSCV_best, titanic_test)
ada_GSCV_best_predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [30]:
ada_GSCV_best_predictions.to_csv(r'C:\Anaconda\envs\tensorflow\Kaggle\Titanic\output\submission.csv', index=False)

In [31]:
from sklearn.linear_model import LogisticRegressionCV

lg_clf = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)
lg_clf.fit(titanic_tr_cleaned, train_titanic_labels)

# Cross validate and score the model against the cv set
results = cross_val_score(lg_clf, titanic_tr_cleaned, train_titanic_labels, cv=kfold)

print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 83.28% (2.48%)


In [32]:
lg_clf_predictions = kaggle_submission(lg_clf, titanic_test)
lg_clf_predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [33]:
lg_clf_predictions.to_csv(r'C:\Anaconda\envs\tensorflow\Kaggle\Titanic\output\submission.csv', index=False)