# Model Iteration 2
Pratool Gadtaula

Load the Titanic data file and clean the data from the previous iteration of the model.

In [67]:
%matplotlib inline

import pandas
import numpy as np

tdf = pandas.read_csv("train.csv")

tdf.loc[tdf["Sex"] == "male", "Sex"] = 0
tdf.loc[tdf["Sex"] == "female", "Sex"] = 1

tdf["Age"] = tdf["Age"].fillna(tdf["Age"].median())

embarked_list = ["Embarked_S", "Embarked_C", "Embarked_Q"]

tdf["Embarked_S"] = np.zeros(len(tdf["Embarked"]))
tdf["Embarked_C"] = np.zeros(len(tdf["Embarked"]))
tdf["Embarked_Q"] = np.zeros(len(tdf["Embarked"]))

tdf["Embarked"] = tdf["Embarked"].fillna('S')
tdf.loc[tdf["Embarked"] == "S", "Embarked_S"] = 1
tdf.loc[tdf["Embarked"] == "C", "Embarked_C"] = 1
tdf.loc[tdf["Embarked"] == "Q", "Embarked_Q"] = 1

unique_identifiers = set([])
for el in tdf["Name"].unique():
    for name_split in el.split():
        if '.' in name_split and name_split not in unique_identifiers:
            unique_identifiers.add(name_split)
            
for identifier in unique_identifiers:
    tdf[identifier] = [ 1 if identifier in tdf["Name"][i] else 0 for i in range(len(tdf["Name"])) ]

print tdf.head(5)
print tdf.describe()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name Sex  Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris   0   22      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   1   38      1      0   
2                             Heikkinen, Miss. Laina   1   26      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   1   35      1      0   
4                           Allen, Mr. William Henry   0   35      0      0   

             Ticket     Fare  ...  Capt. Col.  Ms.  Mr.  Lady.  Jonkheer.  \
0         A/5 21171   7.2500  ...      0    0    0    1      0          0   
1          PC 17599  71.2833  ...      0    0    0    0      0          0   
2  STON/O2. 3101282   7.9250  ...      0    0    0    0      0          0   


Instead of using a logistic regression classifier, I'll use a random forest instead. A random forest will test out decision trees with a certain amount of randomness in the decision tree predictor in order to avoid overfitting the data.

In [68]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
#predictors.extend(embarked_list)
predictors.extend(unique_identifiers)

# Initialize the algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends
# (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)

scores = cross_validation.cross_val_score(alg, tdf[predictors], tdf["Survived"],cv=3).mean()
print(scores)

0.827160493827


This seemed to have done slightly better, but it's hard to tell whether this is actually due to overfitting. So I'll submit it to Kaggle to check.

In [69]:
tdf_test = pandas.read_csv("test.csv")

# Clean data
tdf_test.loc[tdf_test["Sex"] == "male", "Sex"] = 0
tdf_test.loc[tdf_test["Sex"] == "female", "Sex"] = 1
tdf_test["Age"] = tdf_test["Age"].fillna(tdf_test["Age"].median())
tdf_test["Embarked_S"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_C"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_Q"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked"] = tdf_test["Embarked"].fillna('S')
tdf_test.loc[tdf_test["Embarked"] == "S", "Embarked_S"] = 1
tdf_test.loc[tdf_test["Embarked"] == "C", "Embarked_C"] = 1
tdf_test.loc[tdf_test["Embarked"] == "Q", "Embarked_Q"] = 1
tdf_test["Fare"] = tdf_test["Fare"].fillna(tdf_test["Fare"].median())

for identifier in unique_identifiers:
    tdf_test[identifier] = [ 1 if identifier in tdf_test["Name"][i] else 0 for i in range(len(tdf_test["Name"])) ]

alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)

# Train the algorithm using all the training data
alg.fit(tdf[predictors], tdf["Survived"])

# Make predictions using the test set.
predictions = alg.predict(tdf_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": tdf_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle.csv", index=False)

After submitting it, it received a score lower than simply using a logistic regression on the same predictors. Thus, I have a feeling that the data has been overfit or that there isn't an easy generalization to make from the decision tree (there is a large amount of variability in the input data that could most accurately predict survival). So I try making a more general column for family size (the sum of siblings, spouses, parents, and children on board) and added a new column name length that could possibly encode more data.

In [70]:
tdf["FamilySize"] = tdf["SibSp"] + tdf["Parch"]
tdf["NameLength"] = tdf["Name"].apply(lambda x: len(x))

Now I use these new columns in conjunction with all the other ones previously generated with the random forest.

In [71]:
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "FamilySize", "NameLength"]
predictors.extend(unique_identifiers)

alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)

scores = cross_validation.cross_val_score(alg, tdf[predictors], tdf["Survived"],cv=3).mean()
print(scores)

0.83164983165


This seems to yield decent results so I try it out on Kaggle.

In [72]:
tdf_test = pandas.read_csv("test.csv")

# Clean data
tdf_test.loc[tdf_test["Sex"] == "male", "Sex"] = 0
tdf_test.loc[tdf_test["Sex"] == "female", "Sex"] = 1
tdf_test["Age"] = tdf_test["Age"].fillna(tdf_test["Age"].median())
tdf_test["Embarked_S"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_C"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_Q"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked"] = tdf_test["Embarked"].fillna('S')
tdf_test.loc[tdf_test["Embarked"] == "S", "Embarked_S"] = 1
tdf_test.loc[tdf_test["Embarked"] == "C", "Embarked_C"] = 1
tdf_test.loc[tdf_test["Embarked"] == "Q", "Embarked_Q"] = 1
tdf_test["Fare"] = tdf_test["Fare"].fillna(tdf_test["Fare"].median())

for identifier in unique_identifiers:
    tdf_test[identifier] = [ 1 if identifier in tdf_test["Name"][i] else 0 for i in range(len(tdf_test["Name"])) ]
    
tdf_test["FamilySize"] = tdf_test["SibSp"] + tdf_test["Parch"]
tdf_test["NameLength"] = tdf_test["Name"].apply(lambda x: len(x))

alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)

# Train the algorithm using all the training data
alg.fit(tdf[predictors], tdf["Survived"])

# Make predictions using the test set.
predictions = alg.predict(tdf_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": tdf_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle.csv", index=False)

This performs even <b>worse</b> than before! Adding more columns with barely any new information is deteriorating the generality of the random forests; I'm definitely overfitting the data. Instead, I'll try a different algorithm called Gradient Boosting which builds trees one after another in succession. This is also prone to overfitting the data, so I limit it to a maximum depth of 3 and only 25 trees.

In [73]:
from sklearn.ensemble import GradientBoostingClassifier

alg = GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)

scores = cross_validation.cross_val_score(alg, tdf[predictors], tdf["Survived"],cv=3).mean()
print(scores)

0.830527497194


This also seems hopeful, but I want to mix it with the probabilities of an algorithm that is less prone to overfitting. A linear regression model fits that criteria. For now I just want to test it with all the parameters I have so far.

In [74]:
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)

scores = cross_validation.cross_val_score(alg, tdf[predictors], tdf["Survived"],cv=3).mean()
print(scores)

0.821548821549


That's pretty good! It's nearly on par with the Gradient Boosting algorithm, despite some redundancies in the data. But before I mix the probabilities, I want to check out which factors seem to be most important to the linear classifier.

In [75]:
linear_predictors = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
linear_predictors.extend(unique_identifiers)

alg = LogisticRegression(random_state=1)

scores = cross_validation.cross_val_score(alg, tdf[linear_predictors], tdf["Survived"],cv=3).mean()
print(scores)

0.822671156004


It seems that avoiding the number of siblings and spouses and parents and children and name length yields almost the same prediction rate. I'll stick to these columns for the logistic regression and use all of the columns for the Gradient Boosting algorithm. However, I want a better metric for how well the algorithm is predicting the outcomes. For that I decided to make a metric (that I like to call "anti-confidence"). It takes as a function the probability of the prediction, the actual prediction, and the actual outcome. The value is then $(|r-o|+r-p)^2$, where $r$ is the prediction (0 or 1), $o$ is the outcome (0 or 1), and $p$ is the probablity ($0 \leq p \leq 1$). This penalizes for probabilities that are closer to 0.5, with a penalty that would decrease if the outcome was correct but increase a lot more with an outcome that was incorrect. I define the function below.

In [76]:
def conf_penalty(probability, prediction, outcome):
    """
    Takes the probability, prediction, and outcome of a specific prediction algorithm and
    returns a penalty that quantifies how badly the algorithm did. Higher values are worse.
    """
    return ( abs(prediction-outcome)+prediction-probability )**2

In order to easily compare the effectiveness of using different prediction methods and factors, I made some function wrappers for quick testing.

In [182]:
from sklearn.cross_validation import KFold

def test_alg(algs_predictors, df):
    """
    Automate testing of various predictive algorithms at once
    
    algs_predictors is a list of tuples that contain 1) a scikit-learn algorithm and
                    2) a list of the column names to be used as predictors
                    
    df              is a pandas data frame object that has all the column names to be used in algs_predictors
    
    This returns a tuple of a all the predictions and all the outcomes.
    
    full_test_predictions is a list of numpy arrays of the predictions on the input data with various test-train splits
    full_outcomes         is a list of numpy arrays of all the outcomes used to test the data with various test-train splits
    """
    kf = KFold(df.shape[0], n_folds=3, random_state=1)
    full_test_predictions = []
    full_outcomes = []
    for alg, predictor in algs_predictors:
        alg_predictions = []
        alg_outcomes = []
        for train, test in kf:
            train_target = df["Survived"].iloc[train]
            alg.fit(df[predictor].iloc[train,:], train_target)
            test_predictions = alg.predict_proba(df[predictor].iloc[test,:].astype(float))[:,1]
            alg_predictions.append(test_predictions)
            alg_outcomes.append(df["Survived"].iloc[test])
        alg_predictions = np.concatenate(alg_predictions)
        alg_outcomes = np.concatenate(alg_outcomes)
        full_test_predictions.append(alg_predictions)
        full_outcomes.append(alg_outcomes)
    return (full_test_predictions, full_outcomes)

def eval_tests(predictions, outcomes):
    """
    This function evaluates the effectiveness of the predictions by using the conf_penalty function as a metric
    and sums up the the metrics for each data point. This is primarily used to complement test_alg()
    
    predictions is a list of numpy arrays of the predictions made on the input data by various algorithms
    
    Output:
    outcomes is a list of values that is the sum of the metrics for each data point (the number of items
             in the list is the number of algorithms used)
    """
    pred_penalties = []
    for predicts, outs in zip(predictions, outcomes):
        pred_penalties.append( np.sum([ conf_penalty( predicts[i], int(predicts[i]>0.5), \
                                       outs[i] ) for i in range(len(outs)) ]) )
    return pred_penalties

I tested my anti-confidence rating for the different predictors on logistic regression.

In [78]:
algs_predicts = [ (LogisticRegression(random_state=1), linear_predictors),
                  (LogisticRegression(random_state=1), predictors) ]

tp, o = test_alg(algs_predicts, tdf)

print eval_tests(tp, o)

[199.21772701718064, 208.82362426371984]


Unsurprisingly, the linear predictors performed better than all the predictors for the logistic regression. I'll use an ensemble of logistic regression and the gradient boosting classifier to make new predictions.

In [79]:
def ensemble_algs(algs_predictors, df):
    kf = KFold(df.shape[0], n_folds=3, random_state=1)
    predictions = []
    for train, test in kf:
        train_target = df["Survived"].iloc[train]
        full_test_predictions = []
        # Make predictions for each algorithm on each fold
        for alg, preds in algs_predictors:
            # Fit the algorithm on the training data.
            alg.fit(df[preds].iloc[train,:], train_target)
            # Select and predict on the test fold.  
            # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
            test_predictions = alg.predict_proba(df[preds].iloc[test,:].astype(float))[:,1]
            full_test_predictions.append(test_predictions)
        # Use a simple ensembling scheme -- just average the predictions to get the final classification.
        test_predictions = np.sum(full_test_predictions, axis=0) / len(full_test_predictions)
        # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.
        test_predictions[test_predictions <= .5] = 0
        test_predictions[test_predictions > .5] = 1
        predictions.append(test_predictions)
    predictions = np.concatenate(predictions, axis=0)
    return predictions

In [80]:
algs_predicts = [(LogisticRegression(random_state=1), linear_predictors),
                  (GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors) ]
print eval_tests([ensemble_algs(algs_predicts, tdf)], [tdf["Survived"]])

[154.0]


The ensemble performed much better than just the logistic regression. I'll use this method and submit it to Kaggle!

In [81]:
tdf_test = pandas.read_csv("test.csv")

# Clean data
tdf_test.loc[tdf_test["Sex"] == "male", "Sex"] = 0
tdf_test.loc[tdf_test["Sex"] == "female", "Sex"] = 1
tdf_test["Age"] = tdf_test["Age"].fillna(tdf_test["Age"].median())
tdf_test["Embarked_S"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_C"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_Q"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked"] = tdf_test["Embarked"].fillna('S')
tdf_test.loc[tdf_test["Embarked"] == "S", "Embarked_S"] = 1
tdf_test.loc[tdf_test["Embarked"] == "C", "Embarked_C"] = 1
tdf_test.loc[tdf_test["Embarked"] == "Q", "Embarked_Q"] = 1
tdf_test["Fare"] = tdf_test["Fare"].fillna(tdf_test["Fare"].median())

# Add new columns
for identifier in unique_identifiers:
    tdf_test[identifier] = [ 1 if identifier in tdf_test["Name"][i] else 0 for i in range(len(tdf_test["Name"])) ]
    
tdf_test["FamilySize"] = tdf_test["SibSp"] + tdf_test["Parch"]
tdf_test["NameLength"] = tdf_test["Name"].apply(lambda x: len(x))

# Select prediction identifiers
linear_predictors = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
linear_predictors.extend(unique_identifiers)

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "FamilySize", "NameLength"]
predictors.extend(unique_identifiers)

# Arrange into an algorithm ensemble
algs_predicts = [(LogisticRegression(random_state=1), linear_predictors),
                  (GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors) ]

full_predictions = []
for alg, preds in algs_predicts:
    # Fit the algorithm using the full training data.
    alg.fit(tdf[preds], tdf["Survived"])
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(tdf_test[preds].astype(float))[:,1]
    full_predictions.append(predictions)

test_predictions = np.sum(full_predictions, axis=0) / len(full_predictions)
predictions = (predictions > 0.5).astype(int)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": tdf_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle.csv", index=False)

This entry had the same performace as my previous best entry, with an accuracy of 0.78947. Now I'll try weighting the gradient boosting classifier more because it did perform better on the test data set and submit it to Kaggle.

In [82]:
tdf_test = pandas.read_csv("test.csv")

# Clean data
tdf_test.loc[tdf_test["Sex"] == "male", "Sex"] = 0
tdf_test.loc[tdf_test["Sex"] == "female", "Sex"] = 1
tdf_test["Age"] = tdf_test["Age"].fillna(tdf_test["Age"].median())
tdf_test["Embarked_S"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_C"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked_Q"] = np.zeros(len(tdf_test["Embarked"]))
tdf_test["Embarked"] = tdf_test["Embarked"].fillna('S')
tdf_test.loc[tdf_test["Embarked"] == "S", "Embarked_S"] = 1
tdf_test.loc[tdf_test["Embarked"] == "C", "Embarked_C"] = 1
tdf_test.loc[tdf_test["Embarked"] == "Q", "Embarked_Q"] = 1
tdf_test["Fare"] = tdf_test["Fare"].fillna(tdf_test["Fare"].median())

# Add new columns
for identifier in unique_identifiers:
    tdf_test[identifier] = [ 1 if identifier in tdf_test["Name"][i] else 0 for i in range(len(tdf_test["Name"])) ]
    
tdf_test["FamilySize"] = tdf_test["SibSp"] + tdf_test["Parch"]
tdf_test["NameLength"] = tdf_test["Name"].apply(lambda x: len(x))

# Select prediction identifiers
linear_predictors = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
linear_predictors.extend(unique_identifiers)

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "FamilySize", "NameLength"]
predictors.extend(unique_identifiers)

# Arrange into an algorithm ensemble
algs_predicts = [(LogisticRegression(random_state=1), linear_predictors),
                  (GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors) ]

full_predictions = []
for alg, preds in algs_predicts:
    # Fit the algorithm using the full training data.
    alg.fit(tdf[preds], tdf["Survived"])
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(tdf_test[preds].astype(float))[:,1]
    full_predictions.append(predictions)

test_predictions = (full_predictions[0]*3 + full_predictions[1]) / 4
predictions = (predictions > 0.5).astype(int)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": tdf_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle.csv", index=False)

This entry also had the same performace as my previous best entry, with an accuracy of 0.78947.

After reading <a href="http://elenacuoco.altervista.org/blog/archives/1195">this</a> blog, I realized that not calculating the age correctly could have huge consequences on the prediction; it's one of the most important factors in deciding survival. Instead of filling all the ages with the median, I'll fill them according to their title. There are quite a few titles that could be collapsed into four general ones, Mr, Mrs, Miss, and Master. Master is not a colloquially spoken today, but back at the time of the titanic referred to boys who were not deemed to be "men" by society yet. There was a large portion of the passengers with each of these titles, so it seems like a better way to imput the ages that were not given.

In [179]:
tdf = pandas.read_csv("train.csv")

def clean_data(df):
    df.loc[df["Sex"] == "male", "Sex"] = 0
    df.loc[df["Sex"] == "female", "Sex"] = 1

    df["FamilySize"] = df["SibSp"] + df["Parch"]
    df["NameLength"] = df["Name"].apply(lambda x: len(x))

    embarked_list = ["Embarked_S", "Embarked_C", "Embarked_Q"]

    df["Embarked_S"] = np.zeros(len(df["Embarked"]))
    df["Embarked_C"] = np.zeros(len(df["Embarked"]))
    df["Embarked_Q"] = np.zeros(len(df["Embarked"]))

    df["Embarked"] = df["Embarked"].fillna('S')
    df.loc[df["Embarked"] == "S", "Embarked_S"] = 1
    df.loc[df["Embarked"] == "C", "Embarked_C"] = 1
    df.loc[df["Embarked"] == "Q", "Embarked_Q"] = 1
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

unique_identifiers = set([])
for el in tdf["Name"].unique():
    for name_split in el.split():
        if '.' in name_split and name_split not in unique_identifiers and not name_split == 'L.':
            unique_identifiers.add(name_split)

def make_identifiers(df):
    for identifier in unique_identifiers:
        df[identifier] = df["Name"].apply(lambda x: 1 if identifier in x else 0)

clean_data(tdf)
make_identifiers(tdf)

def classify_age_title(df):
    age_titles = {}
    age_titles["Mr"] = ["Mr.", "Rev.", "Sir.", "Col.", "Capt.", "Major.", "Don.", "Jonkheer.", "Dr."]
    age_titles["Mrs"] = ["Mrs.", "Mme.", "Lady.", "Countess."]
    age_titles["Miss"] = ["Miss.", "Mlle.", "Ms."]
    age_titles["Master"] = ["Master."]

    for category in age_titles:
        temp_category_vals = df[age_titles[category][0]].apply(lambda x: x)
        for title in age_titles[category]:
            temp_category_vals += df[title].apply(lambda x: x)
        df[category] = temp_category_vals

        # Make the category boolean
        df.loc[df[category] > 0, category] = 1

        # Determine the mean of the category with the values that exist
        cur_mean = df["Age"][df[category] == 1].dropna()
        cur_mean = cur_mean.mean()

        # Set the age values in the category that do not have values with current mean
        df.loc[( (df[category] == 1) & (np.isnan(df["Age"])) ), "Age"] = cur_mean
    
    cur_median = df["Age"].dropna()
    cur_median = cur_median.median()
    df.loc[(np.isnan(df["Age"])), "Age"] = cur_median
    
classify_age_title(tdf)

In [180]:
# Select prediction identifiers
linear_predictors = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
linear_predictors.extend(unique_identifiers)

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "FamilySize", "NameLength"]
predictors.extend(unique_identifiers)

algs_predicts = [(LogisticRegression(random_state=1), linear_predictors),
                  (GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors) ]
print eval_tests([ensemble_algs(algs_predicts, tdf)], [tdf["Survived"]])

[154.0]


This did not seem to have much of an effect on the training data set. However, I think that misclassifying due to putting the wrong age from just using the median could make a difference. I'll go ahead and submit it to Kaggle anyway because I do think that I can make more informed decisions.

In [181]:
tdf_test = pandas.read_csv("test.csv")

clean_data(tdf_test)
make_identifiers(tdf_test)
classify_age_title(tdf_test)

# Arrange into an algorithm ensemble
algs_predicts = [(LogisticRegression(random_state=1), linear_predictors),
                  (GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors) ]

full_predictions = []
for alg, preds in algs_predicts:
    # Fit the algorithm using the full training data.
    alg.fit(tdf[preds], tdf["Survived"])
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(tdf_test[preds].astype(float))[:,1]
    full_predictions.append(predictions)

test_predictions = (full_predictions[0]*3 + full_predictions[1]) / 4
predictions = (predictions > 0.5).astype(int)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": tdf_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle.csv", index=False)

Sure enough my score increased by 0.01435! Improving the imputation of the age and including the titles garnered the greatest improvements to the predictions. This is likely because they are more strongly correlated to survival rate than other factors such as fare.