My model draws inspiration from the DataQuest modules, except that I dropped family id as a predictor because I didn't think it was useful.

Import useful libraries:

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import thinkstats2 as ts
import thinkplot as tp
import pylab as P
import math
import re
import operator

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

Set up the preprocessing of the raw dataframe:

In [70]:
def getParchAdj(n):
    """Bucket a Parch count into 0, 1, or 2 (where 2 means "two or more").

    Missing values, as detected by ``pd.isnull``, are handed back unchanged.
    A value that is neither 0, 1, nor greater than 1 yields ``None``.
    """
    if pd.isnull(n):
        return n
    # Everything above one collapses into the single "at least 2" bucket.
    if n > 1:
        return 2
    for bucket in (0, 1):
        if n == bucket:
            return bucket
    return None
    
def getLog(x):
    """Return the natural logarithm of ``x`` by delegating to ``math.log``."""
    natural_log = math.log(x)
    return natural_log

def get_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    A title is the leftmost run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Returns the title without its period, or "" when the name has no such run.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.  The pattern is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# # A function to get the id given a row
# def get_family_id(row):
#     # Find the last name by splitting on a comma
#     last_name = row["Name"].split(",")[0]
#     # Create the family id
#     family_id = "{0}{1}".format(last_name, row["FamilySize"])
#     # Look up the id in the mapping
#     if family_id not in family_id_mapping:
#         if len(family_id_mapping) == 0:
#             current_id = 1
#         else:
#             # Get the maximum id from the mapping and add one to it if we don't have an id
#             current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
#         family_id_mapping[family_id] = current_id
#     return family_id_mapping[family_id]
    
# Load the training data from 'train.csv' (row 0 is the header).
# preprocess() below reads its fill-in medians from this module-level `df`.
df = pd.read_csv('train.csv', header=0)

def preprocess(dframe):
    """Return a cleaned, feature-engineered copy of ``dframe``.

    The input frame is left untouched.  On the copy this:
      * fills missing Age and Fare with the medians of the module-level
        ``df`` (not of ``dframe``), and missing Embarked with "S";
      * recodes Sex (male=0, female=1) and Embarked (S=0, C=1, Q=2);
      * adds Parchadj, AgeLog, FamilySize, NameLength and Title columns.

    Titles missing from the mapping table keep their original string value.
    """
    frame = dframe.copy()

    # Impute the numeric gaps from the reference frame `df`.
    for column in ("Age", "Fare"):
        frame[column] = frame[column].fillna(df[column].median())
    frame["Embarked"] = frame["Embarked"].fillna("S")

    # Swap the text categories for integer codes, one label at a time.
    categorical_codes = (
        ("Sex", (("male", 0), ("female", 1))),
        ("Embarked", (("S", 0), ("C", 1), ("Q", 2))),
    )
    for column, codes in categorical_codes:
        for label, code in codes:
            frame.loc[frame[column] == label, column] = code

    # Derived columns.
    frame['Parchadj'] = frame["Parch"].apply(getParchAdj)
    frame['AgeLog'] = frame["Age"].apply(math.log)
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']
    frame["NameLength"] = frame["Name"].apply(len)

    # Rare titles share a code with a more common, similar title.
    title_mapping = {
        "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
        "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
        "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2,
        "Dona": 10,
    }
    title_codes = frame["Name"].apply(get_title)
    for title, code in title_mapping.items():
        title_codes[title_codes == title] = code
    frame["Title"] = title_codes

    return frame
    

# Swap the raw training frame for its preprocessed copy; the cells below
# train and evaluate on these engineered columns.
df = preprocess(df)

# The columns we'll use to predict the target
# predictors = ["Pclass", "Sex"]
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

I originally used "Parchadj" to categorize the Parch column.  It seemed like any number of parents and children above 2 had the same probability of surviving as exactly 2, so I grouped them all into a category of "at least 2".  I abandoned this column in favor of DataQuest's "FamilySize" column, which performed better and takes siblings into account as well.

Originally, taking the natural logarithm of age and using that instead of actual age improved my scores.  I abandoned that approach because it did not improve my scores using the forest algorithms.

Test the accuracy of our model using Linear Regression and Random Forest Classifier:

In [71]:
# Score from a previous model; each new score is reported relative to it.
BASELINE_SCORE = 0.796857463524

# Initialize our algorithm class
alg = LinearRegression()
# alg = RandomForestClassifier()
# Generate cross validation folds for the titanic dataset.  It returns the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
# NOTE(review): random_state presumably only matters when shuffle=True; with
# the default the folds should already be deterministic -- confirm.
kf = KFold(df.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test1 in kf:
    # The predictors we're using to train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (df[predictors].iloc[train,:])
    # The target we're using to train the algorithm.
    train_target = df["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(df[predictors].iloc[test1,:])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays.  Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0
# Accuracy is the fraction of rows whose prediction equals the label.
# Indexing `predictions` by the match mask and summing the selected values
# would drop every correctly predicted 0, so average the mask itself.
# `.values` keeps the comparison positional (plain array vs. plain array).
accuracy = np.mean(predictions == df["Survived"].values)
# Single-argument print calls produce the same output on Python 2 and 3.
print(accuracy)
print('accuracy improvement: %s' % (accuracy - BASELINE_SCORE))
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
# alg = LogisticRegression(random_state=1)
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)

# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, df[predictors], df["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
print('cv score improvement: %s' % (scores.mean() - BASELINE_SCORE))

0.79797979798
accuracy improvement: 0.0011223344558
0.838383838384
cv score improvement: 0.0415263748598




I used comparison with a previous score to benchmark how well a new model would do compared to a previous model.

Test the accuracy of our model using Gradient Boosting Classifier and Logistic Regression:

In [72]:
# The algorithms we want to ensemble, each paired with the columns it is trained on.
# Both models currently receive the same seven columns (listed in a different order).
# predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross validation folds
kf = KFold(df.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = df["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold.
    # The loop variable is `alg_predictors` so the module-level `predictors`
    # list is not silently rebound to the last algorithm's columns.
    for alg, alg_predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(df[alg_predictors].iloc[train,:], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(df[alg_predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    # Written so it still works if more algorithms are added to the list.
    test_predictions = sum(full_test_predictions) / len(full_test_predictions)
    # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)

# Compute accuracy by comparing to the training data.
# Average the element-wise match directly: indexing `predictions` by the
# match mask and summing would drop every correctly predicted 0.
accuracy = np.mean(predictions == df["Survived"].values)
print(accuracy)

0.821548821549




Make the kaggle submission csv:

In [73]:
# Load the test set and apply the same preprocessing as the training data.
# preprocess() fills missing Age/Fare from the medians of the module-level `df`.
testf = pd.read_csv('test.csv', header=0)
testf = preprocess(testf)

# Relative weights of the two ensemble members, in `algorithms` order.
# The gradient boosting classifier generates better predictions, so we weight it higher.
GBC_WEIGHT = 3
LOGREG_WEIGHT = 1

full_predictions = []
# The loop variable is `alg_predictors` so the module-level `predictors`
# list is not rebound as a side effect of this cell.
for alg, alg_predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(df[alg_predictors], df["Survived"])
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    survival_probability = alg.predict_proba(testf[alg_predictors].astype(float))[:,1]
    full_predictions.append(survival_probability)

# Weighted average of the two probability vectors.
predictions = (full_predictions[0] * GBC_WEIGHT + full_predictions[1] * LOGREG_WEIGHT) / (GBC_WEIGHT + LOGREG_WEIGHT)

# Anything strictly above .5 becomes 1, everything else 0.
predictions = (predictions > .5).astype(int)

submission = pd.DataFrame({
    "PassengerId": testf["PassengerId"],
    "Survived": predictions
})

submission.to_csv("kaggle.csv", index=False)

This model got a Kaggle score of 0.79426.  This was slightly lower than I had hoped, since I had predicted an accuracy of 0.8215, but it is still a decent number.  In the future, I could look into categorizing other columns, such as fare (maybe all fares above $100 have the same survival rate?).