# Imports

In [100]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation

# Load and clean data

In [101]:
def cleanData(data, median_age, median_fare=None):
    """
    Take in the raw data and median age from the training data
    and return a cleaned version for use with our model.

    The frame passed in is left untouched; the cleaning is applied to a
    copy, which is returned.

    data        -- DataFrame with Age, Sex, Embarked and Fare columns
    median_age  -- value used for missing ages (use the training median!)
    median_fare -- optional value used for missing fares; when omitted,
                   the median fare of `data` itself is used
    """
    # Work on a copy so the caller's frame is not modified behind its back
    data = data.copy()

    # Replace missing ages with the median age (from the training data!)
    data['Age'] = data['Age'].fillna(median_age)

    # Encode male as 0 and female as 1.  Mapping the whole column at once
    # (instead of overwriting strings cell by cell) yields a numeric column.
    # Any value other than 'male'/'female' becomes NaN.
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

    # Replace missing port of embarkation with Southampton
    # Encode Southampton as 0, Cherbourg as 1, and Queenstown as 2
    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    data['Embarked'] = data['Embarked'].fillna('S').map(port_codes)

    # Replace missing fares with the median fare
    if median_fare is None:
        median_fare = data['Fare'].median()
    data['Fare'] = data['Fare'].fillna(median_fare)

    return data

In [102]:
# Read the raw training and test sets
titanic = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

# Both sets are cleaned with the median age of the training set.
# Filling missing ages with the median does not move the median, so it
# can be computed once up front.
training_median_age = titanic.Age.median()
titanic = cleanData(titanic, training_median_age)
titanic_test = cleanData(titanic_test, training_median_age)

# Use linear regression to create a model

In [103]:
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.  It returns the row indices corresponding to train and test.
# NOTE(review): shuffling is not requested here, so the folds should be the same
# consecutive slices on every run and random_state likely has no effect -- confirm
# against the KFold documentation for the installed scikit-learn version.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm.
    train_target = titanic.Survived.iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays.  Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
# The comparison below relies on the concatenated folds being in the
# same row order as the titanic frame.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions = np.where(predictions > .5, 1, 0)

# Calculate accuracy: the mean of the element-wise matches is the
# fraction of correct predictions
accuracy = (titanic.Survived == predictions).mean()
print('{:.3f}% accuracy'.format(accuracy * 100))

78.339% accuracy


# Use logistic regression to create a model

In [104]:
# Set predictors
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the score for each of the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic.Survived, cv=3)
# Take the mean of the scores (because we have one for each fold).
# print is called with parentheses so the cell also parses under Python 3;
# with a single argument the output is unchanged under Python 2.
print('{:.3f}% accuracy'.format(scores.mean() * 100))

78.788% accuracy


# Apply model to test data

In [105]:
def xValidateLogModel(train, predictors):
    """
    Cross validate a logistic regression on the training data.

    The model is scored on the given predictor columns against the
    Survived column using 3 folds; the mean of the fold scores is returned.
    """
    features = train[predictors]
    target = train.Survived

    model = LogisticRegression(random_state=1)
    fold_scores = cross_validation.cross_val_score(model, features, target, cv=3)

    return fold_scores.mean()

def testLogModel(train, test, predictors):
    """
    Fit a logistic regression on all of the training data and predict
    survival for the testing data, using the given predictor columns.

    Returns the submission dataframe: the PassengerId of each test row
    alongside its predicted Survived value.
    """
    # Fit on every training row
    model = LogisticRegression(random_state=1)
    model.fit(train[predictors], train.Survived)

    # Predict the outcome for each test row
    survived = model.predict(test[predictors])

    # Keep only the columns Kaggle wants in the submission
    submission_columns = {
        'PassengerId': test.PassengerId,
        'Survived': survived
    }
    return pd.DataFrame(submission_columns)

In [106]:
# First submission: every predictor, including the port of embarkation
predictors = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
]

# Fit on the full training set and predict the test set
submission = testLogModel(titanic, titanic_test, predictors)

# Save the predictions as a CSV file without the index column
submission.to_csv('submission_1.csv', index=False)

75.120% accuracy

# Second model iteration

A change I would like to make to the model is to ignore the port of embarkation.  It doesn't seem like this factor should influence a passenger's survival.  When exploring the data, I did find that there was a difference in survival rates between the ports, but I hypothesize that this has to do with the percentages of upper class, female, and child passengers from each place rather than anything about the port itself.  So, it seems odd to me to include the port of embarkation in the model.

In [107]:
# Same predictors as before, minus the port of embarkation
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
score = xValidateLogModel(titanic, predictors)
# print is called with parentheses so the cell also parses under Python 3;
# with a single argument the output is unchanged under Python 2.
print('{:.3f}% accuracy'.format(score * 100))

79.012% accuracy


The cross validation shows a slight improvement, so let's try it on the test data!

In [108]:
# Second submission: the port of embarkation is left out
predictors = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
]
submission = testLogModel(titanic, titanic_test, predictors)
submission.to_csv('submission_2.csv', index=False)

74.163% accuracy

Even though cross validation on the training data showed an improvement, the model performed worse on the test data, and I'm not sure why.  Does including the port of embarkation give more weight to the underlying factors that differentiate the ports (proportions of class, sex, age, etc.)?  If so, why doesn't the model weight these factors appropriately once the port information is removed?  If not, why _is_ the port of embarkation useful when my intuition says that it should not be?

# Third model iteration

Something else seen when exploring the data is that young children tended to survive regardless of gender.  By adding a flag for young children, I hope to bring this out more.

In [109]:
# Flag young children (age 8 and under) with 1 and everyone else with 0,
# applying the same rule to the training and the test set.  The comparison
# is evaluated for the whole column at once, so every row gets a 0/1 value
# (a missing age would compare as False and be flagged 0).
for frame in (titanic, titanic_test):
    frame['IsChild'] = (frame.Age <= 8).astype(int)

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "IsChild"]
score = xValidateLogModel(titanic, predictors)
# print is called with parentheses so the cell also parses under Python 3;
# with a single argument the output is unchanged under Python 2.
print('{:.3f}% accuracy'.format(score * 100))

80.471% accuracy


This is showing a greater improvement than the last iteration, but let's see if it can improve with the test data.

In [110]:
# Third submission: all of the original predictors plus the IsChild flag
predictors = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "IsChild",
]
submission = testLogModel(titanic, titanic_test, predictors)
submission.to_csv('submission_3.csv', index=False)

76.555% accuracy

It did improve!  The new model guessed correctly for 6 more of the test passengers than the initial iteration did.  Honestly, I'm not entirely sure why this improved the model.  Perhaps a logistic regression cannot represent a step, like the one in the survival rate of males between children and adults, from the continuous Age feature alone, and adding a feature that reflects the step helped?