## Import basic modules we will use and load in the train and test set

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# import statsmodels.api as sm

%matplotlib inline

# A notebook kernel does not define `__file__`, so the data directory is
# resolved from the working directory the notebook is run in. The Kaggle
# CSVs are expected to sit in that directory.
cur_dir = os.getcwd()

train = pd.read_csv(os.path.join(cur_dir, "train.csv"))
test = pd.read_csv(os.path.join(cur_dir, "test.csv"))

## Preprocess the data. In this case, we encode `Sex` as a 0/1 indicator and fill all `NaN` values in the `Age` and `Embarked` columns. We also create dummy variables out of the categorical columns `Pclass` and `Embarked`, plus a few derived features.

In [3]:
# Name of the label column; the cells below pass `train[target]` as `y`.
target = "Survived"

def preprocess(df):
    """Return a feature-engineered copy of a passenger frame.

    The input frame is left untouched, so the function can be re-run on the
    same raw data and always yields the same result.

    Steps:
      * ``Sex`` becomes a 0/1 indicator (1 where the value is ``"male"``).
      * Missing ``Age`` values are replaced by the column mean.
      * Missing ``Embarked`` values are replaced by the most frequent value.
      * Derived columns ``oneSibSp``, ``hasParch``, ``maleAge`` and ``is20``
        are added.
      * ``Pclass`` and ``Embarked`` are one-hot encoded with the prefixes
        ``class_`` and ``emb_`` and appended as extra columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Sex``, ``Age``, ``Embarked``,
        ``SibSp``, ``Parch`` and ``Pclass``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns (with the replacements
        above), the derived columns and the dummy columns.
    """
    # Work on a copy: mutating the caller's frame would make a second call
    # see an already-encoded `Sex` column and turn it into all zeros.
    df = df.copy()

    df["Sex"] = (df["Sex"] == "male").astype(int)
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # `Series.mode()` returns a Series; passing it to `fillna` directly would
    # align on index and only ever fill row 0. Use the scalar mode instead,
    # and skip the fill when the column has no non-null value at all.
    embarked_modes = df["Embarked"].mode()
    if len(embarked_modes) > 0:
        df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])

    df["oneSibSp"] = (df["SibSp"] == 1).astype(int)
    df["hasParch"] = (df["Parch"] > 0).astype(int)
    # Interaction term: age counts only for male passengers (Sex == 1).
    df["maleAge"] = df["Sex"] * df["Age"]
    df["is20"] = df["Age"] >= 20

    dummy_class = pd.get_dummies(df["Pclass"], prefix="class")
    dummy_emb = pd.get_dummies(df["Embarked"], prefix="emb")

    return pd.concat([df, dummy_class, dummy_emb], axis=1)

# Run both raw frames through the same feature pipeline; each name is rebound
# to the processed frame returned by `preprocess`.
train, test = preprocess(train), preprocess(test)

In [None]:
# Display the first rows of the preprocessed test frame.
test.head()

### This is some `statsmodels` `Logit` (logistic regression) code. I ended up using `sklearn` instead because of its easy-to-use `cross_validation` module.

In [None]:
# from statsmodels.tools.sm_exceptions import PerfectSeparationError

# # baseline_cols = ["Sex", "Age", "class_2", "class_3", "intercept"]
# baseline_cols = ["Sex", "Age", "class_2", "class_3", "intercept"]
# t = []
# for name in df.columns:
#     if name == "Survived" or name in baseline_cols:
#         continue

#     independents = baseline_cols + [name]

# #         for i in independents:
# #             print df[i].dtype

#     logit = sm.Logit(df["Survived"], df[baseline_cols + [name]], missing='drop')
#     result = logit.fit()

#     print result.prsquared
    

## Let's do some data mining. We assume that age, sex, and class are all good predictors. Let's check if any of the other variables give us any additional insight beyond these three features.

In [4]:
from sklearn.linear_model import LogisticRegression

# `sklearn.cross_validation` was removed in scikit-learn 0.20 in favour of
# `sklearn.model_selection`. Prefer the replacement, but keep the name
# `cross_validation` bound so the cells further down keep working unchanged.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn import cross_validation

alg = LogisticRegression(random_state=1)

possible_predictors = ["Sex", "Age", "is20", "Fare", "oneSibSp", "hasParch", "maleAge", "class_2", "class_3",
                       "emb_Q", "emb_S"]

baseline_cols = ["Sex", "Age", "class_2", "class_3"]


def _mean_cv_score(columns):
    """Return the mean 3-fold cross-validation score of `alg` on `train[columns]`."""
    return cross_validation.cross_val_score(alg, train[columns], train[target], cv=3).mean()


baseline_score = _mean_cv_score(baseline_cols)

# Score every candidate feature when it is added on top of the baseline set.
# The label column and features already in the baseline are skipped.
scores = [
    (name, _mean_cv_score(baseline_cols + [name]))
    for name in possible_predictors
    if name != target and name not in baseline_cols
]

# Best-scoring candidate first.
scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Single-argument print(...) calls produce the same output on Python 2 and 3.
print("Added predictiveness beyond Age, class, and sex:")
print("")
for var, score in scores:
    print("%s: %0.2f%% extra accuracy on predictions" % (var, (score - baseline_score) * 100))

Added predictiveness beyond Age, class, and sex:

maleAge: 0.45% extra accuracy on predictions
emb_Q: 0.34% extra accuracy on predictions
is20: 0.22% extra accuracy on predictions
emb_S: 0.00% extra accuracy on predictions
oneSibSp: -0.11% extra accuracy on predictions
Fare: -0.45% extra accuracy on predictions
hasParch: -0.56% extra accuracy on predictions


## So adjusting for age's effects based on Sex seems to help a bit, and there might be something special about the age 20. I chose to omit all other features from the final model because they added very little benefit and I'm not convinced it was due to anything other than noise.

In [5]:
# Feature set kept for the final model: the baseline columns plus the two
# engineered age features.
predictors = ["Sex", "Age", "class_2", "class_3", "maleAge", "is20"]

# Per-fold scores from 3-fold cross-validation, averaged into one number.
fold_scores = cross_validation.cross_val_score(alg, train[predictors], train[target], cv=3)
score = fold_scores.mean()
score

0.79012345679012341

## 79% accuracy isn't bad! Let's check what the baseline accuracy is, i.e. the accuracy we would get by always predicting that a passenger did not survive:

In [8]:
# Complement of the mean of the `Survived` column.
survived_mean = train['Survived'].mean()
1 - survived_mean

0.61616161616161613

## Generate a submission

In [9]:
# Fit a fresh model on the full training set (no hold-out folds this time).
alg = LogisticRegression(random_state=1)
alg.fit(train[predictors], train[target])

# Predict the label for every row of the test set.
predictions = alg.predict(test[predictors])

# Kaggle expects exactly two columns: the passenger id and the predicted label.
submission_columns = {
    "PassengerId": test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)

# Write the file without the index so it contains only those two columns.
submission.to_csv("kaggle.csv", index=False)