# Read in data

In [34]:
import pandas

# Load the training set from a path relative to the working directory.
data = pandas.read_csv(filepath_or_buffer="../dataset/train.csv")
# Show count/mean/std/quartiles for the columns that describe() summarises.
print(data.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


# Fill null data entries

In [35]:
# Replace missing ages with the mean of the known ages. The right-hand side is
# evaluated before the assignment, so the mean ignores the missing entries.
data.loc[data["Age"].isnull(), "Age"] = data["Age"].mean()
# Replace missing embarkation ports with "S".
data.loc[data["Embarked"].isnull(), "Embarked"] = "S"
# Preview the first five rows of the two columns that were filled.
print(data[["Age", "Embarked"]].head(5))

   Age Embarked
0   22        S
1   38        C
2   26        S
3   35        S
4   35        S


# Convert non-numeric entries into numbers

In [36]:
# Numeric code for every text label, per column. Each label is rewritten in
# place, so running this cell a second time finds no labels left and is a no-op.
for column, label_codes in (
    ("Sex", (("male", 0), ("female", 1))),
    ("Embarked", (("S", 0), ("C", 1), ("Q", 2))),
):
    for label, code in label_codes:
        data.loc[data[column] == label, column] = code

# Preview the encoded columns next to the passenger names.
print(data[["Sex", "Embarked", "Name"]][:5])

  Sex Embarked                                               Name
0   0        0                            Braund, Mr. Owen Harris
1   1        1  Cumings, Mrs. John Bradley (Florence Briggs Th...
2   1        0                             Heikkinen, Miss. Laina
3   1        0       Futrelle, Mrs. Jacques Heath (Lily May Peel)
4   0        0                           Allen, Mr. William Henry


# Linear Regression and Cross Validation

In [38]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold

# Columns used as model inputs.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

alg = LinearRegression()
# Split the row positions into three train/test folds.
# NOTE(review): no shuffle is requested, so random_state presumably has no
# effect and the test folds come back in row order -- confirm against the
# installed scikit-learn. sklearn.cross_validation was deprecated in 0.18 and
# removed in 0.20; newer installs need sklearn.model_selection instead.
kf = KFold(data.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors used to train the algorithm. Only take the rows in the train fold.
    train_predictors = (data[predictors].iloc[train, :])
    # The target used to train the algorithm.
    train_target = data["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # Now make the prediction on the test fold.
    test_predictions = alg.predict(data[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate arrays within predictions[].
# We concatenate them along axis 0 as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map the continuous regression outputs onto the two classes.
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0

# Accuracy is the fraction of rows whose predicted class equals the true label.
# Compare element-wise against the raw label array and average the matches.
# Summing the *selected prediction values* would be wrong: every correct "0"
# prediction contributes nothing, so only correctly predicted survivors count.
# This relies on the concatenated predictions lining up with the rows of
# `data`, which holds when the test folds are produced in row order.
accuracy = np.mean(predictions == data["Survived"].values)
print(accuracy)

0.785634118967




# Logistic Regression

In [None]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with the library's default settings.
alg = LogisticRegression()

# Score the classifier on three cross-validation folds; one score per fold.
scores = cross_validation.cross_val_score(
    estimator=alg,
    X=data[predictors],
    y=data["Survived"],
    cv=3,
)

# Report the average of the per-fold scores.
print(scores.mean())