In [243]:
import pandas as pd
import numpy as np

In [244]:
from operator import itemgetter
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [245]:
raw_train_data = pd.read_csv("train.csv")
raw_test_data = pd.read_csv("test.csv")
raw_data = raw_train_data.append(raw_test_data)
#Too many missing values from these features
raw_data.drop(["Ticket", "Cabin"], axis=1, inplace=True)

In [246]:
#Extract the titles from the names of passengers as a feature
import re
names = raw_data["Name"].apply(lambda x : re.split('[,.]',x))
surnames = names.apply(lambda x : x[0].strip()).values
titles = names.apply(lambda x : x[1].strip()).values
raw_data["Title"] = titles

#Group similar titles together
#Miss and Master are Child
#Rev, Dr, Col, Major, Capt, Don, Jonkheer, Sir are Sir
#Ms, Mlle, Mme, the Countess, Lady, Dona are Lady

raw_data.loc[raw_data["Title"].apply(lambda x : x in ["Capt", "Don", "Sir", "Major", "Rev", "Dr", "Col", "Jonkheer"]), "Title"] = "Sir"
raw_data.loc[raw_data["Title"].apply(lambda x : x in ["Lady", "the Countess", "Dona", "Mlle", "Mme", "Ms"]), "Title"] = "Lady"
raw_data.loc[raw_data["Title"].apply(lambda x : x in ["Master", "Miss"]), "Title"] = "Child"

In [247]:
#Impute an unknown age by taking the median age of passengers with the same title
raw_data.loc[(raw_data["Age"].isnull() == True) & (raw_data["Title"] == "Lady"), "Age"] = raw_data.loc[raw_data["Title"] == "Lady", "Age"].median() 
raw_data.loc[(raw_data["Age"].isnull() == True) & (raw_data["Title"] == "Mr"), "Age"] = raw_data.loc[raw_data["Title"] == "Mr", "Age"].median()
raw_data.loc[(raw_data["Age"].isnull() == True) & (raw_data["Title"] == "Child"), "Age"] = raw_data.loc[raw_data["Title"] == "Child", "Age"].median()
raw_data.loc[(raw_data["Age"].isnull() == True) & (raw_data["Title"] == "Mrs"), "Age"] = raw_data.loc[raw_data["Title"] == "Mrs", "Age"].median()
raw_data.loc[(raw_data["Age"].isnull() == True) & (raw_data["Title"] == "Sir"), "Age"] = raw_data.loc[raw_data["Title"] == "Sir", "Age"].median()

In [248]:
#Impute an unknown embarkment point by the taking the most common embarkment point for passengers with the same class
raw_data.loc[(raw_data["Embarked"].isnull() == True), "Embarked"] = raw_data.loc[raw_data["Pclass"] == 1,  "Embarked"].value_counts().index[1]

In [249]:
#Impute an unknown fare by the taking the median fare for passengers with the same class
raw_data.loc[(raw_data["Fare"].isnull() == True), "Fare"] = raw_data.loc[raw_data["Pclass"] == 3,  "Fare"].value_counts().index[1]

In [250]:
#Create a family size feature using Sibilings and Parents column
raw_data["FamilySize"] = raw_data["SibSp"] + raw_data["Parch"] + 1

In [251]:
#Create a wealth feature using the passenger's fare price and age
raw_data["Wealth"] = raw_data["Fare"] * raw_data["Age"]

In [252]:
#Create a family feature which groups families together (passengers with the same surname and same family size)
raw_data["Family"] = surnames + raw_data["FamilySize"].apply(str)
raw_data.loc[raw_data["FamilySize"] < 3, "Family"] = "Small"

In [253]:
raw_data["Family"].value_counts().index.values

array(['Small', 'Sage11', 'Andersson7', 'Goodwin8', 'Asplund7', 'Panula6',
       'Rice6', 'Skoog6', 'Fortune6', 'Lefebre5', 'Davies3', 'Palsson5',
       'Ford5', 'Ryerson5', 'Baclini4', 'Carter4', 'Becker4', 'Brown3',
       'Dean4', 'Allison4', 'Herman4', 'Johnston4', 'West4', 'Laroche4',
       'Boulos3', 'Coutts3', 'Mallet3', 'Elias3', 'Peter3', 'McCoy3',
       'Hart3', 'Thayer3', 'Abbott3', 'Peacock3', 'Goldsmith3', 'Samaan3',
       'Sandstrom3', 'Quick3', 'Nakid3', 'Compton3', 'Taussig3',
       'Moubarek3', 'Bourke3', 'Van Impe3', 'Navratil3', 'Klasen3',
       'Dodge3', 'Crosby3', 'Danbom3', 'Widener3', 'Wells3', 'Wick3',
       'Spedden3', 'Touma3', 'van Billiard3', 'Caldwell3', 'Johnson3',
       'Collyer3', 'Drew3', 'Hickman3', 'Rosblom3', 'Jefferys3',
       'Lahtinen3', 'Frolicher-Stehli3', 'Hocking4', 'Richards3',
       'Christy3', 'Beckwith3', 'Gustafsson3', 'Vander Planke3', 'Kink3',
       'Hays3', 'Kink-Heilmann3', 'Hamalainen3', 'Hirvonen3', 'Frolicher3',
       

In [254]:
raw_data.loc[raw_data["Family"] == "Lefebre5", "Survived"].value_counts(dropna=False)

 0     4
NaN    1
Name: Survived, dtype: int64

In [255]:
raw_data.loc[raw_data["Family"] == "Lefebre5"]

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title,FamilySize,Wealth,Family
176,18.0,S,25.4667,"Lefebre, Master. Henry Forbes",1,177,3,male,3,0.0,Child,5,458.4006,Lefebre5
229,18.0,S,25.4667,"Lefebre, Miss. Mathilde",1,230,3,female,3,0.0,Child,5,458.4006,Lefebre5
409,18.0,S,25.4667,"Lefebre, Miss. Ida",1,410,3,female,3,0.0,Child,5,458.4006,Lefebre5
485,18.0,S,25.4667,"Lefebre, Miss. Jeannie",1,486,3,female,3,0.0,Child,5,458.4006,Lefebre5
132,35.5,S,25.4667,"Lefebre, Mrs. Frank (Frances)",4,1024,3,female,0,,Mrs,5,904.06785,Lefebre5


In [256]:
#Create a feature indicating whether the majority of the family survived, died, or unknown
#FamilyStatus_survived
#FamilyStatus_perished
#FamilyStatus_unknown
for family in raw_data["Family"].value_counts().index.values:
    if family != "Small":
        if raw_data.loc[raw_data["Family"] == family, "Survived"].value_counts(dropna=False).index[0] == 1:
            raw_data.loc[raw_data["Family"] == family, "FamilyStatus"] = "survived"
        elif raw_data.loc[raw_data["Family"] == family, "Survived"].value_counts(dropna=False).index[0] == 0: 
            raw_data.loc[raw_data["Family"] == family, "FamilyStatus"] = "perished"
        else:
            raw_data.loc[raw_data["Family"] == family, "FamilyStatus"] = "unknown"
    else:
        raw_data.loc[raw_data["Family"] == family, "FamilyStatus"] = "unknown"

In [257]:
#One hot encoting for categorical variables 
raw_data_onehot = pd.get_dummies(raw_data,columns=["Sex", "Pclass", "Title", "Embarked", "FamilyStatus"])

print raw_data_onehot.isnull().sum()

raw_data_onehot.head()

Age                        0
Fare                       0
Name                       0
Parch                      0
PassengerId                0
SibSp                      0
Survived                 418
FamilySize                 0
Wealth                     0
Family                     0
Sex_female                 0
Sex_male                   0
Pclass_1                   0
Pclass_2                   0
Pclass_3                   0
Title_Child                0
Title_Lady                 0
Title_Mr                   0
Title_Mrs                  0
Title_Sir                  0
Embarked_C                 0
Embarked_Q                 0
Embarked_S                 0
FamilyStatus_perished      0
FamilyStatus_survived      0
FamilyStatus_unknown       0
dtype: int64


Unnamed: 0,Age,Fare,Name,Parch,PassengerId,SibSp,Survived,FamilySize,Wealth,Family,...,Title_Lady,Title_Mr,Title_Mrs,Title_Sir,Embarked_C,Embarked_Q,Embarked_S,FamilyStatus_perished,FamilyStatus_survived,FamilyStatus_unknown
0,22,7.25,"Braund, Mr. Owen Harris",0,1,1,0,2,159.5,Small,...,0,1,0,0,0,0,1,0,0,1
1,38,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,2,2708.7654,Small,...,0,0,1,0,1,0,0,0,0,1
2,26,7.925,"Heikkinen, Miss. Laina",0,3,0,1,1,206.05,Small,...,0,0,0,0,0,0,1,0,0,1
3,35,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,2,1858.5,Small,...,0,0,1,0,0,0,1,0,0,1
4,35,8.05,"Allen, Mr. William Henry",0,5,0,0,1,281.75,Small,...,0,1,0,0,0,0,1,0,0,1


In [258]:
#Split back the data set into train and test sets
train_data = raw_data_onehot[raw_data_onehot["PassengerId"] < 892]



In [259]:
test_data = raw_data_onehot[raw_data_onehot["PassengerId"] >= 892]

In [260]:
#Fix class imbalance on training set
survivors = train_data[train_data["Survived"] == 1]
non_survivors = train_data[train_data["Survived"] == 0]

print "Before Balance"
print "--------------"
print "% Survived: ", len(survivors) / float(len(train_data["Survived"]))
print "% Not Survived: ", len(non_survivors) / float(len(train_data["Survived"]))
print "--------------"

ratio = len(survivors) / float(len(non_survivors))
non_survivors = non_survivors.sample(frac=ratio)
train_data_bal = survivors.append(non_survivors)

survivors = train_data_bal[train_data_bal["Survived"] == 1]
non_survivors = train_data_bal[train_data_bal["Survived"] == 0]

print "After Balance"
print "--------------"
print "% Survived: ", len(survivors) / float(len(train_data_bal["Survived"]))
print "% Not Survived: ", len(non_survivors) / float(len(train_data_bal["Survived"]))
print "--------------"


Before Balance
--------------
% Survived:  0.383838383838
% Not Survived:  0.616161616162
--------------
After Balance
--------------
% Survived:  0.5
% Not Survived:  0.5
--------------


In [261]:
train_data.head()

Unnamed: 0,Age,Fare,Name,Parch,PassengerId,SibSp,Survived,FamilySize,Wealth,Family,...,Title_Lady,Title_Mr,Title_Mrs,Title_Sir,Embarked_C,Embarked_Q,Embarked_S,FamilyStatus_perished,FamilyStatus_survived,FamilyStatus_unknown
0,22,7.25,"Braund, Mr. Owen Harris",0,1,1,0,2,159.5,Small,...,0,1,0,0,0,0,1,0,0,1
1,38,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,2,2708.7654,Small,...,0,0,1,0,1,0,0,0,0,1
2,26,7.925,"Heikkinen, Miss. Laina",0,3,0,1,1,206.05,Small,...,0,0,0,0,0,0,1,0,0,1
3,35,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,2,1858.5,Small,...,0,0,1,0,0,0,1,0,0,1
4,35,8.05,"Allen, Mr. William Henry",0,5,0,0,1,281.75,Small,...,0,1,0,0,0,0,1,0,0,1


In [262]:
train_data.columns.values

array(['Age', 'Fare', 'Name', 'Parch', 'PassengerId', 'SibSp', 'Survived',
       'FamilySize', 'Wealth', 'Family', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Title_Child', 'Title_Lady',
       'Title_Mr', 'Title_Mrs', 'Title_Sir', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'FamilyStatus_perished', 'FamilyStatus_survived',
       'FamilyStatus_unknown'], dtype=object)

In [263]:
features = ['Age', 
            'Fare', 
            'FamilySize', 
            'Sex_female', 
            'Sex_male', 
            'Pclass_1', 
            'Pclass_2',
            'Pclass_3', 
            'Title_Child', 
            'Title_Lady', 
            'Title_Mr', 
            'Title_Mrs', 
            'Title_Sir', 
            'Wealth',
            'FamilyStatus_perished', 
            'FamilyStatus_survived',
            'FamilyStatus_unknown']

In [264]:
from sklearn.grid_search import GridSearchCV
from sklearn import tree

param_grid = {"max_depth": [3, 6, 9, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10, 15],
              "min_samples_leaf": [1, 3, 10],
              "criterion": ["gini", "entropy"]}
#Train decision tree
from sklearn import tree
dtree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(dtree, param_grid=param_grid)
grid_search.fit(train_data[features], train_data["Survived"])
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.848 (std: 0.017)
Parameters: {'max_features': 10, 'min_samples_split': 1, 'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.845 (std: 0.012)
Parameters: {'max_features': 10, 'min_samples_split': 10, 'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1}

Model with rank: 3
Mean validation score: 0.845 (std: 0.010)
Parameters: {'max_features': 10, 'min_samples_split': 10, 'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 3}



In [265]:
from sklearn import metrics
from sklearn import cross_validation
tree_best = tree.DecisionTreeClassifier(max_features = 10, min_samples_split = 1, criterion ='gini', max_depth = 6, min_samples_leaf = 1)
tree_best_scores = cross_validation.cross_val_score(tree_best, train_data[features], train_data["Survived"], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (tree_best_scores.mean(), tree_best_scores.std() * 2))

Accuracy: 0.84 (+/- 0.06)


In [266]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree

dtree = tree.DecisionTreeClassifier()

param_grid = {"base_estimator__max_depth": [1, 3, 5, None],
              "base_estimator__max_features": [1, 3, 10],
              "base_estimator__min_samples_split": [1, 3, 10, 15],
              "base_estimator__min_samples_leaf": [1, 3, 10],
              "base_estimator__criterion": ["gini", "entropy"]}
boost_tree = AdaBoostClassifier(dtree, n_estimators=20)
grid_search = GridSearchCV(boost_tree, param_grid=param_grid)
grid_search.fit(train_data[features], train_data["Survived"])
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.847 (std: 0.014)
Parameters: {'base_estimator__min_samples_split': 15, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_features': 1}

Model with rank: 2
Mean validation score: 0.845 (std: 0.007)
Parameters: {'base_estimator__min_samples_split': 15, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 10, 'base_estimator__max_features': 1}

Model with rank: 3
Mean validation score: 0.844 (std: 0.008)
Parameters: {'base_estimator__min_samples_split': 1, 'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 1, 'base_estimator__min_samples_leaf': 3, 'base_estimator__max_features': 1}



In [268]:
from sklearn.ensemble import AdaBoostClassifier

dtree = tree.DecisionTreeClassifier(max_features = 1, 
                                    min_samples_split = 15, 
                                    criterion ='gini', 
                                    max_depth = 3, 
                                    min_samples_leaf = 1)
boosted_tree_best = AdaBoostClassifier(dtree, n_estimators=100)
boosted_tree_best_scores = cross_validation.cross_val_score(boosted_tree_best, train_data[features], target, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (boosted_tree_best_scores.mean(), boosted_tree_best_scores.std() * 2))

Accuracy: 0.83 (+/- 0.08)


In [269]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
from sklearn.grid_search import GridSearchCV
param_grid = {"max_depth": [1, 3, 6, None],
              "max_features": [1, 3, 6, 10],
              "min_samples_split": [1, 3, 6, 10, 15],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

grid_search = GridSearchCV(rf, param_grid=param_grid)
grid_search.fit(train_data[features], train_data["Survived"])
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.864 (std: 0.018)
Parameters: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 15, 'criterion': 'gini', 'max_features': 1, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.863 (std: 0.017)
Parameters: {'bootstrap': True, 'min_samples_leaf': 3, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.863 (std: 0.017)
Parameters: {'bootstrap': True, 'min_samples_leaf': 3, 'min_samples_split': 10, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}



In [273]:
from sklearn import metrics
from sklearn import cross_validation
rf_best = RandomForestClassifier(bootstrap = True,
                                 min_samples_leaf= 3, 
                                 min_samples_split=3, 
                                 criterion='entropy', 
                                 max_features= 3, 
                                 max_depth=None)
rf_best_scores = cross_validation.cross_val_score(rf_best, train_data[features], train_data["Survived"], cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (rf_best_scores.mean(), rf_best_scores.std() * 2))

Accuracy: 0.85 (+/- 0.08)


In [274]:
rf_best.fit(train_data[features],train_data["Survived"])
test_predictions = rf_best.predict(test_data[features])

In [275]:
import csv as csv
predictions_file = open("rf_new.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(test_data["PassengerId"].values, test_predictions.astype(int)))
predictions_file.close()