In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
try:
    # Current location of the plotting helpers (introduced in pandas 0.20).
    from pandas.plotting import scatter_matrix
except ImportError:
    # Fallback for older pandas releases that only ship the legacy module path.
    from pandas.tools.plotting import scatter_matrix
import re

# Seed NumPy's global RNG so runs are repeatable.
np.random.seed(42)

In [5]:
def getNameTitle(name):
    """Extract the honorific title from a passenger name.

    The pattern looks for ``<word>, <title>.`` and returns the run of word
    characters and spaces found between the ``", "`` and the following
    period, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str or None
        The matched title, or ``None`` when ``name`` is not a string (for
        example a missing value) or does not contain the expected pattern.
    """
    # Missing values arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(name, str):
        return None
    # Raw string so that \w and \. reach the regex engine unmodified.
    m = re.search(r'\w+, ([\w ]+)\.', name)
    return m.group(1) if m else None

In [21]:
# Read the training split; PassengerId becomes the row index.
train = pd.read_csv(
    "train.csv",
    index_col="PassengerId",
)

In [22]:
# Read the test split; PassengerId becomes the row index.
test = pd.read_csv(
    "test.csv",
    index_col="PassengerId",
)

In [23]:
# Stack both splits into one frame; the outer index level ("train"/"test")
# records which split each row came from.
data = pd.concat(
    [train, test],
    keys=["train", "test"],
)

In [24]:
# Derive a Title column by running getNameTitle over every passenger name.
data["Title"] = data["Name"].apply(getNameTitle)

In [25]:
# Preview the first two rows of the combined frame, including the new Title column.
data.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
train,1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,Mr
train,2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,Mrs


In [26]:
# Columns removed from the frame before one-hot encoding.
dropped_features = [
    "Cabin",
    "Embarked",
    "Name",
    "Ticket",
]

In [27]:
# Remove the unused columns, then one-hot encode what is left.
# `axis` is passed by keyword: recent pandas releases reject it positionally.
data_encoded = data.drop(dropped_features, axis=1)
data_encoded = pd.get_dummies(data_encoded)

# Fill missing Age and Fare values with the column median.
# The filled column is assigned back instead of using
# `data_encoded[col].fillna(..., inplace=True)`, which operates on an
# intermediate Series and is not guaranteed to update the frame.
# NOTE(review): the medians are computed over train and test rows together,
# because `data_encoded` holds both splits at this point.
for column in ("Age", "Fare"):
    median = data_encoded[column].median()
    data_encoded[column] = data_encoded[column].fillna(median)


In [28]:
# Check dtypes and non-null counts after encoding and imputation.
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1309 entries, (train, 1) to (test, 1309)
Data columns (total 26 columns):
Age                   1309 non-null float64
Fare                  1309 non-null float64
Parch                 1309 non-null int64
Pclass                1309 non-null int64
SibSp                 1309 non-null int64
Survived              891 non-null float64
Sex_female            1309 non-null uint8
Sex_male              1309 non-null uint8
Title_Capt            1309 non-null uint8
Title_Col             1309 non-null uint8
Title_Don             1309 non-null uint8
Title_Dona            1309 non-null uint8
Title_Dr              1309 non-null uint8
Title_Jonkheer        1309 non-null uint8
Title_Lady            1309 non-null uint8
Title_Major           1309 non-null uint8
Title_Master          1309 non-null uint8
Title_Miss            1309 non-null uint8
Title_Mlle            1309 non-null uint8
Title_Mme             1309 non-null uint8
Title_Mr              1309 non-

In [36]:
# Split the combined frame back apart by its outer index key. Each part is
# copied so later edits do not touch `data_encoded`.
train_encoded, test_encoded = (
    data_encoded.loc[split].copy() for split in ("train", "test")
)

In [37]:
# Target vector. "Survived" became float when the splits were concatenated
# (test rows have no value), so cast the training labels back to integer classes.
labels = train_encoded["Survived"].astype(int)

# Remove the target from both feature frames. `axis` is passed by keyword
# because recent pandas releases reject it as a positional argument.
train_encoded = train_encoded.drop(["Survived"], axis=1)
test_encoded = test_encoded.drop(["Survived"], axis=1)

In [38]:
# Confirm the test features no longer contain Survived and have no nulls.
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 25 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
Parch                 418 non-null int64
Pclass                418 non-null int64
SibSp                 418 non-null int64
Sex_female            418 non-null uint8
Sex_male              418 non-null uint8
Title_Capt            418 non-null uint8
Title_Col             418 non-null uint8
Title_Don             418 non-null uint8
Title_Dona            418 non-null uint8
Title_Dr              418 non-null uint8
Title_Jonkheer        418 non-null uint8
Title_Lady            418 non-null uint8
Title_Major           418 non-null uint8
Title_Master          418 non-null uint8
Title_Miss            418 non-null uint8
Title_Mlle            418 non-null uint8
Title_Mme             418 non-null uint8
Title_Mr              418 non-null uint8
Title_Mrs             418 non-null uint8
Title_Ms              418 n

In [39]:
# Confirm the training features no longer contain Survived and have no nulls.
train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 25 columns):
Age                   891 non-null float64
Fare                  891 non-null float64
Parch                 891 non-null int64
Pclass                891 non-null int64
SibSp                 891 non-null int64
Sex_female            891 non-null uint8
Sex_male              891 non-null uint8
Title_Capt            891 non-null uint8
Title_Col             891 non-null uint8
Title_Don             891 non-null uint8
Title_Dona            891 non-null uint8
Title_Dr              891 non-null uint8
Title_Jonkheer        891 non-null uint8
Title_Lady            891 non-null uint8
Title_Major           891 non-null uint8
Title_Master          891 non-null uint8
Title_Miss            891 non-null uint8
Title_Mlle            891 non-null uint8
Title_Mme             891 non-null uint8
Title_Mr              891 non-null uint8
Title_Mrs             891 non-null uint8
Title_Ms              891 non-

In [40]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFECV

def display_scores(scores):
    """Print a score array, then its mean and its standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. the array
    returned by ``cross_val_score``). Nothing is returned.
    """
    # Each statistic is computed right before its own line is printed.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, stat in report:
        print(label, stat(scores))

def _score_with_rfecv(estimator, header, X, y):
    """Run RFECV feature selection for ``estimator``, then cross-validate it.

    ``header`` is a ``%d`` format string; it is printed with the number of
    features RFECV kept, followed by the 5-fold F1 scores.
    """
    rfecv = RFECV(estimator=estimator, cv=5, scoring='f1')
    X_selected = rfecv.fit_transform(X, y)
    scores = cross_val_score(estimator, X_selected, y, scoring="f1", cv=5)
    print(header % rfecv.n_features_)
    display_scores(scores)


def test_models(X, y):
    """Compare several classifiers on ``(X, y)`` using 5-fold F1 scores.

    SGD, random forest, two logistic regressions and XGBoost are each scored
    on the feature subset chosen by RFECV; the MLP is scored on all features.
    Results are printed; nothing is returned.

    NOTE(review): RFECV is fitted on the same ``X``/``y`` that is then
    cross-validated, so the selected features have already seen every fold
    and the reported scores are likely optimistic.
    """
    rfecv_candidates = [
        ("\nSGDClassifier - features:%d", SGDClassifier(random_state=42)),
        ("\nRandomForestClassifier:%d", RandomForestClassifier(random_state=42)),
        ("\nLogisticRegression:%d", LogisticRegression(random_state=42)),
        ("\nsoftmax_reg-LogisticRegression:%d",
         LogisticRegression(solver="lbfgs", C=5, random_state=42)),
        ("\nXGBClassifier:%d", xgb.XGBClassifier(seed = 42)),
    ]
    for header, estimator in rfecv_candidates:
        _score_with_rfecv(estimator, header, X, y)

    # The MLP is evaluated without RFECV -- presumably because RFECV needs
    # coef_ or feature_importances_ from the estimator; verify.
    neu_clf = MLPClassifier(random_state=42)
    scores = cross_val_score(neu_clf, X, y, scoring="f1", cv=5)
    print("\nMLPClassifier")
    display_scores(scores)
    
# Shared feature scaler; it is fitted on the training features in a later cell.
scaler = StandardScaler()



In [41]:
# Fit the scaler on the training features and standardise them, then
# compare all candidate models on the scaled data.
train_prepared = scaler.fit(train_encoded).transform(train_encoded)
test_models(X=train_prepared, y=labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



SGDClassifier - features:19
Scores: [ 0.72368421  0.71755725  0.72857143  0.68148148  0.77697842]
Mean: 0.725654557951
Standard deviation: 0.0305396659711

RandomForestClassifier:17
Scores: [ 0.77272727  0.71641791  0.81481481  0.656       0.82517483]
Mean: 0.757026964633
Standard deviation: 0.0633807684777

LogisticRegression:6
Scores: [ 0.80882353  0.76119403  0.74418605  0.66666667  0.80597015]
Mean: 0.757368084339
Standard deviation: 0.0518169367866

softmax_reg-LogisticRegression:6
Scores: [ 0.80882353  0.76119403  0.74418605  0.66666667  0.80597015]
Mean: 0.757368084339
Standard deviation: 0.0518169367866

XGBClassifier:10
Scores: [ 0.78195489  0.75968992  0.8         0.75409836  0.78832117]
Mean: 0.776812867648
Standard deviation: 0.0173532044613





MLPClassifier
Scores: [ 0.69767442  0.72180451  0.74603175  0.6779661   0.82089552]
Mean: 0.73287446
Standard deviation: 0.0495930158195


# XGBClassifier has the best results.
It has the highest mean F1 score and the lowest standard deviation across folds. Time to run GridSearchCV to tune its hyperparameters.

In [42]:
# Refit the feature selector and the final XGBoost model on the full training set.
xgb_clf = xgb.XGBClassifier(seed = 42)
rfecv = RFECV(estimator=xgb_clf, cv=5, scoring='f1')
rfecv.fit(train_prepared, labels)

# Transform once and reuse the result for both fitting and scoring.
train_selected = rfecv.transform(train_prepared)
xgb_clf.fit(train_selected, labels)

# f1_score takes (y_true, y_pred) in that order.
# This is the score on the data the model was trained on, so it is optimistic.
f1_score(labels, xgb_clf.predict(train_selected))

0.83281733746130038

In [43]:
# Scale the test features with the statistics learned from the training set.
# Calling fit_transform here would refit the scaler on the test data, so the
# model would see inputs scaled differently from those it was trained on.
test_prepared = scaler.transform(test_encoded)

# Store predictions as integer classes (0/1) rather than floats.
test["Survived"] = xgb_clf.predict(rfecv.transform(test_prepared)).astype(int)

# Write a one-column DataFrame so the "PassengerId,Survived" header row is
# always emitted; Series.to_csv omitted the header by default in older pandas.
test[["Survived"]].to_csv("result_2.csv")