In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

In [2]:
# deterministic random data
np.random.seed(42)

In [3]:
#Loading the training data
train = pd.read_csv("train.csv", index_col = "PassengerId")

In [4]:
#Loading the test data
test = pd.read_csv("test.csv", index_col = "PassengerId")

In [5]:
data = pd.concat([train, test], keys=["train", "test"])

In [6]:
data.describe()
#Some info are missing (age:714, cabin:204, embarked:889)

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,891.0
mean,29.881138,33.295479,0.385027,2.294882,0.498854,0.383838
std,14.413493,51.758668,0.86556,0.837836,1.041658,0.486592
min,0.17,0.0,0.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,3.0,8.0,1.0


# Fixing Fare

In [7]:
data[data['Fare'].isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
test,1044,60.5,,S,,"Storey, Mr. Thomas",0,3,male,0,,3701


In [8]:
data[(data["Pclass"] == 3) & (data["Embarked"] == "S")]["Fare"].median()

8.05

In [9]:
data.loc[data['Fare'].isnull(), "Fare"] = 8.05

In [10]:
data.loc["test"].loc[1044]

Age                       60.5
Cabin                      NaN
Embarked                     S
Fare                      8.05
Name        Storey, Mr. Thomas
Parch                        0
Pclass                       3
Sex                       male
SibSp                        0
Survived                   NaN
Ticket                    3701
Name: 1044, dtype: object

# Fixing Embarked

In [11]:
data[(data['Embarked'].isnull())]

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
train,62,38.0,B28,,80.0,"Icard, Miss. Amelie",0,1,female,0,1.0,113572
train,830,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,1,female,0,1.0,113572


In [12]:
data[data["Cabin"].str.startswith('B', na=False)][["Cabin", "Embarked"]].sort_values(['Cabin','Embarked'], ascending=[True,True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Cabin,Embarked
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1
test,1058,B10,C
train,738,B101,C
train,816,B102,S
test,1107,B11,S
train,330,B18,C
train,524,B18,C
train,171,B19,S
train,691,B20,S
train,782,B20,S
train,541,B22,S


In [13]:
data.loc[data['Embarked'].isnull(), "Embarked"] = "S"

In [14]:
data.loc["train"].loc[[62,830]]

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
62,38.0,B28,S,80.0,"Icard, Miss. Amelie",0,1,female,0,1.0,113572
830,62.0,B28,S,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,1,female,0,1.0,113572


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1309 entries, (train, 1) to (test, 1309)
Data columns (total 11 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1309 non-null object
Fare        1309 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
dtypes: float64(3), int64(3), object(5)
memory usage: 126.7+ KB


# Fixing Age

In [16]:
data[(data['Age'].isnull())]

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
train,6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0.0,330877
train,18,,,S,13.0000,"Williams, Mr. Charles Eugene",0,2,male,0,1.0,244373
train,20,,,C,7.2250,"Masselmani, Mrs. Fatima",0,3,female,0,1.0,2649
train,27,,,C,7.2250,"Emir, Mr. Farred Chehab",0,3,male,0,0.0,2631
train,29,,,Q,7.8792,"O'Dwyer, Miss. Ellen ""Nellie""",0,3,female,0,1.0,330959
train,30,,,S,7.8958,"Todoroff, Mr. Lalio",0,3,male,0,0.0,349216
train,32,,B78,C,146.5208,"Spencer, Mrs. William Augustus (Marie Eugenie)",0,1,female,1,1.0,PC 17569
train,33,,,Q,7.7500,"Glynn, Miss. Mary Agatha",0,3,female,0,1.0,335677
train,37,,,C,7.2292,"Mamee, Mr. Hanna",0,3,male,0,1.0,2677
train,43,,,C,7.8958,"Kraeff, Mr. Theodor",0,3,male,0,0.0,349253


In [17]:
data[(data['Age'].notnull())].groupby(['Parch', 'Sex'])["Age"].mean()

Parch  Sex   
0      female    31.116379
       male      32.338619
1      female    25.608434
       male      24.272727
2      female    20.645593
       male      16.383684
3      female    39.000000
       male      38.500000
4      female    44.666667
       male      52.000000
5      female    39.250000
       male      39.500000
6      female    43.000000
       male      40.000000
Name: Age, dtype: float64

In [18]:
data[(data['Age'].isnull())].groupby(['Parch', 'Sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Pclass,SibSp,Survived,Ticket
Parch,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,female,0,5,61,61,61,61,61,41,61
0,male,0,16,173,173,173,173,173,116,173
1,female,0,2,5,5,5,5,5,5,5
1,male,0,0,5,5,5,5,5,3,5
2,female,0,0,10,10,10,10,10,7,10
2,male,0,0,6,6,6,6,6,5,6
4,female,0,0,1,1,1,1,1,0,1
9,female,0,0,1,1,1,1,1,0,1
9,male,0,0,1,1,1,1,1,0,1


In [19]:
selection = (data['Age'].isnull()) & (data['Parch'] == 0) & (data['Sex'] == 'female')
data.loc[selection, "Age"] = 31.116379
selection = (data['Age'].isnull()) & (data['Parch'] == 0) & (data['Sex'] == 'male')
data.loc[selection, "Age"] = 32.338619
selection = (data['Age'].isnull()) & (data['Parch'] == 1) & (data['Sex'] == 'female')
data.loc[selection, "Age"] = 25.608434
selection = (data['Age'].isnull()) & (data['Parch'] == 1) & (data['Sex'] == 'male')
data.loc[selection, "Age"] = 24.272727
selection = (data['Age'].isnull()) & (data['Parch'] == 2) & (data['Sex'] == 'female')
data.loc[selection, "Age"] = 20.645593
selection = (data['Age'].isnull()) & (data['Parch'] == 2) & (data['Sex'] == 'male')
data.loc[selection, "Age"] = 16.383684
selection = (data['Age'].isnull()) & (data['Parch'] == 4) & (data['Sex'] == 'female')
data.loc[selection, "Age"] = 44.666667
selection = (data['Age'].isnull()) & (data['Parch'] == 9) & (data['Sex'] == 'female')
data.loc[selection, "Age"] = 43.000000
selection = (data['Age'].isnull()) & (data['Parch'] == 9) & (data['Sex'] == 'male')
data.loc[selection, "Age"] = 40.000000

# DF to Numpy

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1309 entries, (train, 1) to (test, 1309)
Data columns (total 11 columns):
Age         1309 non-null float64
Cabin       295 non-null object
Embarked    1309 non-null object
Fare        1309 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
dtypes: float64(3), int64(3), object(5)
memory usage: 126.7+ KB


In [21]:
dropped_features = ["Name","Cabin","Ticket"]
data_encoded = data.drop(dropped_features, 1)
data_encoded = pd.get_dummies(data_encoded)

In [22]:
train_encoded = data_encoded.loc["train"].copy()
test_encoded = data_encoded.loc["test"].copy()

labels = train_encoded["Survived"]

train_encoded.drop(["Survived"], 1, inplace=True)
test_encoded.drop(["Survived"], 1, inplace=True)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFECV

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

def test_models(X, y):
    clf_1 = RandomForestClassifier(random_state=42)
    rfecv = RFECV(estimator=clf_1, cv=5, scoring='f1')
    scores = cross_val_score(clf_1, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
    print("\nRandomForestClassifier:%d" % rfecv.n_features_)
    display_scores(scores)
    
    clf_2 = ExtraTreesClassifier(random_state=42)
    rfecv = RFECV(estimator=clf_2, cv=5, scoring='f1')
    scores = cross_val_score(clf_2, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
    print("\nExtraTreesClassifier:%d" % rfecv.n_features_)
    display_scores(scores)
    
    clf_3 = GradientBoostingClassifier(random_state=42)
    rfecv = RFECV(estimator=clf_3, cv=5, scoring='f1')
    scores = cross_val_score(clf_3, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
    print("\nGradientBoostingClassifier:%d" % rfecv.n_features_)
    display_scores(scores)
    
    clf_4 = AdaBoostClassifier(random_state=42)
    rfecv = RFECV(estimator=clf_4, cv=5, scoring='f1')
    scores = cross_val_score(clf_4, rfecv.fit_transform(X, y), y, scoring="f1", cv=5)
    print("\nAdaBoostClassifier:%d" % rfecv.n_features_)
    display_scores(scores)
    
    eclf2 = VotingClassifier(estimators=[('random', clf_1), ('extra', clf_2), ('gnb', clf_3), ('ada', clf_4)],voting='soft')
    scores = cross_val_score(eclf2, X, y, scoring="f1", cv=5)
    print("\nVoting")
    display_scores(scores)

In [24]:
test_models(train_encoded, labels)


RandomForestClassifier:6
Scores: [ 0.67692308  0.74074074  0.79069767  0.72580645  0.81481481]
Mean: 0.749796551702
Standard deviation: 0.0487203290598

ExtraTreesClassifier:6
Scores: [ 0.70072993  0.74820144  0.75590551  0.71317829  0.77941176]
Mean: 0.739485387389
Standard deviation: 0.0287532762882

GradientBoostingClassifier:9
Scores: [ 0.70866142  0.7518797   0.784       0.70967742  0.78787879]
Mean: 0.748419464761
Standard deviation: 0.0343995183393

AdaBoostClassifier:10
Scores: [ 0.70344828  0.75177305  0.75555556  0.76190476  0.78787879]
Mean: 0.752112086169
Standard deviation: 0.0274054458458

Voting
Scores: [ 0.75912409  0.74242424  0.81818182  0.72440945  0.82014388]
Mean: 0.772856696382
Standard deviation: 0.0393758795658


In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
def best_slice_model(clf, X, y):
    best_accuracy = -1
    best_clf = clf
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        l_accuracy = accuracy_score(y_test, clf.predict(X_test))
        if l_accuracy > best_accuracy:
            best_accuracy = l_accuracy
            best_clf = clf
            print(best_clf.__class__.__name__, " Best accuracy:",best_accuracy)
    return best_clf

In [26]:
def build_voting_clf(X, y):
    clf_1 = RandomForestClassifier(random_state=42)
    clf_1 = best_slice_model(clf_1, X, y)
    
    clf_2 = ExtraTreesClassifier(random_state=42)
    clf_2 = best_slice_model(clf_2, X, y)
    
    clf_3 = GradientBoostingClassifier(random_state=42)
    clf_3 = best_slice_model(clf_3, X, y)
    
    clf_4 = AdaBoostClassifier(random_state=42)
    clf_4 = best_slice_model(clf_4, X, y)
    
    eclf2 = VotingClassifier(estimators=[('random', clf_1), ('extra', clf_2), ('gnb', clf_3), ('ada', clf_4)],voting='soft')
    eclf2.fit(X, y)#Is this really necessary?
    return eclf2
    

In [27]:
clf = build_voting_clf(train_encoded.as_matrix(), labels.as_matrix())

RandomForestClassifier  Best accuracy: 0.782122905028
RandomForestClassifier  Best accuracy: 0.803370786517
RandomForestClassifier  Best accuracy: 0.837078651685
RandomForestClassifier  Best accuracy: 0.842696629213
ExtraTreesClassifier  Best accuracy: 0.776536312849
ExtraTreesClassifier  Best accuracy: 0.808988764045
GradientBoostingClassifier  Best accuracy: 0.798882681564
GradientBoostingClassifier  Best accuracy: 0.831460674157
GradientBoostingClassifier  Best accuracy: 0.859550561798
AdaBoostClassifier  Best accuracy: 0.765363128492
AdaBoostClassifier  Best accuracy: 0.820224719101
AdaBoostClassifier  Best accuracy: 0.859550561798


# Voting has the best results.

In [28]:
clf = VotingClassifier(estimators=[('random', RandomForestClassifier(random_state=42)), ('extra', ExtraTreesClassifier(random_state=42)), ('gnb', GradientBoostingClassifier(random_state=42)), ('ada', AdaBoostClassifier(random_state=42))],voting='soft')
clf.fit(train_encoded, labels)

VotingClassifier(estimators=[('random', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            mi...thm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [29]:
test["Survived"] = clf.predict(test_encoded)
test['Survived'].to_csv("result_2.csv")

# Setting some hyperparameters
work in progress...

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {'random__max_features': [0.2, 0.4, 0.6], 'random__criterion': ['gini', 'entropy'], 'random__bootstrap': [True, False], 'random__n_estimators': [10,15,20,25],
         'extra__max_features': [0.2, 0.4, 0.6], 'extra__criterion': ['gini', 'entropy'], 'extra__bootstrap': [True, False], 'extra__n_estimators': [10,15,20,25],
         'gnb__max_features': [0.2, 0.4, 0.6], 'gnb__loss': ['deviance', 'exponential'], 'gnb__learning_rate': [0.9, 0.1,0.15,0.2], 'gnb__n_estimators': [100,120,140,150],}

In [None]:
grid_search = GridSearchCV(clf, params, cv=5,scoring='f1')
grid_search.fit(train_encoded, labels)
grid_search.best_estimator_

In [None]:
test["Survived"] = grid_search.best_estimator_.predict(test_encoded)
test['Survived'].to_csv("result_2.csv")