In [1]:
import numpy as np
import pandas as pd

# Load Dataset

In [2]:
# Training split, with rows keyed by PassengerId.
train = pd.read_csv("../data/train.csv").set_index("PassengerId")

# Sanity check: dimensions plus the first row.
print(train.shape)
train.head(1)

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [3]:
# Test split, with rows keyed by PassengerId (this file has no Survived column).
test = pd.read_csv("../data/test.csv").set_index("PassengerId")

# Sanity check: dimensions plus the first row.
print(test.shape)
test.head(1)

(418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


# Preprocessing

In [4]:
# Stack the train and test rows into a single frame so that each preprocessing
# step below is applied to both splits at once. The test rows have no Survived
# value, so that column is NaN for them after the merge.
combi = pd.concat([train, test], axis=0)

print(combi.shape)
combi.head(1)

(1309, 11)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171


In [5]:
# Binary-encode Sex: 1 where the value is "male", 0 for anything else.
combi["Sex_encode"] = combi["Sex"].eq("male").astype(int)

print(combi.shape)
combi.loc[:, ["Sex", "Sex_encode"]].head(2)

(1309, 12)


Unnamed: 0_level_0,Sex,Sex_encode
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,1
2,female,0


In [6]:
# One-hot encode Embarked into boolean indicator columns
# (Embarked_C / Embarked_Q / Embarked_S).
#
# The builtin `bool` is used instead of `np.bool`: that name was only ever an
# alias for the builtin, was deprecated in NumPy 1.20 and removed in 1.24, so
# `np.bool` raises AttributeError on current NumPy.
#
# get_dummies skips NaN by default (dummy_na=False), so a row with a missing
# Embarked value ends up False in every indicator column.
embarked = pd.get_dummies(combi["Embarked"], prefix="Embarked").astype(bool)

# Append the indicator columns next to the existing ones.
# NOTE: re-running this cell appends the same columns a second time.
combi = pd.concat([combi, embarked], axis=1)

print(combi.shape)
combi[["Embarked", "Embarked_C", "Embarked_Q", "Embarked_S"]].head()

(1309, 15)


Unnamed: 0_level_0,Embarked,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,S,False,False,True
2,C,True,False,False
3,S,False,False,True
4,S,False,False,True
5,S,False,False,True


In [7]:
# Family is the per-passenger sum of the SibSp and Parch columns.
combi["Family"] = combi["SibSp"].add(combi["Parch"])

print(combi.shape)
combi.head(2)

(1309, 16)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Sex_encode,Embarked_C,Embarked_Q,Embarked_S,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,1,False,False,True,1
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,0,True,False,False,1


In [8]:
# Compute the value used to fill in missing fares.
# At this point `train` is still the frame loaded from train.csv, so the mean
# is taken over training rows only and no test-set fares influence it.

mean_fare = train["Fare"].mean()

print("mean fare = ${mean_fare:.3f}".format(mean_fare=mean_fare))

mean fare = $32.204


In [9]:
# Fare_fillout is a copy of Fare in which every missing entry is replaced by
# mean_fare; the original Fare column is left untouched.
combi["Fare_fillout"] = combi["Fare"].fillna(mean_fare)

# Rows whose original Fare was missing (not used again in the visible cells).
missing_fare = combi[combi["Fare"].isnull()]

In [10]:
# Recover the training rows from the merged frame: they are the rows that
# carry a Survived label.
train = combi[combi["Survived"].notnull()]

train.head(1)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Sex_encode,Embarked_C,Embarked_Q,Embarked_S,Family,Fare_fillout
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,1,False,False,True,1,7.25


In [11]:
# Recover the test rows from the merged frame: they are the rows with no
# Survived label, so that all-NaN column is dropped afterwards.
#
# The non-mutating drop() is chained onto the selection so `test` is a fresh
# DataFrame. Calling drop(inplace=True) on a boolean-mask slice of `combi`
# makes pandas emit SettingWithCopyWarning ("A value is trying to be set on a
# copy of a slice from a DataFrame").
test = combi[pd.isnull(combi["Survived"])].drop("Survived", axis=1)

test.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Sex_encode,Embarked_C,Embarked_Q,Embarked_S,Family,Fare_fillout
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,34.5,,Q,7.8292,"Kelly, Mr. James",0,3,male,0,330911,1,False,True,False,0,7.8292


# Score

In [12]:
# Plan for this section: fit a decision tree on the training rows and estimate
# its score with cross-validation; the test rows are predicted in the next
# section.

# Columns fed to the model (all produced by the preprocessing cells above,
# except Pclass, which comes straight from the CSV).
feature_names = [
    "Pclass",
    "Sex_encode",
    "Fare_fillout",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S",
    "Family",
]

# Feature matrix for the training rows.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head()

(891, 7)


Unnamed: 0_level_0,Pclass,Sex_encode,Fare_fillout,Embarked_C,Embarked_Q,Embarked_S,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,7.25,False,False,True,1
2,1,0,71.2833,True,False,False,1
3,3,0,7.925,False,False,True,0
4,1,0,53.1,False,False,True,1
5,3,1,8.05,False,False,True,0


In [13]:
label_name = "Survived"

# Target vector for the training rows. The values are floats (0.0 / 1.0)
# because the column went through the merged frame, where test rows hold NaN.
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head()

(891,)


PassengerId
1    0.0
2    1.0
3    1.0
4    1.0
5    0.0
Name: Survived, dtype: float64

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 5; random_state is pinned so that repeated
# runs build the same tree.
model = DecisionTreeClassifier(
    random_state=37,
    max_depth=5,
)
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=37, splitter='best')

In [15]:
# cross_val_score is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on any current install.
from sklearn.model_selection import cross_val_score

# Average the per-fold scores of a 100-fold cross-validation on the training
# rows. With 891 training rows each validation fold holds only about 9
# passengers, so individual fold scores are coarse; only their mean is reported.
score = cross_val_score(model, X_train, y_train, cv=100).mean()
print("Score = {score:.5f}".format(score=score))



Score = 0.80753


# Predict

In [16]:
# Feature matrix for the test rows, using the same columns (and column order)
# as X_train.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head(1)

(418, 7)


Unnamed: 0_level_0,Pclass,Sex_encode,Fare_fillout,Embarked_C,Embarked_Q,Embarked_S,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,7.8292,False,True,False,0


In [17]:
# Fit the tree on the full training set, then label every test row.
# Missing fares need no handling here: the Fare_fillout feature was already
# imputed for both splits during preprocessing.
prediction = model.fit(X_train, y_train).predict(X_test)

print(prediction.shape)
prediction[:20]

(418,)


array([ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  1.])

# Submit

In [18]:
# Use the sample submission file as a template (indexed by PassengerId) and
# overwrite its Survived column with the model's predictions cast to int32.
# NOTE(review): the predictions are assigned by position, not by PassengerId;
# this presumes the template lists passengers in the same order as test.csv.
# Confirm.
submission = (
    pd.read_csv("../data/gender_submission.csv", index_col="PassengerId")
    .assign(Survived=prediction.astype(np.int32))
)

print(submission.shape)
submission.head()

(418, 1)


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [19]:
# Write the submission frame (PassengerId index plus Survived column) to disk.
# NOTE(review): the target path has no ".csv" extension; confirm that is
# intended before relying on the file name downstream.
submission.to_csv("../data/baseline-script-rachel-submission")