In [8]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score

In [9]:
# Read the raw train and test CSV files; these frames are kept as the
# untouched originals that later cells copy from.
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_orig = pd.read_csv(train_path)
test_orig = pd.read_csv(test_path)
# Preview the first five rows of the test frame.
test_orig.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Unnamed: 12
0,1284,3,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,0,
1,1237,3,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S,1,
2,949,3,"Abelseth, Mr. Olaus Jorgensen",male,25.0,0,0,348122,7.65,F G63,S,1,
3,1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20.0,0,0,SOTON/O2 3101284,7.925,,S,1,
4,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,1,


In [10]:
# The loaded test frame ends with an empty column that pandas named
# 'Unnamed: 12'. Drop it by name rather than by position: a positional
# `iloc[:, :-1]` is not idempotent and would strip a real column
# ('Survived') if this cell were executed a second time.
# errors='ignore' makes a re-run a harmless no-op once the column is gone.
test_orig = test_orig.drop(columns=['Unnamed: 12'], errors='ignore')
test_orig.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1284,3,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,0
1,1237,3,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S,1
2,949,3,"Abelseth, Mr. Olaus Jorgensen",male,25.0,0,0,348122,7.65,F G63,S,1


In [11]:
# Work on copies so the raw frames stay untouched and this cell can be
# re-run from a clean starting point.
train = train_orig.copy()
test = test_orig.copy()

dataset_list = [train, test]

# The age imputation below draws random integers. Use a dedicated, seeded
# generator so the engineered features (and the accuracy reported later)
# are identical on every run of this cell.
AGE_IMPUTE_SEED = 42
rng = np.random.RandomState(AGE_IMPUTE_SEED)

# Missing fares in BOTH frames are filled with the training-set median;
# it is loop-invariant, so compute it once up front.
fare_median = train['Fare'].median()

# Right-inclusive age band edges: <=12 -> 0, 13-24 -> 1, 25-36 -> 2,
# 37-48 -> 3, >48 -> 4.
age_bin_edges = [-np.inf, 12, 24, 36, 48, np.inf]

for dataset in dataset_list:
    # Fix cabin feature: 1 when a cabin value is present, 0 when missing.
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype(int)
    # Family = siblings + parents + you
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # New feature traveling alone: 1 when the passenger is the whole family.
    dataset['TravelingAlone'] = (dataset['FamilySize'] == 1).astype(int)
    # Fill missing embarkation ports with 'S', then encode as integers.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    # Fill the null fares with the median fare
    dataset['Fare'] = dataset['Fare'].fillna(fare_median)
    # Generate random ages around this frame's mean (mixing genders here),
    # spread over one standard deviation on either side.
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isna()
    age_null_count = int(age_missing.sum())
    # randint needs integer bounds; truncate explicitly (high is exclusive).
    new_age_values = rng.randint(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)
    dataset.loc[age_missing, 'Age'] = new_age_values
    dataset['Age'] = dataset['Age'].astype(int)
    # If not known, fill with male, then encode as integers.
    dataset['Sex'] = dataset['Sex'].fillna('male')
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Replace the age in years with its band index (0-4). A single pd.cut
    # avoids a chain of in-place overwrites whose correctness depends on
    # the order in which the bands are assigned.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False)

train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin,FamilySize,TravelingAlone
0,1,0,3,"Braund, Mr. Owen Harris",1,1,1,0,A/5 21171,7.2500,,0,0,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,3,1,0,PC 17599,71.2833,C85,1,1,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,2,0,0,STON/O2. 3101282,7.9250,,0,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,2,1,0,113803,53.1000,C123,0,1,2,0
4,5,0,3,"Allen, Mr. William Henry",1,2,0,0,373450,8.0500,,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,2,0,0,211536,13.0000,,0,0,1,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,1,0,0,112053,30.0000,B42,0,1,1,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,2,1,2,W./C. 6607,23.4500,,0,0,4,0
889,890,1,1,"Behr, Mr. Karl Howell",1,2,0,0,111369,30.0000,C148,1,1,1,1


In [12]:
# Columns removed from both frames before modelling.
fields_to_drop = ['Name', 'PassengerId', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(columns=fields_to_drop)
test = test.drop(columns=fields_to_drop)


In [102]:
# Per-column flag: True if the test frame still has any missing value there.
test.isna().any()

Pclass            False
Sex               False
Age               False
Parch             False
Fare              False
Embarked          False
Survived          False
Has_Cabin         False
FamilySize        False
TravelingAlone    False
dtype: bool

In [13]:
# Allocate the model building object.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split, so ties between equally good splits could
# otherwise resolve differently from one run to the next.
tree_model = tree.DecisionTreeClassifier(max_depth = 3, random_state = 42)
# Define the predictors and answers.
predictors_train = train.drop(['Survived'], axis=1) # Also called X
answers_train = train["Survived"] # Also called y

# Train the model
model = tree_model.fit(predictors_train, answers_train)

# Evaluate the model or score against the test set.
# Select the test columns in the training column order: the two frames hold
# 'Survived' at different positions, so the order is aligned explicitly
# instead of relying on it matching by coincidence. A missing feature
# column raises a KeyError here rather than producing a silent mismatch.
test_X = test.drop(['Survived'], axis=1)[predictors_train.columns]
test_y = test["Survived"]
# score() returns the mean accuracy as a fraction in [0, 1].
accuracy = model.score(test_X, test_y)

In [14]:
# Report the accuracy as a percentage with two decimals.
accuracy_pct = accuracy * 100
print('Model Accuracy is {0:.2F}%'.format(accuracy_pct))

Model Accuracy is 75.12%
