nyc16_ids2 Project - examination of the Kaggle Titanic Data
Ron Haynes

The purpose of this analysis is to examine the Titanic data, exercise some of the practices put forth in the class and put forth a prediction of survivability for members of the test sample.  In the end a decision tree and random forest approach were used.

In [86]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Supress this warning msg
pd.options.mode.chained_assignment = None  # default='warn'

In [87]:
# Load train and test files from Kaggle
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
print train.head()
print test.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [88]:
# Examine the data 
print test.shape
print test.describe()
print train.shape
print train.describe()

(418, 11)
       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000         NaN    0.000000    0.000000         NaN
50%    1100.500000    3.000000         NaN    0.000000    0.000000         NaN
75%    1204.750000    3.000000         NaN    1.000000    0.000000         NaN
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200
(891, 12)
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1

In [89]:
#print train.iloc[124]
#Drop Ticket, Cabin as they add little and we want to remove na's in the future.
train = train.drop(['Ticket','Cabin'], axis=1)
#Confirm they are gone
print train.iloc[124]

PassengerId                            125
Survived                                 0
Pclass                                   1
Name           White, Mr. Percival Wayland
Sex                                   male
Age                                     54
SibSp                                    0
Parch                                    1
Fare                               77.2875
Embarked                                 S
Name: 124, dtype: object


In [90]:
#Prep the fare column
train.Fare = train.Fare.map(lambda x: np.nan if x==0 else x)
classmeans = train.pivot_table('Fare', index='Pclass', aggfunc='mean')
train.Fare = train[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )
    
# Overwrite missing values in Fare and Age with the median
meanAge=np.mean(train.Age)
train.Age=train.Age.fillna(meanAge)
meanFare=np.mean(train.Fare)
train.Fare=train.Age.fillna(meanFare)

In [91]:
# Let's check the file and drop na's
print train.shape
train = train.dropna()
train.shape

(891, 10)


(889, 10)

In [92]:
# Check who survived
print(train.Survived.value_counts())

0    549
1    340
Name: Survived, dtype: int64


In [93]:
# By Gender
print(train["Survived"][train["Sex"] == 'male'].value_counts())
print(train["Survived"][train["Sex"] == 'female'].value_counts())

0    468
1    109
Name: Survived, dtype: int64
1    231
0     81
Name: Survived, dtype: int64


In [94]:
# Create a child column
train["Child"] = float('NaN')
#print(train["Child"])
train["Child"][train["Age"] < 18] = 1
train["Child"][train["Age"] >= 18] = 0
print(train["Child"])
print(train["Survived"][train["Child"] == 1].value_counts(normalize = True))

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      1.0
8      0.0
9      1.0
10     1.0
11     0.0
12     0.0
13     0.0
14     1.0
15     0.0
16     1.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     1.0
23     0.0
24     1.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
      ... 
861    0.0
862    0.0
863    0.0
864    0.0
865    0.0
866    0.0
867    0.0
868    0.0
869    1.0
870    0.0
871    0.0
872    0.0
873    0.0
874    0.0
875    1.0
876    0.0
877    0.0
878    0.0
879    0.0
880    0.0
881    0.0
882    0.0
883    0.0
884    0.0
885    0.0
886    0.0
887    0.0
888    0.0
889    0.0
890    0.0
Name: Child, dtype: float64
1    0.539823
0    0.460177
Name: Survived, dtype: float64


In [95]:
# Convert gender to 1/0
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
#print train

In [96]:
# prep the Embarked column
train["Embarked"] = train["Embarked"].fillna("S")
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
print(train["Embarked"])

0      0
1      1
2      0
3      0
4      0
5      2
6      0
7      0
8      0
9      1
10     0
11     0
12     0
13     0
14     0
15     0
16     2
17     0
18     0
19     1
20     0
21     0
22     2
23     0
24     0
25     0
26     1
27     0
28     2
29     0
      ..
861    0
862    0
863    0
864    0
865    0
866    1
867    0
868    0
869    0
870    0
871    0
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    0
881    0
882    0
883    0
884    0
885    2
886    0
887    0
888    0
889    1
890    2
Name: Embarked, dtype: object


In [97]:
# Create the target and features arrays
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

In [98]:
print features_one

[[3L 0 22.0 22.0]
 [1L 1 38.0 38.0]
 [3L 1 26.0 26.0]
 ..., 
 [3L 1 29.69911764705882 29.69911764705882]
 [1L 0 26.0 26.0]
 [3L 0 32.0 32.0]]


In [99]:
# Create a decision tree
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)

# importance and score of the features used
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))

[ 0.17868518  0.45085127  0.19915046  0.17131309]
0.879640044994


In [100]:
#Apply the same to the Test file
test = test.drop(['Ticket','Cabin'], axis=1)
test = test.dropna()
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
print test_features

[[3L 0 34.5 7.8292]
 [3L 1 47.0 7.0]
 [2L 0 62.0 9.6875]
 ..., 
 [3L 1 28.0 7.775]
 [1L 1 39.0 108.9]
 [3L 0 38.5 7.25]]


In [101]:
# predict using the test set
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)

[0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1
 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 0
 1 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0
 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 1 1 0 1
 1 0 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0]


In [102]:
# Stage data with id and survived
PassengerId =np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

      Survived
892          0
893          0
894          0
895          1
896          0
897          0
898          0
899          0
900          0
901          0
903          0
904          1
905          0
906          1
907          1
908          0
909          0
910          0
911          0
912          0
913          0
915          0
916          1
917          0
918          1
919          0
920          0
922          0
923          0
924          0
...        ...
1273         0
1275         1
1277         0
1278         0
1279         0
1280         0
1281         1
1282         0
1283         1
1284         1
1285         0
1286         1
1287         1
1288         0
1289         1
1290         0
1291         1
1292         1
1293         0
1294         1
1295         0
1296         0
1297         0
1298         0
1299         0
1301         0
1303         1
1304         0
1306         1
1307         0

[331 rows x 1 columns]


In [103]:
# add in 3 more columns and put some overfitting steps in there
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)
print(my_tree_two.score(features_two, target))
# Check with previous
print(my_tree_one.score(features_one, target))

0.892013498313
0.879640044994


In [104]:
# Try RF with Pclass, Age, Sex, Fare,SibSp, Parch, and Embarked columns
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(features_forest, target)
print(my_forest.score(features_forest, target))
print test

0.923509561305
     PassengerId  Pclass                                               Name  \
0            892       3                                   Kelly, Mr. James   
1            893       3                   Wilkes, Mrs. James (Ellen Needs)   
2            894       2                          Myles, Mr. Thomas Francis   
3            895       3                                   Wirz, Mr. Albert   
4            896       3       Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
5            897       3                         Svensson, Mr. Johan Cervin   
6            898       3                               Connolly, Miss. Kate   
7            899       2                       Caldwell, Mr. Albert Francis   
8            900       3          Abrahim, Mrs. Joseph (Sophie Halaut Easu)   
9            901       3                            Davies, Mr. John Samuel   
11           903       1                         Jones, Mr. Charles Cresson   
12           904       1      Snyder,

In [105]:
# Show predictions and the length of the prediction vector
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)

[ 0.14812203  0.1772275   0.42762713  0.08934661  0.07807205  0.04448119
  0.03512349]
[ 0.14202514  0.1686441   0.35909531  0.16621386  0.07390739  0.0465136
  0.0436006 ]


In [106]:
# Compare to tree 2 result
print(my_tree_two.score(features_two, target))
print(my_forest.score(features_forest, target))

0.892013498313
0.923509561305
