# A Random Forest Model

In [9]:
# third party
import numpy
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Loading the data

In [28]:
# Read the cleaned training set and show summary statistics for each column.
training_path = "training_cleaned.csv"
titanic_data = pandas.read_csv(training_path)
titanic_data.describe()

         Survived      Pclass       SibSp       Parch        Fare      Gender  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.383838    2.308642    0.523008    0.381594   32.204208    0.647587   
std      0.486592    0.836071    1.102743    0.806057   49.693429    0.477990   
min      0.000000    1.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    2.000000    0.000000    0.000000    7.910400    0.000000   
50%      0.000000    3.000000    0.000000    0.000000   14.454200    1.000000   
75%      1.000000    3.000000    1.000000    0.000000   31.000000    1.000000   
max      1.000000    3.000000    8.000000    6.000000  512.329200    1.000000   

          AgeFill  EmbarkedCode  FamilySize   AgeXClass  
count  891.000000    891.000000  891.000000  891.000000  
mean    29.112424      0.361392    0.904602   62.614860  
std     13.304424      0.635673    1.613459   31.362024  
min      0.420000      0.000000    0.0

In [39]:
# Split the frame into the label vector and the feature matrix.
# The features are obtained by dropping the label *by name*, so the split does
# not depend on "Survived" happening to be the first column of the CSV
# (a missing label column now fails loudly with a KeyError).
target = titanic_data["Survived"].values
training_data = titanic_data.drop("Survived", axis=1).values
print(target.shape)
print(training_data.shape)

(891,)
(891, 9)


In [30]:
# Read the cleaned test set into a DataFrame.
testing_frame = pandas.read_csv(filepath_or_buffer="testing_cleaned.csv")

In [31]:
# Preview the first five rows of the test features.
testing_frame.head(n=5)

   Pclass  SibSp  Parch     Fare  Gender  AgeFill  EmbarkedCode  FamilySize  \
0     3.0    0.0    0.0   7.8292     1.0     34.5           2.0         0.0   
1     3.0    1.0    0.0   7.0000     0.0     47.0           0.0         1.0   
2     2.0    0.0    0.0   9.6875     1.0     62.0           2.0         0.0   
3     3.0    0.0    0.0   8.6625     1.0     27.0           0.0         0.0   
4     3.0    1.0    1.0  12.2875     0.0     22.0           0.0         2.0   

   AgeXClass  
0      103.5  
1      141.0  
2      124.0  
3       81.0  
4       66.0  

In [41]:
# The model is trained and applied on bare arrays, so column names are lost.
# Check that the test features match the training features (same names, same
# order) before converting; a mismatch would otherwise give silently wrong
# predictions.
expected_columns = list(titanic_data.columns.drop("Survived"))
assert list(testing_frame.columns) == expected_columns, (
    "test features do not match training features: {0}".format(
        list(testing_frame.columns)))
testing_data = testing_frame.values
testing_data.shape

(418, 9)

# Building the Model

In [17]:
# Fix the seed so the forest -- and therefore the cross-validation scores and
# the submission file -- can be reproduced on a re-run. The value is arbitrary.
RANDOM_SEED = 0
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

To see how well this model does, I'll use `cross_val_score` with its default settings.

In [32]:
# Cross-validate the model on the training data; the fold count and the
# scoring metric are left at the library defaults.
scores = cross_val_score(estimator=model, X=training_data, y=target)

In [37]:
# Summarise the fold scores as mean +/- two standard deviations.
mean = scores.mean()
interval = 2 * scores.std()
lower_bound, upper_bound = mean - interval, mean + interval
print("95% Accuracy Confidence Interval: ({0:.2f} - {1:.2f})".format(lower_bound, upper_bound))

95% Accuracy Confidence Interval: (0.76 - 0.85)


As noted in the data-cleaning notebook, guessing that females lived and males died would have given us an accuracy of about 74%, so there's a slight improvement.

In [42]:
# Train on the full training set, then predict labels for the test rows.
# fit() returns the estimator itself, so the two steps can be chained.
output = model.fit(training_data, target).predict(testing_data)

# Preparing the output

It turns out that Kaggle wants the PassengerId column, which is not in the cleaned test data, so I'll read it back from the original test file.

In [43]:
# Re-read the raw test file; its PassengerId column is needed for the submission.
raw_test_path = 'test.csv'
unclean = pandas.read_csv(raw_test_path)

In [44]:
# Pair each prediction with the PassengerId from the raw test file (row order
# is assumed to match the cleaned test set -- TODO confirm).
submission_columns = {"PassengerId": unclean.PassengerId, "Survived": output}
# if you upload floats the tester will mark them all wrong
output_data = pandas.DataFrame(submission_columns).astype(int)

In [45]:
# Write the submission file without the DataFrame index column.
submission_path = "forest_model.csv"
output_data.to_csv(submission_path, index=False)

In [51]:
!head forest_model.csv

PassengerId,Survived
892,0
893,0
894,0
895,1
896,0
897,0
898,0
899,0
900,1


In [50]:
# Preview the first five rows of the submission frame.
output_data.head(n=5)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         0

Uploading this to Kaggle showed an accuracy of 0.7464 on their test set, which is just below the lower end of the cross-validation range estimated above.