## File load and data overview

In [1]:
import pandas as pd

# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine where both CSV files exist at these locations.
train_file = r'C:\Users\tomek\Documents\titanic_train.csv'
test_file = r'C:\Users\tomek\Documents\titanic_test.csv'

# Read both splits into DataFrames.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))


In [2]:
# (rows, columns) of the training frame.
train_data.shape

(891, 12)

In [3]:
# Column labels available in the training frame.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Preview the first 20 rows of the training frame.
train_data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Summary statistics for the numeric columns; a count below the row count
# indicates missing values in that column.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Since Age could be an important feature and is not 100% complete (714 of 891 values present), missing (NaN) ages are replaced with the rounded mean age for each sex

#### Mean age for each sex:

In [6]:
# Male passengers with a recorded Age, and their mean age rounded to a whole number.
male_age = train_data[train_data.Sex == 'male'].dropna(subset=['Age'])
male_age_round = male_age.Age.mean().round()

male_age_round

31.0

In [7]:
# Female passengers with a recorded Age, and their mean age rounded to a whole number.
female_age = train_data[train_data.Sex == 'female'].dropna(subset=['Age'])
female_age_round = female_age.Age.mean().round()

female_age_round

28.0

#### Replacing null Age values

In [8]:
# Fill missing male ages with the rounded male mean computed earlier
# (male_age_round) instead of a hard-coded literal, so the fill value
# always matches the data that was actually loaded.
train_data.loc[(train_data.Sex == 'male') & (train_data.Age.isna()), 'Age'] = male_age_round

In [9]:
# Fill missing female ages with the rounded female mean computed earlier
# (female_age_round) instead of a hard-coded literal, so the fill value
# always matches the data that was actually loaded.
train_data.loc[(train_data.Sex == 'female') & (train_data.Age.isna()), 'Age'] = female_age_round

In [10]:
# Re-check the summary statistics after imputation: the Age count should now
# match the count of the other columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.779091,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.017438,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,30.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Decision tree model

In [11]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [12]:
# Target column: Survived takes the values 0 and 1, so this is a
# classification problem rather than a regression one.
train_y = train_data.Survived

# Candidate feature sets; the model below is trained on the larger one.
features_set1 = ['Pclass', 'Age', 'Fare']
features_set2 = ['Pclass', 'Age', 'Parch', 'Fare', 'SibSp']

train_X = train_data[features_set2]

# A classifier predicts the 0/1 class directly, instead of regressing a score
# that then has to be rounded into a label. random_state fixes the estimator's
# internal randomness so repeated runs build the same tree.
titanic_model = DecisionTreeClassifier(random_state = 1)

titanic_model.fit(train_X, train_y)

In [15]:
# Build the test feature matrix on a copy so test_data itself is left untouched.
test_X = test_data[features_set2].copy()

# Apply the same sex-wise Age imputation that was applied to the training data,
# using the means derived from the training split. The training Age column had
# gaps, so the test split may too, and some scikit-learn versions reject NaN
# inputs at predict time.
# NOTE(review): assumes test_data has a 'Sex' column like train_data - confirm.
age_missing = test_X.Age.isna()
test_X.loc[age_missing & (test_data.Sex == 'male'), 'Age'] = male_age_round
test_X.loc[age_missing & (test_data.Sex == 'female'), 'Age'] = female_age_round

# Fill any remaining gaps in any feature with the corresponding training
# median; this is a no-op when the test features are already complete.
test_X = test_X.fillna(train_X.median())

In [16]:
# Predict on the test features, then round each prediction to the nearest
# whole number so it can be used as a 0/1 label.
raw_test_pred = titanic_model.predict(test_X)
y_test_pred = raw_test_pred.round()

In [None]:
# Pair each test PassengerId with its integer prediction and write the
# two-column result to CSV without the index column.
output = test_data[['PassengerId']].assign(Survived=y_test_pred.astype(int))
output.to_csv('titanic_model.csv', index = False)