In [78]:
#Predicting Survival in the Titanic Data Set

#Import the required libraries
import pandas as pd
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import KFold

In [80]:
# Load the Titanic training data straight from the public GitHub CSV
url = "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)

# Encode the textual 'Sex' column as integer codes; pd.factorize numbers the
# categories in order of first appearance. The list of unique labels it also
# returns is not needed here.
sex_codes, _ = pd.factorize(titanic['Sex'])
titanic['Sex_Label'] = sex_codes

# Preview the first rows, including the new encoded column
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Label
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [81]:
# Print the columns in the dataset and the number of missing values per column
print("The columns in titanic dataset are: ", titanic.columns)
print("The columns with missing values are:", titanic.isnull().sum())

# The relevant columns to be used in prediction: Pclass, Sex, Age,
# SibSp (siblings aboard), Parch (parents/children aboard) and Fare.
# Of those, only Age has missing values, so fill them with the column mean.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `titanic['Age']`: an in-place call on a column
# selection is chained assignment, which emits a FutureWarning on pandas 2.x
# and leaves `titanic` unchanged once Copy-on-Write is enabled.
#
# NOTE(review): the mean is computed over every row, including rows that are
# later held out for testing; consider imputing from the training split only.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

print("After updating missing values:")
# Last expression of the cell: displays the refreshed per-column null counts
titanic.isnull().sum()

The columns in titanic dataset are:  Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_Label'],
      dtype='object')
The columns with missing values are: PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Sex_Label        0
dtype: int64
After updating missing values:


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Sex_Label        0
dtype: int64

In [82]:
# Keep only the predictor columns as the feature matrix X and use the
# 'Survived' column as the target vector y
feature_columns = ['Pclass', 'Sex_Label', 'Age', 'SibSp', 'Parch', 'Fare']
X = titanic[feature_columns]
y = titanic['Survived']

In [83]:
# Randomly hold out 30% of the rows for testing and keep 70% for training.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [84]:
# Configure a decision tree that splits on entropy, is capped at depth 3 and
# uses a fixed seed, then fit it on the training split.
tree_params = {
    'criterion': 'entropy',
    'max_depth': 3,
    'random_state': 0,
}
dtree = tree.DecisionTreeClassifier(**tree_params)
# fit() returns the estimator, so as the last expression it displays its repr
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [85]:
# Predict the survival label for every row of the held-out test features
y_pred = dtree.predict(X_test)

# Show the raw array of predicted labels
predictions_label = "The predicted values are: "
print(predictions_label, y_pred)

The predicted values are:  [0 0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0
 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 1
 1 0 0 1 1 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1
 1 0 1 0 0 1 0 0 0 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 1 0]


In [86]:
# Compare the predictions with the true test labels:
# first the number of mismatches, then the overall accuracy.
accuracy = metrics.accuracy_score(y_test, y_pred)
count_misclassified = y_test.ne(y_pred).sum()

print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 48
Accuracy: 0.82


In [87]:
# Perform 10-fold cross validation of the decision tree over the full data set.
#
# The splitter and `clone` are imported locally: the notebook-level `KFold` is
# bound to the legacy sklearn.cross_validation class (removed in scikit-learn
# 0.20), whose constructor (n=, n_folds=) and iteration protocol differ from
# the sklearn.model_selection API used below.
from sklearn import model_selection
from sklearn.base import clone

# Folds are contiguous and unshuffled, so the split is deterministic without a
# seed. The previous random_state=12 had no effect because shuffling was never
# enabled, and newer scikit-learn releases reject that combination outright.
cv = model_selection.KFold(n_splits=10)  # Desired number of cv folds

fold_accuracy = []
for train_fold, valid_fold in cv.split(X):
    # KFold yields positional row indices, so select with .iloc; .loc would
    # only be correct while the index happens to be the default 0..n-1 range.
    train = X.iloc[train_fold]      # Training features for this fold
    valid = X.iloc[valid_fold]      # Validation features for this fold
    train_y = y.iloc[train_fold]
    valid_y = y.iloc[valid_fold]
    # Fit an unfitted copy with the same hyperparameters so the `dtree` that
    # was trained on the 70% split is not silently overwritten by the last fold.
    model = clone(dtree).fit(X=train, y=train_y)
    valid_acc = model.score(X=valid, y=valid_y)
    fold_accuracy.append(valid_acc)
print("Accuracy per fold: ", fold_accuracy, "\n")
print("Average accuracy: ", sum(fold_accuracy)/len(fold_accuracy))

Accuracy per fold:  [0.82222222222222219, 0.8539325842696629, 0.7752808988764045, 0.8539325842696629, 0.7865168539325843, 0.8089887640449438, 0.7752808988764045, 0.7752808988764045, 0.88764044943820219, 0.8314606741573034] 

Average accuracy:  0.817053682896
