# Predicting Survival in the Titanic Data Set

We will be using a decision tree to make predictions about the Titanic data set from Kaggle. This data set provides information on the Titanic passengers and can be used to predict whether a passenger survived or not.



In [1]:
import numpy as np
import pandas as pd

# Reading the titanic dataset from csv file

In [2]:
titanic_data=pd.read_csv('train.csv') #the titanic dataset which is downloaded from kaggle as train.csv file
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


# preprocessing the data

In [4]:
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

From the above information we can see that 'Age' and 'cabin' columns are having null values.since cabin column as almost null values we do not use this column.then we try to remove the null values in 'Age' column by replacing all zeros with median value.

In [5]:
age_median=titanic_data['Age'].median()

In [6]:
titanic_data.Age.fillna(age_median,inplace=True)

In [7]:
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
cat_cols = titanic_data.select_dtypes(include='object').columns.values
cat_cols

array(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype=object)

In [9]:
# Encoding the Sex columns values into 0 and 1 and creating a new column with those values
titanic_data['Sex_new'] = titanic_data['Sex'].replace({'female':0, 'male': 1})

In [10]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_new
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


some of the least significant features are eliminated by droping the columns

In [11]:
titanic_data.drop(columns=['Sex','PassengerId','Name','Ticket','Cabin','Embarked'],axis=1,inplace=True)

In [12]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_new
0,0,3,22.0,1,0,7.25,1
1,1,1,38.0,1,0,71.2833,0
2,1,3,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,0,3,35.0,0,0,8.05,1


In [13]:
X = titanic_data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_new']]
y=titanic_data['Survived']

# splitting the data into training and test data

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the model to training dataset

In [15]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

# Predicting the Y using predict method on test data

In [16]:
y_pred=classifier.predict(X_test)

# Accuracy checking of model using confusion matrix and acuracy score

In [19]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_pred,y_test)
cm

array([[137,  25],
       [ 31,  75]], dtype=int64)

In [21]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.7910447761194029