In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:

# `plt` is the conventional alias for the pyplot interface; the top-level `matplotlib`
# package does not expose the plotting functions (plt.plot, plt.subplots, ...).
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
path='../input/'

In [None]:
train=pd.read_csv(path+'train.csv')
test=pd.read_csv(path+'test.csv')

In [None]:
# Keep the test-set passenger ids; they are needed again for the submission file.
passengerId = test.PassengerId

# Stack train and test so the same cleaning / feature engineering is applied to both.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
# ignore_index=True renumbers the rows 0..n-1, so the first len(train) rows are the
# training rows. The test data has no `Survived` column, so it is NaN for those rows.
titanic = pd.concat([train, test], ignore_index=True)

In [None]:
# Row boundary between the training rows and the test rows in the merged frame.
# Both expressions evaluate to the same number (len(train)): rows [0, train_id) came
# from train and rows [test_id, len(titanic)) came from test.
train_id=len(train)
test_id=len(titanic)-len(test)

In [None]:
train_id

In [None]:
test_id

In [None]:
len(titanic)

In [None]:
len(test)

In [None]:
titanic.head()

In [None]:
titanic.info()

It looks like we have a few NaNs in the dataset across a few features. We will use the data to try and fill in the gaps. The info() method reveals that the Age, Cabin, Embarked, and Fare all have a few entries missing. Technically the Survived column also has entries missing, but this is actually correct since we merged the train and test together for future feature engineering and the test data doesn't have a Survived column.

Additionally, from looking at the features, it looks like we can just drop PassengerId from the dataset all together since it isn't really a helpful feature, but rather simply a row identifier.

In [None]:
# PassengerId is only a row identifier, so it is removed from the feature frame.
# `columns=` replaces the positional axis argument (rejected by pandas >= 2.0), and
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
titanic = titanic.drop(columns=['PassengerId'], errors='ignore')

In [None]:
titanic.head()

Now we create a Title feature which extracts the honorific from the Name feature. Simply put, an honorific is the title or rank of a given person such as “Mrs” or “Miss”. The following code takes a value like “Braund, Mr. Owen Harris” from the Name column and extracts “Mr”.

In [None]:
def _honorific(full_name):
    """Return the text between the first comma and the next period of a name, stripped."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


titanic['Title'] = titanic['Name'].apply(_honorific)

In [None]:
titanic.head()

In [None]:
## number of distinct titles, followed by the distinct title values themselves
print("There are {} unique titles.".format(titanic.Title.nunique()))
print("\n", titanic.Title.unique())

In [None]:
titanic.head()

In [None]:
# Raw honorifics, grouped under the normalized title each one collapses into.
_titles_by_group = {
    "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
    "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
    "Mrs":     ("Mme", "Ms", "Mrs"),
    "Miss":    ("Mlle", "Miss"),
    "Mr":      ("Mr",),
    "Master":  ("Master",),
}

# Flat lookup table: raw honorific -> normalized title.
normalized_titles = {
    raw_title: group
    for group, raw_titles in _titles_by_group.items()
    for raw_title in raw_titles
}


def convert(val):
    """Return the normalized title for a raw honorific; raises KeyError if it is unknown."""
    return normalized_titles[val]

In [None]:
titanic.head()

In [None]:
type(titanic.Title.values[0])

In [None]:
# view value counts for the titles as they currently stand (this cell sits before the
# cell that maps the titles through normalized_titles)
print(titanic.Title.value_counts())


In [None]:
# Collapse the raw honorifics into the normalized groups. A value with no entry in the
# mapping keeps its current value instead of becoming NaN; this also makes the cell
# safe to re-run, because the already-normalized "Officer" and "Royalty" values are
# not keys of normalized_titles and would otherwise be wiped out on a second run.
titanic['Title'] = titanic['Title'].map(normalized_titles).fillna(titanic['Title'])


In [None]:
titanic.head()

In [None]:
# view value counts for the normalized titles
print(titanic.Title.value_counts())

For our next step, we are going to assume that there is a relationship between a person's age and their title, since it makes sense that someone who is younger is more likely to be titled "Miss" than "Mrs".

With this in mind, we will group the data by Sex, Pclass, and Title and then view the median age for the grouped classes.

In [None]:
# Bucket the passengers by sex, passenger class and title, then show each bucket's median age.
age_group_keys = ['Sex', 'Pclass', 'Title']
grouped = titanic.groupby(age_group_keys)
grouped['Age'].median()

As expected, those passengers with a title of "Miss" tend to be younger than those titled "Mrs". Also, it looks like we have some age variability amongst the different passenger classes as well as between the sexes, so this should help us more accurately estimate the missing ages for the observations that do not have an age recorded.

In [None]:
## Fill each missing age with the median age of the passenger's (Sex, Pclass, Title) group.
## `transform` returns a Series indexed exactly like `titanic`, so the assignment aligns
## row by row. `groupby(...).apply` on pandas >= 2.0 prepends the group keys to the
## result's index, which no longer lines up with the frame's index on assignment.
titanic['Age'] = grouped['Age'].transform(lambda ages: ages.fillna(ages.median()))

titanic.info()

In [None]:
titanic.head(10)

In [None]:
titanic.Cabin=titanic.Cabin.fillna('NA')     ## NA-not available

In [None]:
titanic.head()

In [None]:
titanic.Embarked.value_counts()

In [None]:
# value_counts() is ordered by descending frequency, so its first label is the most common port.
embarked_counts = titanic['Embarked'].value_counts()
most_embarked = embarked_counts.index[0]

In [None]:
most_embarked

In [None]:
titanic.Embarked=titanic.Embarked.fillna(most_embarked)

In [None]:
titanic.head()

In [None]:
titanic.info()

In [None]:
## Fare is the last feature with gaps; fill them with the median fare of the merged frame.
median_fare = titanic['Fare'].median()
titanic['Fare'] = titanic['Fare'].fillna(median_fare)

titanic.info()

In [None]:
##percentage of death vs percentage of survival
titanic.Survived.value_counts()

In [None]:
titanic.Survived.value_counts(normalize=True)

In [None]:
## lets dig deeper and determine the survival rates based on the gender
groupbysex=titanic.groupby(['Sex'])
groupbysex.Survived.value_counts(normalize=True)

In [None]:
##survival rates based on their sex
groupbysex.Survived.mean()

For those who have seen the fateful story of the Titanic, we know that women and children were given priority over men. Even so, it is astounding that only 19% of the men survived compared to 75% of the women.

In [None]:
## group by passenge Pclass and sex
group_class_sex=titanic.groupby(['Pclass','Sex'])
group_class_sex.Survived.mean()


It appears that 1st class females had an incredible 97% survival rate while 1st class males only still had a 37% chance of survival. Even though you only had a 37% chance of surviving as a 1st class male, you still were almost 3 times more likely to survive than a 3rd class male who had the lowest survival rate amongst sex and class at 13.5%.

The social status gives us a pretty good idea about the survival chance.

In [None]:
##get stats on all other metrics
titanic.describe()

# Creating new features from the data

The first feature we will look at building is FamilySize. This is important to look at because we want to see if having a large or small family affected someone's chances of survival.

The relevant features that will help with this are Parch (number of parents/children aboard) and SibSp (number of siblings/spouses aboard). We combine the Parch and SibSp features and add 1 as well as we want to count the passenger for each observation.

In [None]:
## family size = the passenger (1) + siblings/spouses aboard + parents/children aboard
titanic['FamilySize'] = 1 + titanic['SibSp'] + titanic['Parch']

We can also generate information from the Cabin feature, as cabins near the lifeboats would have a higher chance of survival than those located elsewhere. So we extract the first letter of the cabin and use it as a feature.

In [None]:
## keep only the first character of each cabin string
titanic['Cabin'] = titanic['Cabin'].map(lambda cabin: cabin[0])

## share of passengers per first-letter value
titanic['Cabin'].value_counts(normalize=True)

In [None]:
titanic.head()

In [None]:
def handle_non_numeric_data(df):
    """Replace every non-numeric column of ``df`` with integer category codes.

    A column is left untouched when its dtype is exactly ``np.int64`` or
    ``np.float64``. For every other column the distinct values are put in a
    stable order (by type name, then by string form) and each value is replaced
    by its position in that order, starting at 0.

    The previous version numbered the values in ``set`` iteration order. For
    strings that order depends on Python's per-process hash randomization, so
    the codes could differ from one kernel run to the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its non-numeric columns encoded.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            continue

        column_contents = df[column].values.tolist()
        # Sort the distinct values so the value -> code mapping is reproducible.
        # The (type name, str) key keeps columns with mixed value types sortable.
        ordered_values = sorted(
            set(column_contents),
            key=lambda value: (type(value).__name__, str(value)),
        )
        text_digit_vals = {value: code for code, value in enumerate(ordered_values)}

        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

In [None]:
# Encode every column that is not int64/float64 as integer codes.
# NOTE(review): none of the visible cells drop the free-text Name and Ticket columns,
# so they appear to be encoded here and passed to the models as features — confirm
# that this is intended.
titanic=handle_non_numeric_data(titanic)

In [None]:
titanic.head()

In [None]:
# Split the merged frame back into its training and test parts by row position.
train = titanic.iloc[:train_id]
test = titanic.iloc[test_id:]

In [None]:
## Convert the Survived column back to int.
## The previous statement assigned to a misspelled attribute (`train.Suvived`), which
## only set an attribute on the DataFrame object and left the `Survived` column
## unchanged. `assign` returns a new frame, so no slice of `titanic` is written to.
train = train.assign(Survived=train['Survived'].astype(int))

In [None]:
train.head()

# Modelling

In [None]:
# Feature matrix: every column except the label. Target vector: the label column.
X = train.drop(columns=['Survived']).to_numpy()
y = train['Survived'].to_numpy()

In [None]:
test.head()

In [None]:
# Drop the label column so X_test has the same feature columns as X.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
X_test = test.drop(columns=['Survived']).values

# Logistic Regression

The first model we will try is a Logistic Regression model, which is a binary classifier algorithm. We will be using GridSearchCV to fit our model by specifying a few parameters and returning the best possible combination of those parameters.

In [None]:
# Hyper-parameter grid searched for the logistic regression
parameters = {
    'C': np.logspace(-5, 10, num=15),   # 15 log-spaced values from 1e-5 to 1e10
    'penalty': ['l1', 'l2'],
}

In [None]:
## Instantiate the logistic regression. The grid includes the 'l1' penalty, which the
## default 'lbfgs' solver of current scikit-learn does not support; 'liblinear'
## handles both 'l1' and 'l2'.
clf = LogisticRegression(solver='liblinear')

# Grid search over `parameters` with 6-fold cross-validation (cv=6) on all cores (n_jobs=-1).
# No `scoring` argument is passed, so candidates are ranked by the estimator's default
# score (mean accuracy for classifiers), not by F1.
grid_search = GridSearchCV(estimator=clf, param_grid=parameters, cv=6, n_jobs=-1)


In [None]:
grid_search.fit(X,y)

In [None]:
clf1=grid_search.best_estimator_        # get the best estimator(classifier)
print(clf1)

In [None]:
# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(grid_search.best_params_)) 
print("Best score is {}".format(grid_search.best_score_))

In [None]:
## prediction on test set
pred=grid_search.predict(X_test)
print(pred)

# Random Forest Model

The best score using logistic regression was ~82%, which wasn't bad. But let's see how we fare with a Random Forest Classifier algorithm instead.

In [None]:
# Random forest hyper-parameter grid: each entry lists the candidate values to try
forrest_params = {
    'max_depth': list(range(7, 14)),
    'min_samples_split': list(range(4, 12)),
    'min_samples_leaf': list(range(2, 6)),
    'n_estimators': list(range(10, 60, 10)),
}

In [None]:
# Fix the seed so the forest (and therefore the grid-search result and the submission)
# is the same on every Restart & Run All.
forest = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive 5-fold grid search over forrest_params. The grid has
# 7 * 8 * 4 * 5 = 1120 combinations, i.e. 5600 cross-validation fits, so run them on
# all cores (n_jobs=-1), as the logistic-regression search already does.
forest_cv = GridSearchCV(estimator=forest, param_grid=forrest_params, cv=5, n_jobs=-1)
forest_cv.fit(X, y)

In [None]:
# Report the best cross-validated score and the winning hyper-parameter combination.
# `best_params_` holds just the selected grid values, which is what the
# "Optimal params" label describes; `best_estimator_` is the whole fitted model object.
print("Best score: {}".format(forest_cv.best_score_))
print("Optimal params: {}".format(forest_cv.best_params_))

In [None]:
# random forrest prediction on test set
forrest_pred = forest_cv.predict(X_test)

Random forest classifier has a better accuracy than the logistic regression as deduced above.

# For submission on kaggle

In [None]:
# The predictions inherit the dtype of `y`, which is float unless the Survived column
# was converted to int beforehand; cast explicitly so the file holds 0/1 rather than
# 0.0/1.0. NOTE(review): presumably the competition expects integer labels — confirm
# against its sample submission.
sub = pd.DataFrame({'PassengerId': passengerId, 'Survived': forrest_pred.astype(int)})

In [None]:
sub.head()

In [None]:
sub.to_csv('prediction.csv',index=False)   
## index=False leaves the DataFrame's row index out of the written CSV file