# Predicting Survival on Titanic Passenger Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [70]:
dfTitanicTrain = pd.read_csv('C:/Users/c708682/Downloads/train.csv')
dfTitanicTest = pd.read_csv('C:/Users/c708682/Downloads/test.csv')
dfTitanicFull = pd.concat([dfTitanicTrain, dfTitanicTest])

In [71]:
dfTitanicFull.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


## Missing values analysis

In [72]:
dfTitanicFull['Pclass'].isnull().sum()

0

In [73]:
dfTitanicFull['Sex'].isnull().sum()

0

In [74]:
dfTitanicFull['Age'].isnull().sum()

263

In [75]:
dfTitanicFull['SibSp'].isnull().sum()

0

In [76]:
dfTitanicFull['Parch'].isnull().sum()

0

In [77]:
dfTitanicFull['Fare'].isnull().sum()

1

In [78]:
dfTitanicFull['Cabin'].isnull().sum()

1014

In [79]:
dfTitanicFull['Embarked'].isnull().sum()

2

The numeric variable 'Age' has many missing values, which must be replaced. I replace it with the median of the corresponding passenger class group

In [80]:
med = dfTitanicFull.groupby('Pclass')['Age'].transform('median')

In [81]:
dfTitanicFull['Age'] = dfTitanicFull['Age'].fillna(med)

In [82]:
# Test again
dfTitanicFull['Age'].isnull().sum()

0

In [83]:
# transform sex to integer

dfTitanicFull['Sex'].replace('female' , 1, inplace=True)
dfTitanicFull['Sex'].replace('male' , 0, inplace=True)

In [84]:
# transform Embarked to integer

dfTitanicFull['Embarked'].replace('S' , 0, inplace=True)
dfTitanicFull['Embarked'].replace('C' , 1, inplace=True)
dfTitanicFull['Embarked'].replace('Q' , 2, inplace=True)

In [85]:
# replace missing Embarked with 0 because it's the most frequent value
dfTitanicFull['Embarked'] = dfTitanicFull['Embarked'].fillna(0)

In [86]:
# The numeric variable 'Fare' has two missing values, which must be replaced. I replace it with the median
# of the corresponding Embarked group because fare is most likely dependent on the embarkement place
mean = dfTitanicFull.groupby('Embarked')['Fare'].transform('mean')

dfTitanicFull['Fare'] = dfTitanicFull['Fare'].fillna(mean)

In [98]:
# split the data in train and test again

dfTitanicTrain = dfTitanicFull[0:890]
dfTitanicTest = dfTitanicFull[891:1309]

## Linear Regression Model


In [99]:
# Linear Regression model with all variables

from sklearn.linear_model import LinearRegression

# Remove non int vars
X_Train = dfTitanicTrain.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin' ], axis = 1)
X_Test = dfTitanicTest.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin' ], axis = 1)
Y_Train = dfTitanicTrain.Survived

Test again if there are all missing values replaced

In [100]:
X_Test.isnull().any()

Age         False
Embarked    False
Fare        False
Parch       False
Pclass      False
Sex         False
SibSp       False
dtype: bool

In [101]:
X_Train.isnull().any()

Age         False
Embarked    False
Fare        False
Parch       False
Pclass      False
Sex         False
SibSp       False
dtype: bool

linear regression with all variables:

In [102]:
lm = LinearRegression()
lm.fit(X_Train, Y_Train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [103]:
print 'Estimated Intercept Coefficient:', lm.intercept_
print 'Number of coefficients:', len(lm.coef_)
print 'R Square:', lm.score(X_Train, Y_Train)

pd.DataFrame(zip(X_Train.columns, lm.coef_), columns = ['features', 'estimatedCoefficients'])

Estimated Intercept Coefficient: 0.824752141216
Number of coefficients: 7
R Square: 0.3988549194


Unnamed: 0,features,estimatedCoefficients
0,Age,-0.006239
1,Embarked,0.035877
2,Fare,0.000303
3,Parch,-0.015302
4,Pclass,-0.186385
5,Sex,0.502778
6,SibSp,-0.043555


predicting survival with the test data:

In [104]:
# predictors with train data
lm.predict(X_Train)
# using these predictors on the test data  
prediction_Test = lm.predict(X_Test)


building the solution dataframe with two variables, passengerID and survived out of the test data as outcome:

In [105]:
solution = pd.DataFrame({'PassengerId' : dfTitanicTest.PassengerId, 
                         'Survived' : prediction_Test})

In [106]:
solution['Survived'] =  np.where(solution['Survived'] < 0.5, 0,1)


In [107]:
solution.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [108]:
solution.to_csv('C:/Users/c708682/Downloads/submission1.csv', sep=',', header=True, index=False)