In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load training and test data

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
TRAIN_CSV = '/kaggle/input/titanic/train.csv'
training = pd.read_csv(TRAIN_CSV)
training.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
TEST_CSV = '/kaggle/input/titanic/test.csv'
testing = pd.read_csv(TEST_CSV)
testing.head()

Check % of survivors by gender

In [None]:
# Survived flags of all female passengers in the training set
# (single .loc call instead of chained indexing).
women = training.loc[training.Sex == 'female', 'Survived']
# Fraction (0-1) of women who survived; kept as a fraction so the variable's meaning is unchanged.
rate_women = women.mean()
# Format the fraction as a percentage so the printed number matches its "%" label
# (previously the raw fraction, e.g. 0.74, was printed next to "%").
print(f"{rate_women:.1%} of women survived")

In [None]:
# Survived flags of all male passengers in the training set
# (single .loc call instead of chained indexing).
men = training.loc[training.Sex == 'male', 'Survived']
# Fraction (0-1) of men who survived; kept as a fraction so the variable's meaning is unchanged.
rate_men = men.mean()
# Format the fraction as a percentage so the printed number matches its "%" label
# (previously the raw fraction was printed next to "%").
print(f"{rate_men:.1%} of men survived")

Random Forest Model

In [None]:
# Baseline random-forest model and submission writer, currently disabled.
# Everything below is commented out except the final print, so running this cell
# trains nothing and writes no file; it only prints a reminder message.
#from sklearn.ensemble import RandomForestClassifier

#y = training["Survived"]

#features = ["Pclass", "Sex", "SibSp", "Parch"]
#X = pd.get_dummies(training[features])
#X_test = pd.get_dummies(testing[features])

#model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
#model.fit(X, y)
#predictions = model.predict(X_test)

#output = pd.DataFrame({'PassengerId': testing.PassengerId, 'Survived': predictions})
#output.to_csv('my_submission.csv', index=False)
#print("Your submission was successfully saved!")
print("lines commented for future submission")

The above code results in 75% correct predictions.  Looking through the given dataset, some basic exploratory data analysis (EDA) could be done to increase the accuracy.  There are a lot of missing values.  Additionally, it might help to work out which names belong to the same family.

In order to refine the results, I follow the guide at https://www.kaggle.com/allohvk/captivating-conversations-with-the-titanic-dataset, which gives insight into techniques that improve the fit.

In [None]:
# Print the column dtypes and non-null counts of the training set.
training.info()
# The output reveals that the Age, Cabin, and Embarked columns have missing values; the Age column in particular is missing many.

In [None]:
# Print the column dtypes and non-null counts of the test set.
testing.info()

Find the number of passengers in each class and then a percentage of each class that survived

In [None]:
# Separator string printed between outputs; later cells reuse this name.
# (No `global` declaration: at notebook top level the assignment is already global.)
blankline = '\n*****************************\n'

# Non-null count of every column, per ticket class.
per_class = training.groupby('Pclass')
print(per_class.count())

# Mean of the Survived flag per ticket class, i.e. the share of survivors.
class_outcomes = training[['Pclass', 'Survived']]
print(blankline, class_outcomes.groupby('Pclass').mean())

This shows that first class passengers had the highest survival rate by a large margin.
We already know approximately 74% of female passengers survived but let's check survival by gender and ticket class.

In [None]:
# Survival rate in percent (one decimal) for each ticket-class / sex combination.
class_sex_rates = (
    training[['Sex', 'Pclass', 'Survived']]
    .groupby(['Pclass', 'Sex'])
    .mean()
)
print(round(class_sex_rates * 100, 1))

First and second class females had a very high chance of survival compared to any other group.
Next a few visualizations would be helpful in understanding different relationships in the data.
The first graph will visualize how age affects survival rates

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of passenger ages in 10 bins.
# The Axes returned by .plot() is kept and labelled rather than passed to print(),
# which only emitted the Axes object's text repr next to the figure.
ax = training['Age'].plot(kind='hist', bins=10)
ax.set_xlabel('Age')
ax.set_ylabel('Number of passengers')
ax.set_title('Age distribution (training set)');

Knowing females were more likely to survive than males it is important to know the age distribution by gender as well.

In [None]:
# Kernel density estimate of age, one curve per sex:
# female as a green outline, male as a red filled area.
plt.figure()
female_ages = training.loc[training.Sex == 'female', 'Age']
male_ages = training.loc[training.Sex == 'male', 'Age']
sns.kdeplot(female_ages, color="green", label='female')
# `fill=True` replaces the deprecated `shade=True` keyword, which newer seaborn no longer accepts.
sns.kdeplot(male_ages, color="red", fill=True, label='male')
plt.xlabel('Age')
# Without a legend the two curves cannot be told apart by a reader.
plt.legend();

This graph shows young people are more likely to survive and young females are especially likely to survive.  Next it will be helpful to break up the data by passenger class.  To do this we can use a violin plot of age and p-class.  Green represents survival while red is death.

In [None]:
# Split violin plot of age per ticket class; each half shows one outcome
# (0 drawn in red, 1 drawn in green).
plt.figure()
outcome_colours = {0: "r", 1: "g"}
sns.violinplot(
    data=training,
    x='Pclass',
    y='Age',
    hue='Survived',
    split=True,
    palette=outcome_colours,
);

This graph gives us an interesting understanding of survival rates based on age and ticket class.

In [None]:
def _survival_pct(frame, mask):
    """Return the survival rate, in percent rounded to one decimal, of the rows selected by ``mask``.

    Parameters:
        frame: DataFrame holding a ``Survived`` column with 0/1 values.
        mask: boolean Series aligned with ``frame`` that selects the group of interest.

    Returns:
        float percentage of selected rows with ``Survived == 1``; NaN when the
        selection is empty (instead of raising ZeroDivisionError).
    """
    group_size = int(mask.sum())
    if group_size == 0:
        return float('nan')
    survivors = int((mask & (frame['Survived'] == 1)).sum())
    return round(survivors / group_size * 100, 1)


# Age bands used below: "above Age 60" means Age > 59, "between 20-30" means 19 < Age < 31.
# Rows with a missing Age compare False and therefore fall in neither band.
over_59 = training['Age'] > 59
in_twenties = (training['Age'] > 19) & (training['Age'] < 31)

# One line per ticket class instead of six hand-copied expressions.
for pclass in (1, 2, 3):
    in_class = training['Pclass'] == pclass
    print(f'Pclass {pclass} survivors above Age 60:',
          _survival_pct(training, in_class & over_59), '%')

for pclass in (1, 2, 3):
    in_class = training['Pclass'] == pclass
    print(f'Pclass{pclass} survivors between 20-30 Age:',
          _survival_pct(training, in_class & in_twenties), '%')

This presents the previous breakdown more concisely, demonstrating that first-class individuals are very likely to live, especially if they are between the ages of 20 and 30.  Generally, passengers between 20 and 30 are more likely to survive than older individuals in the same passenger class.

Next it is helpful to expand on the previous graphs and explore the survivors in each age range categorized by gender.  While it is safe to assume more females will survive in each class it might be helpful to know to what degree.

In [None]:
# Add a combined label column made of the ticket class followed by the sex
# (the values listed in `class_sex_order` below).
training['PclassSex'] = training['Pclass'].astype(str) + training['Sex']

# Left-to-right order of the violins: the three male groups, then the three female groups.
class_sex_order = ['1male', '2male', '3male', '1female', '2female', '3female']

plt.figure(figsize=(15, 8))
sns.violinplot(
    data=training,
    x='PclassSex',
    y='Age',
    hue='Survived',
    split=True,
    cut=0,
    palette={0: "r", 1: "g"},
    order=class_sex_order,
)

What this shows is that class 1 females are very likely to survive regardless of age, although being between 20 and 40 is still helpful.  Class 1 males between 20 and 50 seem just as likely to survive as they are to die.  Class 2 males have almost no chance of surviving unless under 10, and class 3 males and females have similar distributions. What is really notable is the difference in pclass 2 and 3 male survival rates in this violin plot.  Let's dissect this quickly.

In [None]:
# Survival percentage of adult men (Age > 19) in second and in third class.
adult_men = (training['Age'] > 19) & (training['Sex'] == 'male')

second_class_men = training[(training['Pclass'] == 2) & adult_men]
second_class_saved = second_class_men[second_class_men['Survived'] == True]
print('Pclass 2 adult male survivors:',
      round(len(second_class_saved) / len(second_class_men) * 100, 1), '%')

third_class_men = training[(training['Pclass'] == 3) & adult_men]
third_class_saved = third_class_men[third_class_men['Survived'] == True]
print('Pclass 3 adult male survivors:',
      round(len(third_class_saved) / len(third_class_men) * 100, 1), '%')

The fact that male survival rates in third class are twice as high as in second class is very interesting.
The next category to investigate is the embarked column.

In [None]:
# One row of point plots per embarkation port: Survived against ticket class,
# with one line per sex.
# FacetGrid creates its own figure, so the preceding plt.figure() call was dropped;
# it only produced an additional empty figure in the cell output.
fg = sns.FacetGrid(training, row='Embarked', aspect=2)
fg.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
       hue_order=['female', 'male'], order=[1, 2, 3],
       palette={'female': "r", 'male': "g"})
fg.add_legend();

Survival rates seem fairly consistent across embarked ports.
The next category to consider is fare.

In [None]:
# Summary statistics of the Fare column.
fare_summary = training['Fare'].describe()
print(fare_summary)

There are people on the boat who did not pay a fare.

In [None]:
# Show every passenger whose recorded fare is exactly zero.
paid_nothing = training['Fare'] == 0
print(training[paid_nothing])
# These are noticeably all middle-aged males, so some of them are probably the cabin crew.

In [None]:
# Passengers with a fare of zero, selected once and reused for both tables.
zero_fare = training[training['Fare'] == 0]

# Non-null counts per column for each ticket class.
print(zero_fare.groupby('Pclass').count())
# This shows there are zero-cost fares in each Pclass.

# The same counts, further broken down by ticket number.
print(zero_fare.groupby(['Pclass', 'Ticket']).count())
# All Pclass 3 zero-cost tickets have the ticket number LINE, which is probably the cabin crew travelling as Pclass 3.
# There are missing age values that will need to be filled, and the remaining fare = 0 rows should be treated as missing data.


In [None]:
# Stacked histogram of fares split by outcome (survivors in green, deaths in red).
survivor_fares = training.loc[training['Survived'] == 1, 'Fare']
victim_fares = training.loc[training['Survived'] == 0, 'Fare']
plt.hist([survivor_fares, victim_fares], stacked=True, color=['g', 'r'],
         bins=50, label=['Survived', 'Dead'])
plt.xlabel('Fare')
plt.ylabel('Number of passengers')
plt.legend();

#This histogram of fare survival rates demonstrates that higher fares provide higher survival rates, but pclass already shows this well
#It might help to expand on the low fare category

# Passengers who paid something, but less than 7.1, and all third-class men for comparison.
cheapest_paid = training[(training['Fare'] < 7.1) & (training['Fare'] > 0)]
third_class_men = training[(training['Pclass'] == 3) & (training['Sex'] == 'male')]

print(len(cheapest_paid), blankline)
# mean(numeric_only=True) averages only the numeric columns. The frame also holds text
# columns (Name, Sex, Ticket, ...), for which .agg('mean') raises a TypeError on
# pandas >= 2.0 and relied on silently dropping them on older versions.
print(cheapest_paid.mean(numeric_only=True), blankline)
print(third_class_men.mean(numeric_only=True), blankline)

#These 23 individuals had a 4.3% survival rate which is abnormally low when it should be around 13.5% as we saw earlier.
#The age is also statistically significant and can be separated and investigated.

# Low-fare (< 9) passengers and children (Age < 14), separately and combined.
low_fare = training['Fare'] < 9
is_child = training['Age'] < 14

print(training[low_fare & is_child])
print(len(training[low_fare]))
print(len(training[is_child]))

#Out of the 311 people with fare < 9 only 2 of them are children and the rest are among the rest of the passengers.
#This means that any missing ages within the 311 low fare passengers belongs to an adult.  Let's narrow down what we can use to fill the ages.
#First lets see how many people in the group are elderly and then how many of them are missing values.

print(len(training[low_fare & (training['Age'] > 49)]))

#10 people over the age of 49, which means almost all low fare passengers are middle aged.

print(len(training[low_fare & training['Age'].isnull()]))

#99 missing values in the group.  Since we've done this analysis we can fill these 99 ages more accurately than doing a generic calculation for all 177 missing age values.

Now we should determine if calculating a fare would be a good idea for the missing fare values.

In [None]:
# In order to find the number of people on each ticket we need both training and testing data.
# pd.concat stacks the two frames exactly as DataFrame.append did (original indexes kept,
# columns not sorted); DataFrame.append itself no longer exists in pandas >= 2.0.
combined = pd.concat([training, testing])

# Number of passengers (train + test) sharing each ticket, and the fare split evenly among them.
training['PeopleInTicket'] = training['Ticket'].map(combined['Ticket'].value_counts())
training['FarePerPerson'] = training['Fare'] / training['PeopleInTicket']

# Family size on board: parents/children + siblings/spouses + the passenger themself.
training['FamilyCount'] = training['Parch'] + training['SibSp'] + 1

# NOTE: this changes the display limits for the whole notebook session, not just this cell.
pd.set_option("display.max_rows", None, "display.max_columns", None)
display(training.head())

# Rows where the family size and the number of people on the ticket disagree.
print(len(training[training['FamilyCount'] != training['PeopleInTicket']]))

#There are 195 rows where family count doesn't match the number of people in the ticket.

In [None]:
# Passengers whose per-person fare is above zero but below 7.1:
# first the number who died, then the number who survived.
cheap_share = (training['FarePerPerson'] < 7.1) & (training['FarePerPerson'] > 0)

died = training[cheap_share & (training['Survived'] == 0)]
print(len(died))

lived = training[cheap_share & (training['Survived'] == 1)]
print(len(lived))

This is a significant finding.  Despite people in the lowest ticket bracket having a 95% mortality rate, groups of people had a much higher survival rate than individuals.  It will be helpful to look at survival with regard to the size of the group each passenger travelled with.

In [None]:
# Row-wise maximum of the family size and the number of people sharing the ticket.
size_columns = training[['FamilyCount', 'PeopleInTicket']]
training['GroupSize'] = size_columns.max(axis='columns')

# NOTE(review): GroupSize is computed above, but the chart below is drawn on FamilyCount — confirm this is intended.
plt.figure(figsize=(16, 6))
sns.countplot(data=training, x='FamilyCount', hue='Survived')

This demonstrates the low survival rate of solo travelers on the ship.  Another observation is that traveling in a group of up to 4 is better, but once the group size hits 5 or more the survival rate drops off sharply.

In [None]:
# Row counts for each combination of family-size band and ticket-class band.
small_family = training.FamilyCount.between(2, 4)
large_family = training.FamilyCount > 4
upper_classes = training.Pclass.between(1, 2)
third_class = training.Pclass == 3

print('Between 2-4 familycount in Pclass 1,2: ', len(training[small_family & upper_classes]))
print('Between 2-4 familycount in Pclass 3: ', len(training[small_family & third_class]))
print('>4 familycount in Pclass 1,2: ', len(training[large_family & upper_classes]))
print('>4 familycount in Pclass 3: ', len(training[large_family & third_class]))

In [None]:
# Outcome counts per family size, leaving out passengers with a family count of 1.
not_solo = training.FamilyCount > 1

plt.figure(figsize=(16, 6))
sns.countplot(data=training[not_solo], x='FamilyCount', hue='Survived')

# The same chart restricted to third class.
third_class_groups = training[(training.Pclass == 3) & not_solo]
plt.figure(figsize=(16, 6))
sns.countplot(data=third_class_groups, x='FamilyCount', hue='Survived')

This comparison shows the survival rates of groups in class 3.  It shows that, apart from not being solo, travelling in a group does not provide much additional benefit to survival rates.  As a check, we should compare the survival rates of solo travelers with the survival rates by gender.

In [None]:
# Mortality rate (whole percent) among passengers with FamilyCount == 1:
# all of them first, then men only, then women only.
solo = training.FamilyCount == 1
died = training.Survived != 1

print('Mortality rate overall: ',
      round(len(training[solo & died]) / len(training[solo]) * 100), '%')

solo_men = solo & (training.Sex == 'male')
print('Mortality rate Male: ',
      round(len(training[solo_men & died]) / len(training[solo_men]) * 100), '%')

solo_women = solo & (training.Sex == 'female')
print('Mortality rate Female: ',
      round(len(training[solo_women & died]) / len(training[solo_women]) * 100), '%')


Since the survival rate of female travelers was already high it will be useful to graph the mortality rates between female solo travelers and group travelers.

In [None]:
# Outcome counts per family size for third-class female passengers only.
third_class_women = training[(training.Sex == 'female') & (training.Pclass == 3)]

plt.figure(figsize=(16, 6))
sns.countplot(data=third_class_women, x='FamilyCount', hue='Survived')

Here we see an interesting finding.  The mortality rate for solo females is much lower than the mortality rate for non-solo females.  Perhaps females with families were less likely to leave without them, making solo females more likely to survive.
Next we need to investigate the embarked and cabin columns.  These are unlikely to have real effects on survival rates.

In [None]:
# New column holding the first character of the Cabin value.
training['CabinLetter'] = training['Cabin'].str.get(0)

# Number of passengers and mean of Survived for every cabin letter / class / sex group.
cabin_groups = training.groupby(['CabinLetter', 'Pclass', 'Sex'])
print(cabin_groups['Survived'].agg(['count', 'mean']))

Cabins A, B, and C are all Pclass 1; D, E, and F contain some Pclass 2 passengers; and E, F, and G contain Pclass 3 passengers.
Next comes the embarked column

In [None]:
# Count, mean and group size of Survived for each embarkation port.
by_port = training.groupby('Embarked')['Survived']
print(by_port.agg(['count', 'mean', 'size']))
# Port C has a high survival rate of 55%; maybe fewer 3rd class individuals boarded at this port.

# The same count and mean, broken down further by ticket class.
by_port_and_class = training.groupby(['Embarked', 'Pclass'])['Survived']
print(by_port_and_class.agg(['count', 'mean']))

# Since more first class passengers embarked at port C, this explains the high survival rate of the port.

Finally we need to explore the name column.  The only real comparison I can do here is by name length.

In [None]:
# Store the number of characters in each passenger's name as an integer column.
name_lengths = training["Name"].map(len)
training["NameLength"] = name_lengths.astype(int)

# Outcome counts for every name length.
plt.figure(figsize=(16, 6))
sns.countplot(data=training, x='NameLength', hue='Survived')

# What is interesting about this graph is that people with longer names tended to survive more often than they didn't. In order to explore this it might be helpful to see a few of the names with len > 35.

#print(training[training["NameLength"]> 35]) #A long output
# Many of these names are females with maiden names.  This is likely the reason for the name/survival relationship.  The males with long names are less likely to survive than the females in this category.

Having analyzed all columns of the data, we can draw a few useful conclusions.  Pclass 1 and 2 females almost always survive.  The next reliable indicator is groups, but not all groups survive, as shown earlier.  The key connection is that members within a group have correlated survival.  So it will be helpful to build groups of families on the Titanic to use as a good indicator for members who are not female.  First I should fill in some missing values and then create the groupings, which will be useful when training.