In [None]:
# Let's start with some standard imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [None]:
# Downloading and unzipping dataset
# Only need to run once

#!cd data/ && kaggle competitions download -c titanic
#!cd data/ && unzip titanic.zip

In [None]:
# Read the training split into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="data/train.csv")
df.head(n=5)

In [None]:
# Our primary cross validation method will be K-fold CV
# Since our dataset and models will be small, we'll have resources to train several models to estimate model variance
# Because of this, we won't have an individual val set - we're safe to do EDA on the whole given dataset

In [None]:
# Let's look at a summary of the data
# (describe() is called with its default arguments; the result is rendered as the cell output)
df.describe()

In [None]:
# Get a list of features
# Note: this is every column of the loaded frame, so it also contains the Survived target
df.columns

In [None]:
# Total passenger count = number of rows in the loaded frame
N = df.shape[0]

In [None]:
# First, let's look at passenger id
# It appears like it might just be a linear increment
print(df.PassengerId.head(100))

# Let's confirm: PassengerId should be exactly 1..N in row order.
# Compare positionally (to_numpy) in one vectorized check instead of looping over
# df.PassengerId[n], which is a label-based lookup and only works while the frame
# still has its default 0..N-1 index.
expected_ids = np.arange(1, N + 1)
assert np.array_equal(df.PassengerId.to_numpy(), expected_ids), \
    "PassengerId is not a simple 1..N increment"
# Looks like we're good to go

In [None]:
# Next, let's look at the Survived column
# This is our y - the value we're trying to predict
# Let's confirm we're only getting binary values.
# NOTE: `[0, 1] in <ndarray>` is an element-wise == followed by any(), so it depends on
# the order/length of the unique values and is not a membership test. isin() checks
# every row instead, and also fails on NaN.
assert df.Survived.isin([0, 1]).all(), "Survived contains values other than 0/1"
survived_cnts = df.Survived.value_counts(dropna=False)
print(survived_cnts)
survived_cnts.plot(kind='bar')
# Less than 40% of people in this dataset survived

In [None]:
# Pclass (the ticket class) comes next
# It looks like a ternary feature
# Preprocessing TODO: one-hot encode this column
pclass_cnts = df["Pclass"].value_counts(dropna=False)
print(pclass_cnts)
pclass_cnts.plot.bar()

In [None]:
# Now the Name column
# Every name is unique (asserted right below)
# Deeper work on text features is deferred until later
assert df["Name"].is_unique

# Count the most common tokens across all names (first names, last names, prefix, suffix, etc.)
from collections import Counter
words = " ".join(df["Name"])
name_word_cnts = Counter()
name_word_cnts.update(words.split())
print(name_word_cnts.most_common(n=20))

In [None]:
# Let's create a word cloud with all of the words in the names column
# from wordcloud import WordCloud, STOPWORDS
# wordcloud = WordCloud(width= 3000, height = 2000, random_state=1, background_color='#73B8E9', colormap='Pastel1', collocations=False, stopwords = STOPWORDS).generate(words)
# plt.figure(figsize=(12, 8))
# plt.imshow(wordcloud, interpolation="bilinear")
# plt.axis('off')
# plt.show()

In [None]:
# Sex is next
# Check whether it only takes two values (NaN is kept as its own category via dropna=False)
sex_cnts = df["Sex"].value_counts(dropna=False)
sex_cnts.plot.bar()
# Later on this gets encoded as a single binary 0/1 feature

In [None]:
# Next is Age
# Keep NaN in the counts (dropna=False), as done for the categorical columns above, so
# missing ages show up here instead of being silently dropped
age_cnts = df.Age.value_counts(dropna=False)
print(age_cnts)
# Interestingly, we have fractional ages here
df.Age.hist()
# May be a skewed normal distribution?
# May want to create feature bins for age groups (baby, child, adult, etc.)

In [None]:
# SibSp: number of siblings / spouses on board
sibsp_cnts = df["SibSp"].value_counts()
print(sibsp_cnts)
df["SibSp"].hist()
# Idea: collapse this into a binary feature - i.e. is the passenger travelling with family?
# Also worth noting: we seem to have five passengers in a family of 5 (the same family?)
# and 7 passengers in a family of 8, so one of the eight appears to be missing from this list.
# It may be in the test set.
# This feature MAY cause overfitting, since these values sit at the edge of the feature's range

In [None]:
# Parch: number of parents / children on the Titanic
print(df["Parch"].value_counts())
df["Parch"].hist()
# Most passengers have no parents or children on board
# Among the non-zero values, 1 is the most common
# Idea: turn this into categorical features?
# Perhaps just 0, 1, 2, and 2+?

In [None]:
# Next is ticket id
# This seems more like a string?
# Print the preview explicitly: a bare `df.Ticket.head()` in the middle of a cell is never
# rendered (only a cell's last expression is).
print(df.Ticket.head())
# Let's extract the most common words
ticket_words_list = " ".join(df.Ticket).split()
ticket_word_cnts = Counter(ticket_words_list)
print(ticket_word_cnts.most_common(40))
# Surprisingly some of these are words, but some are numbers too - what's the relation? Do families share ticket numbers?
# Let's grab the numbers and make a histogram
# isdecimal() (unlike isnumeric()) only accepts tokens that int() can actually parse
ticket_nums_series = pd.Series([int(n) for n in ticket_words_list if n.isdecimal()])
# Use an explicit Axes so the log scale is applied to this histogram rather than to
# whatever pyplot currently considers the active axes
fig, ax = plt.subplots()
ax.set_xscale('log')
ticket_nums_series.hist(ax=ax)
print(ticket_nums_series.describe())
# Seems to mostly look like a linear increment with some values missing

In [None]:
# Fare is next
print(df["Fare"].describe())
df["Fare"].hist()
# Looks like some people got in for free!
print(df["Fare"].value_counts())
# Binning these values could help - there are definitely some standard price points

In [None]:
# Univariate EDA is nearly finished - cabin numbers are next
print(df["Cabin"].value_counts())
print(df["Cabin"].describe())
# What do the letters stand for? Floor numbers? They could be correlated with class
# This column also appears to have missing values
print(f"Number of non-null values: {len(df.Cabin.dropna())} / {len(df.Cabin)}")
# An "is NaN" indicator feature for missing cabin numbers is probably worth including
text = " ".join(df["Cabin"].dropna())
# Tally each alphabetic character (case-insensitive) found in the cabin strings
Counter(filter(str.isalpha, text.lower()))

In [None]:
# And lastly - Embarked, which encodes the port where the passenger boarded
# dropna=False keeps any missing ports visible as their own bar, consistent with the
# other categorical columns above
embarked_cnts = df.Embarked.value_counts(dropna=False)
embarked_cnts.plot(kind='bar')

In [None]:
# Next let's start looking at correlations between pairs of variables

# PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
# Since Survived is the variable to be predicted, let's first correlate Survived with all of the other variables

# 1. Survived, PassengerId
# These two subsets are reused by the Survived-vs-feature cells that follow
no_survived_df = df[df.Survived == 0]
survived_df = df[df.Survived == 1]

# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 (renamed `tick_labels`);
# labelling the ticks separately works on every version. Boxes are drawn at x = 1, 2 by default.
plt.boxplot([no_survived_df.PassengerId, survived_df.PassengerId])
plt.xticks([1, 2], ['0', '1'])
plt.ylabel("PassengerId")
plt.xlabel("Survived")
df[['PassengerId', 'Survived']].corr()

# There is not much correlation at all between passenger id and survived

In [None]:
# 2. Survived, Pclass

# Label the ticks via xticks rather than boxplot's `labels=` keyword, which is deprecated
# since Matplotlib 3.9 (boxes are drawn at x = 1, 2 by default)
plt.boxplot([no_survived_df.Pclass, survived_df.Pclass])
plt.xticks([1, 2], ['0', '1'])
plt.ylabel("Pclass")
plt.xlabel("Survived")
print(df[['Pclass', 'Survived']].corr())

# If we maintain ordering for pclass, we see a weak correlation
# It is negative - meaning a higher Pclass number (i.e. a lower ticket class) has a lower chance of surviving
# While the survivors seem to be pretty uniformly distributed between each class, those who perished are often in the lower classes

df[['Pclass', 'Survived']].groupby('Survived').value_counts()
# When we look at counts for pairs of values, those who did not survive are disproportionately 3rd class passengers

In [None]:
# 3. Survived, Name

In [None]:
# 4. Survived, Sex
# Encode Sex as 0 for 'female' and 1 for anything else (vectorized; the Series keeps the name "Sex")
is_male_series = df.Sex.ne('female').astype(int)
print(is_male_series.corr(df.Survived))
# We have a moderate association between sex and survival
# It is negative - meaning males had a lower chance of survival

# Count each (Survived, Sex) pair once and reuse it for both tables below
sex_survived_cnts = pd.concat([is_male_series, df.Survived], axis=1).groupby("Survived").value_counts()

# Exact values of pairs
# (printed explicitly: only the last expression of a cell is rendered automatically)
print(sex_survived_cnts)

# Fraction of the total dataset in each pair
sex_survived_cnts / len(df)
# Over half of the dataset is males who died

In [None]:
# 5. Survived, Age
print(df[['Age', 'Survived']].dropna().corr())
# Label the ticks via xticks rather than boxplot's `labels=` keyword, which is deprecated
# since Matplotlib 3.9 (boxes are drawn at x = 1, 2 by default)
plt.boxplot([no_survived_df.Age.dropna(), survived_df.Age.dropna()])
plt.xticks([1, 2], ['0', '1'])
plt.ylabel("Age")
plt.xlabel("Survived")
# Broadly speaking, there's no association with age and survival
# This is surprising - I would've assumed children would've had a higher chance of survival
# Also that older folks would've had a lower chance of survival - we can break this up more later when we bin ages

In [None]:
# 6. Survived, SibSp
print(df[['SibSp', 'Survived']].dropna().corr())
# Label the ticks via xticks rather than boxplot's `labels=` keyword, which is deprecated
# since Matplotlib 3.9 (boxes are drawn at x = 1, 2 by default)
plt.boxplot([no_survived_df.SibSp.dropna(), survived_df.SibSp.dropna()])
plt.xticks([1, 2], ['0', '1'])
plt.ylabel("SibSp")
plt.xlabel("Survived")
# Largely there's no correlation if you look at the bulk of the data
# When we look at outliers, the bigger families tended to not survive
# Maybe an interaction term between SibSp and 'isOutlier' could be useful?
# I think it might overfit though and compromise average performance

In [None]:
# 7. Survived, Parch
print(df[['Parch', 'Survived']].dropna().corr())
# Label the ticks via xticks rather than boxplot's `labels=` keyword, which is deprecated
# since Matplotlib 3.9 (boxes are drawn at x = 1, 2 by default)
plt.boxplot([no_survived_df.Parch.dropna(), survived_df.Parch.dropna()])
plt.xticks([1, 2], ['0', '1'])
plt.ylabel("Parch")
plt.xlabel("Survived")
# It seems like not surviving is concentrated at 0
# Let's group it
df[['Parch', 'Survived']].dropna().groupby('Survived').value_counts()
# Those without parents or children on board were more likely to not survive. Maybe it's correlated with being an adult male?

In [None]:
# 8. Survived, Ticket

In [None]:
# 9. Survived, Fare

In [None]:
# 10. Survived, Cabin

In [None]:
# 11. Survived, Embarked

In [None]:
# 12. PassengerId, Pclass

In [None]:
# 13. PassengerId, Name

In [None]:
# 14. PassengerId, Sex

In [None]:
# 15. PassengerId, Age

In [None]:
# 16. PassengerId, SibSp

In [None]:
# 17. PassengerId, Parch

In [None]:
# 18. PassengerId, Ticket

In [None]:
# 19. PassengerId, Fare

In [None]:
# 20. PassengerId, Cabin

In [None]:
# 21. PassengerId, Embarked

In [None]:
# 22. Pclass, Name

In [None]:
# 23. Pclass, Sex

In [None]:
# 24. Pclass, Age

In [None]:
# 25. Pclass, SibSp

In [None]:
# 26. Pclass, Parch

In [None]:
# 27. Pclass, Ticket

In [None]:
# 28. Pclass, Fare

In [None]:
# 29. Pclass, Cabin

In [None]:
# 30. Pclass, Embarked

In [None]:
# 31. Name, Sex

In [None]:
# 32. Name, Age

In [None]:
# 33. Name, SibSp

In [None]:
# 34. Name, Parch

In [None]:
# 35. Name, Ticket

In [None]:
# 36. Name, Fare

In [None]:
# 37. Name, Cabin

In [None]:
# 38. Name, Embarked

In [None]:
# 39. Sex, Age

In [None]:
# 40. Sex, SibSp

In [None]:
# 41. Sex, Parch

In [None]:
# 42. Sex, Ticket

In [None]:
# 43. Sex, Fare

In [None]:
# 44. Sex, Cabin

In [None]:
# 45. Sex, Embarked

In [None]:
# 46. Age, SibSp

In [None]:
# 47. Age, Parch

In [None]:
# 48. Age, Ticket

In [None]:
# 49. Age, Fare

In [None]:
# 50. Age, Cabin

In [None]:
# 51. Age, Embarked

In [None]:
# 52. SibSp, Parch

In [None]:
# 53. SibSp, Ticket

In [None]:
# 54. SibSp, Fare

In [None]:
# 55. SibSp, Cabin

In [None]:
# 56. SibSp, Embarked

In [None]:
# 57. Parch, Ticket

In [None]:
# 58. Parch, Fare

In [None]:
# 59. Parch, Cabin

In [None]:
# 60. Parch, Embarked

In [None]:
# 61. Ticket, Fare

In [None]:
# 62. Ticket, Cabin

In [None]:
# 63. Ticket, Embarked

In [None]:
# 64. Fare, Cabin

In [None]:
# 65. Fare, Embarked

In [None]:
# 66. Cabin, Embarked