In [12]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Explore & Visualize the data

## Loading the training set

In [2]:
import os

import logging
# Configure root logging once for the whole notebook: INFO and above,
# rendered as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# Handle on the root logger for use in later cells.
logger = logging.getLogger()

In [3]:
from titanic2 import TitanicETL

# Study attributes and their characteristics

- dtypes
- NaNs
- Distributions
- etc.

## Potentially useful features

- has cabin
- in cabin shared by X passengers
- One-hot by cabin letter

In [4]:
# Point the ETL helper at the training CSV (path is relative to the working directory).
# NOTE(review): TitanicETL lives in the local titanic2 module; what it does at
# construction time is not visible from this notebook — confirm there.
data = TitanicETL('data/copy_custom_train.csv')

In [5]:
# Fetch the DataFrame from the ETL object; the cells below all operate on `df`.
df = data.get()

# Describe

In [13]:
# include='all' summarises every column, not only the numeric ones, which is
# why the non-numeric HasCabin column gets unique/top/freq entries.
df.describe(include='all')

Unnamed: 0,Survived,Pclass,Fare,HasCabin,C0,CA,CB,CC,CD,CE,CF,CG,C,Q,S,NSex
count,713.0,713.0,713.0,713,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0
unique,,,,2,,,,,,,,,,,,
top,,,,False,,,,,,,,,,,,
freq,,,,548,,,,,,,,,,,,
mean,0.389902,2.30014,33.813054,,0.768583,0.019635,0.056101,0.064516,0.036466,0.035063,0.014025,0.00561,0.197756,0.088359,0.712482,0.638149
std,0.48807,0.841155,53.941465,,0.422034,0.138841,0.230278,0.245843,0.187577,0.184068,0.117677,0.074743,0.398587,0.284016,0.452923,0.480873
min,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,7.8958,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,14.5,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,1.0,3.0,30.6958,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


# Exploration

## Embarked

`Embarked` has a null value. If we decide to use `Embarked` as a feature for the ML model, we'll drop the row with the null; otherwise we keep that passenger.

Let's turn this into a one hot and get rid of the original column.

## Age

Should we get rid of it? There are too many `NaN`s.

## Cabins

Is there a correlation with price?

Let's turn this into a boolean column as well as a one hot encoded matrix. Ok, done.

## Sex

We'll have to turn the `male`/`female` values into ones and zeroes. In this case Male=1, Female=0; the result is stored in the `NSex` column.

## Name, Ticket, SibSp and Parch

These are too unique to really tell me anything. I'm going to drop them.

# Visualize the data

[Scatter Matrix](https://github.com/pandas-dev/pandas/blob/2fdf1e256e5e0b7f1fe909629e2f0b7893c8c7c3/pandas/plotting/_misc.py#L17)

In [7]:
from pandas.plotting import scatter_matrix

In [11]:
fig = plt.figure(figsize=(30,20))
ax = fig.add_axes([1,1,1,1])
plt.plot([1,2])

%time scatter_matrix(df, ax=ax)

plt.savefig('test.png')

  out = eval(code, glob, local_ns)


CPU times: user 8.62 s, sys: 48 ms, total: 8.67 s
Wall time: 8.67 s


# Appendix A: Stuff I've learned here

1. In addition to `DataFrame.isnull()` we also have `notna()` which returns the inverse.
1. To use a column as the index use `DataFrame.set_index('Column')`.
1. Scikit-Learn has a `OneHotEncoder()` but Pandas has `pd.get_dummies(data)`.

# Appendix B: Annotations

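**Cabin letters** have been turned into a one-hot encoded matrix where `C0` means the passenger did not have a cabin. `CX` then means the passenger was in a cabin starting with the letter `X` (e.g. `CB` for a cabin starting with `B`). Note that the bare `C` column is not a cabin letter: it is the Cherbourg embarkation flag described below.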

**Embarked letters** have been turned into a one hot encoded matrix. Port of Embarkation: C = Cherbourg, Q = Queenstown, S = Southampton.

# Appendix C: Available columns

In [9]:
# Per-column non-null counts and dtypes, plus the index range and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 1 to 891
Data columns (total 16 columns):
Survived    713 non-null int64
Pclass      713 non-null int64
Fare        713 non-null float64
HasCabin    713 non-null bool
C0          713 non-null uint8
CA          713 non-null uint8
CB          713 non-null uint8
CC          713 non-null uint8
CD          713 non-null uint8
CE          713 non-null uint8
CF          713 non-null uint8
CG          713 non-null uint8
C           713 non-null uint8
Q           713 non-null uint8
S           713 non-null uint8
NSex        713 non-null int8
dtypes: bool(1), float64(1), int64(2), int8(1), uint8(11)
memory usage: 51.3 KB


In [10]:
# Peek at the first five rows (the frame is indexed by PassengerId).
df.head()

Unnamed: 0_level_0,Survived,Pclass,Fare,HasCabin,C0,CA,CB,CC,CD,CE,CF,CG,C,Q,S,NSex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,7.25,False,1,0,0,0,0,0,0,0,0,0,1,1
3,1,3,7.925,False,1,0,0,0,0,0,0,0,0,0,1,0
4,1,1,53.1,True,0,0,0,1,0,0,0,0,0,0,1,0
5,0,3,8.05,False,1,0,0,0,0,0,0,0,0,0,1,1
6,0,3,8.4583,False,1,0,0,0,0,0,0,0,0,1,0,1
