In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Explore & Visualize the data

## Loading the training set

In [2]:
import os

import logging
# No name is passed, so this is the root logger -- the same logger that
# logging.basicConfig() below configures.
logger = logging.getLogger()
# NOTE(review): DEBUG on the root logger may also surface debug records
# emitted by imported libraries -- raise the level if the output gets noisy.
logging.basicConfig(
    level=logging.DEBUG,
# Optional settings kept for reference (log to a file, add timestamps).
# Enabling the commented `format` requires removing the active one below,
# since a keyword argument cannot be passed twice.
#     filename='foo.log',
#     filemode='w',
#     format='%(asctime)s:%(message)s',
#     datefmt='%m/%d/%Y',
    format='%(levelname)s:%(message)s'
)

In [3]:
class TitanicETL:
    '''Load the Titanic training CSV and apply this project's transformations.

    Construction only records the (absolute) path of the CSV file; nothing is
    read from disk until `get()` is called.

    Attributes:
        location: absolute path of the CSV file, or None when no usable path
            was supplied (the problem is logged at construction time).
    '''
    def __init__(self, location=''):
        '''Remember where the CSV lives.

        Args:
            location: path to the CSV file. An empty/falsy value, or a value
                that cannot be turned into a path, is logged and leaves
                `self.location` set to None instead of raising.
        '''
        # Always define the attribute so that a failed construction surfaces
        # later as a clear error rather than an AttributeError.
        self.location = None
        if not location:
            logger.critical('Please specify a file.')
            return
        try:
            self.location = os.path.abspath(location)
        except Exception:
            # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            logger.exception('Could not load file.')

    def _load_from_file(self):
        '''Read the CSV, index it by PassengerId and drop unused columns.

        Returns:
            DataFrame indexed by 'PassengerId' without the 'Age' and
            'Unnamed: 0' columns.

        Raises:
            ValueError: if the instance was constructed without a usable path.
            KeyError: if 'PassengerId', 'Age' or 'Unnamed: 0' is missing
                from the file.
        '''
        if self.location is None:
            raise ValueError(
                'No CSV location set; construct TitanicETL with a file path.'
            )
        return (
            pd.read_csv(self.location)
            .set_index('PassengerId')
            .drop(columns=['Age', 'Unnamed: 0'])
        )

    def _add_columns(self, df):
        '''Return a copy of `df` with the derived feature columns added.

        Adds 'HasCabin': True where 'Cabin' is not null. The input frame is
        left untouched (`assign` returns a new frame).
        '''
        return df.assign(HasCabin=df['Cabin'].notna())

    def get(self):
        '''Run the whole pipeline (load, then derive columns) and return the frame.'''
        return self._load_from_file().pipe(self._add_columns)

# Study attributes and their characteristics

- dtypes
- NaNs
- Distributions
- etc.

## Potentially useful features

- has cabin
- in cabin shared by X passengers

In [4]:
# Only records the (absolute) path here; the CSV is not read until .get() is called.
data = TitanicETL('data/copy_custom_train.csv')

In [5]:
# Reads the CSV, indexes it by PassengerId, drops 'Age' and 'Unnamed: 0',
# and adds the boolean 'HasCabin' column.
df = data.get()

In [6]:
# dtypes and non-null counts per column -- a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 1 to 891
Data columns (total 11 columns):
Survived    713 non-null int64
Pclass      713 non-null int64
Name        713 non-null object
Sex         713 non-null object
SibSp       713 non-null int64
Parch       713 non-null int64
Ticket      713 non-null object
Fare        713 non-null float64
Cabin       165 non-null object
Embarked    712 non-null object
HasCabin    713 non-null bool
dtypes: bool(1), float64(1), int64(4), object(5)
memory usage: 62.0+ KB


# Describe

In [8]:
# include='all' also summarises the non-numeric columns (count / unique / top / freq).
df.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin
count,713.0,713.0,713,713,713.0,713.0,713.0,713.0,165,712,713
unique,,,713,2,,,574.0,,127,3,2
top,,,"Skoog, Miss. Mabel",male,,,1601.0,,C23 C25 C27,S,False
freq,,,1,455,,,6.0,,4,508,548
mean,0.389902,2.30014,,,0.476858,0.382889,,33.813054,,,
std,0.48807,0.841155,,,0.965608,0.805939,,53.941465,,,
min,0.0,1.0,,,0.0,0.0,,0.0,,,
25%,0.0,2.0,,,0.0,0.0,,7.8958,,,
50%,0.0,3.0,,,0.0,0.0,,14.5,,,
75%,1.0,3.0,,,1.0,0.0,,30.6958,,,


# Exploration

## Embarked

One passenger has no `Embarked` value. If we decide to use `Embarked` as a feature in the ML model, we'll drop this row; otherwise we keep her.

In [9]:
# Boolean mask: keep only the rows whose Embarked value is missing.
df[df['Embarked'].isnull()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
62,1,1,"Icard, Miss. Amelie",female,0,0,113572,80.0,B28,,True


## Age

Let's get rid of it? There are too many `NaN`s. (The `TitanicETL` class already drops the `Age` column when loading.)

## Cabins

Is there a correlation between the cabin and the ticket price (`Fare`)?

Maybe split the letter part and the numeric part and aggregate them somehow?

In [10]:
# Distinct cabin strings, sorted. Missing values are dropped explicitly rather
# than by slicing off the first unique value, which is only NaN when the first
# row happens to have no cabin.
cabins = np.sort(df['Cabin'].dropna().unique())

In [11]:
# Number of distinct cabin strings, then the array itself
# (the last expression of a cell is displayed).
print(len(cabins))
cabins

127


array(['A10', 'A14', 'A16', 'A20', 'A23', 'A24', 'A26', 'A31', 'A32',
       'A34', 'A36', 'A5', 'A6', 'A7', 'B101', 'B102', 'B18', 'B19', 'B22',
       'B28', 'B3', 'B35', 'B37', 'B38', 'B39', 'B4', 'B41', 'B42', 'B49',
       'B5', 'B50', 'B51 B53 B55', 'B57 B59 B63 B66', 'B58 B60', 'B69',
       'B71', 'B73', 'B77', 'B78', 'B79', 'B80', 'B82 B84', 'B94',
       'B96 B98', 'C103', 'C104', 'C106', 'C111', 'C118', 'C123', 'C124',
       'C125', 'C126', 'C128', 'C148', 'C2', 'C22 C26', 'C23 C25 C27',
       'C30', 'C32', 'C45', 'C46', 'C49', 'C50', 'C52', 'C54', 'C62 C64',
       'C65', 'C68', 'C7', 'C78', 'C82', 'C83', 'C87', 'C91', 'C92', 'C93',
       'C95', 'C99', 'D', 'D10 D12', 'D11', 'D15', 'D17', 'D20', 'D21',
       'D26', 'D28', 'D35', 'D36', 'D37', 'D45', 'D47', 'D48', 'D49',
       'D50', 'D56', 'D6', 'D7', 'D9', 'E101', 'E12', 'E121', 'E24', 'E25',
       'E31', 'E33', 'E34', 'E36', 'E38', 'E40', 'E44', 'E46', 'E49',
       'E50', 'E58', 'E63', 'E67', 'E77', 'E8', 'F G63', 

# Appendix A: Stuff I've learned here

1. In addition to `DataFrame.isnull()` there is `DataFrame.notna()`, which returns the boolean inverse (`True` where a value is present).
1. To use a column as the index use `DataFrame.set_index('Column')`.