In [1]:
import dask.dataframe as dd
import pandas as pd
from collections import OrderedDict

In [2]:
# Load the Titanic passenger CSV as a Dask dataframe
# (the path is relative to the notebook's working directory).
df = dd.read_csv("data/titanic.csv")

In [3]:
# Preview the first rows to check that the columns parsed as expected.
# NOTE(review): the "Unnamed: 0" column in the output suggests the CSV was
# written with its index included -- confirm and drop it if it is not needed.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def get_count(data, feat_name):
    """Return the number of non-null entries in column ``feat_name``.

    Args:
        data: Dask dataframe (or any object whose columns expose
            ``count()`` returning something with ``compute()``).
        feat_name: Name of the column to count.

    Returns:
        The evaluated count, as produced by ``compute()``.
    """
    # count() on its own only hands back a lazy Scalar; compute() is what
    # evaluates it into an actual number.
    lazy_count = data[feat_name].count()
    return lazy_count.compute()

In [5]:
# No missing values for this feature
get_count(df, "Survived")

891

In [6]:
# Lots of missing values for this feature
get_count(df, "Cabin")

204

In [7]:
"""
This follows a similar approach: isnull() gives a series where True (1) = null
and False (0) = not null, so summing the series gives the number of null values.
"""
def get_count_missing(data, feat_name):
    """Return the number of null entries in column ``feat_name``.

    Args:
        data: Dask dataframe (or any object whose columns expose
            ``isnull()``, with a ``sum()`` that returns something with
            ``compute()``).
        feat_name: Name of the column to inspect.

    Returns:
        The evaluated number of nulls, as produced by ``compute()``.
    """
    # isnull() marks each null entry; summing those marks counts the nulls.
    null_mask = data[feat_name].isnull()
    lazy_total = null_mask.sum()
    return lazy_total.compute()

In [8]:
# Sanity check: Survived had a full count of 891 above, so expect 0 here.
get_count_missing(df, "Survived")

0

In [9]:
# Cabin had only 204 non-null values above, so expect the remainder here.
get_count_missing(df, "Cabin")

687

In [10]:
# Get the number missing and divide by the number of records
def get_percent_missing(data, feat_name):
    """Return the percentage (0-100) of rows whose ``feat_name`` is null.

    Args:
        data: Dataframe supporting ``len()`` and accepted by
            ``get_count_missing``.
        feat_name: Name of the column to inspect.

    Returns:
        ``100 * missing / total_rows`` as a floating-point value.
    """
    # The row count is cast to float so the division is never integer division.
    return 100 * get_count_missing(data, feat_name) / float(len(data))

In [11]:
# Percentage of rows that have no Cabin value.
get_percent_missing(df, "Cabin")

77.104377104377107

In [12]:
# Confirm this matches the expected result
100*687/891

77.10437710437711

In [13]:
# Works unchanged for pandas and Dask: reading a dtype needs no compute()
def feat_is_numeric(data, feat_name):
    """Return True if column ``feat_name`` has a numeric dtype.

    Any integer or floating-point dtype counts as numeric (int32, float32,
    unsigned and pandas nullable types included), not only ``int64`` and
    ``float64``. Boolean columns are reported as non-numeric, as before.

    Args:
        data: pandas or Dask dataframe; only the column's ``dtype`` is read,
            so no computation is triggered.
        feat_name: Name of the column to inspect.

    Returns:
        bool: True for numeric (non-boolean) columns, False otherwise.
    """
    dtype = data[feat_name].dtype
    # is_numeric_dtype() also accepts booleans; exclude them explicitly so
    # bool columns keep being treated as non-numeric.
    return bool(
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )

In [14]:
# Survived holds 0/1 values with no missing entries, so expect True.
feat_is_numeric(df, "Survived")

True

In [15]:
# Embarked holds letter codes (e.g. S, C in the preview), so expect False.
feat_is_numeric(df, "Embarked")

False

In [20]:
def get_average(data, feat_name):
    """Return the mean of column ``feat_name``, or None if it is not numeric.

    Args:
        data: Dask dataframe (the column's ``mean()`` must return something
            with ``compute()``).
        feat_name: Name of the column to average.

    Returns:
        float | None: The evaluated mean as a Python float, or None when
        ``feat_is_numeric`` rejects the column.
    """
    # Guard clause: an average is only defined here for numeric columns.
    if not feat_is_numeric(data, feat_name):
        return None

    lazy_mean = data[feat_name].mean()
    return float(lazy_mean.compute())

In [21]:
# Mean of the 0/1 Survived column -- presumably the fraction of passengers
# who survived (assuming 1 = survived; confirm against the data dictionary).
get_average(df, "Survived")

0.3838383838383838

In [22]:
# Should return None (so the cell shows no output) since Embarked is not numeric
get_average(df, "Embarked")