In [None]:
import pandas as pd
# Load the Indian liver patient data set from its hosted CSV into a DataFrame.
liver = pd.read_csv('https://hds5210-data.s3.amazonaws.com/indian_liver_patient.csv')

In [None]:
# Preview the first rows of the loaded data.
liver.head()

## Description

This is a data set about people from India who were in a liver disease project.  There are several pieces of information about the individuals as well as a flag in the last column called `Dataset`.  In this column a `2` means the individual did **not** have liver disease.  A `1` indicates that the individual **did** have liver disease.

We want to run some basic statistics on Age, Gender, and Total Bilirubin, broken down by whether or not the individual had liver disease.

The first step is to group by the Dataset column


In [None]:
# Group the rows by the Dataset flag (1 = liver disease, 2 = no liver disease,
# per the description above).
# NOTE: this rebinds `liver` from the DataFrame to the GroupBy object, so the
# ungrouped data is no longer reachable under this name. Re-run the load cell
# before executing this cell a second time.
liver = liver.groupby('Dataset')

In [None]:
# Show the type of `liver` now that it holds the result of groupby.
type(liver)

Then we can calculate some metrics on a few columns...

In [None]:
# Mean Age within each Dataset group.
liver["Age"].mean()

In [None]:
# Several summary statistics of Age per group, requested by name in one call.
liver["Age"].agg(['min','max','mean','median','std','count'])

In [None]:
# Histogram of Age for each Dataset group.
liver["Age"].hist()

Let's look at Bilirubin measure, too...

In [None]:
# Summary statistics of Total_Bilirubin per group.
liver["Total_Bilirubin"].agg(['min','max','mean','std','count'])

In [None]:
# A dict passed to agg selects a different set of statistics for each column.
liver.agg({'Total_Bilirubin': ['min','max','mean','std'], 'Direct_Bilirubin': ['mean','std']})

In [None]:
# Histogram of Total_Bilirubin for each Dataset group.
liver["Total_Bilirubin"].hist()

# Custom Aggregation Function

It can also be handy to be able to create custom aggregation functions.  For our example, we're going to create a `percent_male` function that returns the percent of items in each group that are male.

In [None]:
def percent_male(series):
    """Return the fraction of entries in *series* that equal ``'Male'``.

    Despite the name, the result is a fraction between 0 and 1; it is not
    scaled to 100.  Intended for use as a custom aggregation, e.g.
    ``grouped.agg({'Gender': percent_male})``.

    Parameters
    ----------
    series : pandas.Series
        Values to inspect.  Entries that are not exactly ``'Male'``
        (including missing values) count toward the total only.

    Returns
    -------
    float
        Number of ``'Male'`` entries divided by ``len(series)``, or NaN when
        *series* is empty.
    """
    total = len(series)
    if total == 0:
        # An empty group has no meaningful ratio; avoid ZeroDivisionError.
        return float('nan')
    # Vectorised count instead of iterating the Series with the builtin sum().
    male = int((series == 'Male').sum())
    return male / total

In [None]:
# Use the custom percent_male function as the aggregation for the Gender column.
liver.agg({'Gender': percent_male})

## Custom Aggregation with `apply`

In [None]:
def percent_val(df, column='Gender', value='Male'):
    """Return the fraction of rows in *df* whose *column* equals *value*.

    The column and *value* are both converted to strings before comparing,
    so the match is on their string representations.  The result is a
    fraction between 0 and 1; it is not scaled to 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to inspect (for example one group passed in by ``groupby.apply``).
    column : str, default 'Gender'
        Name of the column to test.
    value : object, default 'Male'
        Value to look for; compared via ``str(value)``.

    Returns
    -------
    pandas.Series
        A single-valued Series labelled ``'<value>%'`` holding the fraction,
        or NaN when *df* has no rows.
    """
    total = len(df)
    # Compare string representations so text and numeric values match alike.
    matches = df[column].astype(str) == str(value)
    # Vectorised count instead of iterating the Series with the builtin sum().
    part = int(matches.sum())
    # An empty frame has no meaningful ratio; report NaN instead of raising.
    fraction = part / total if total else float('nan')

    # Return this as a single-valued Series labelled with the value tested.
    return pd.Series([fraction], index=[str(value) + '%'])

In [None]:
# With no extra arguments percent_val uses its defaults (column='Gender', value='Male').
liver.apply(percent_val)

In [None]:
# Keyword arguments given to apply are passed on to percent_val.
liver.apply(percent_val, column='Gender', value='Female')

In [None]:
# The same function works on a numeric column: fraction of each group with Age 62.
liver.apply(percent_val, column='Age', value=62)

In [None]:
def percent_range(df, column='Age', min_value=-1, max_value=999):
    """Count and fraction of rows whose *column* lies in an inclusive range.

    The column and both bounds are converted to float before comparing, and
    both ends of the range are inclusive.  The fraction is between 0 and 1;
    it is not scaled to 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to inspect (for example one group passed in by ``groupby.apply``).
    column : str, default 'Age'
        Name of the column to test; its values must be convertible to float.
    min_value : number, default -1
        Lower bound (inclusive).
    max_value : number, default 999
        Upper bound (inclusive).

    Returns
    -------
    pandas.Series
        Two entries: ``'<min> to <max>'`` holds the number of rows in range
        and ``'<min> to <max>%'`` holds that count divided by ``len(df)``
        (NaN when *df* has no rows).
    """
    total = len(df)
    # Convert the column once rather than once per bound.
    values = df[column].astype(float)
    # Series.between is inclusive on both ends by default, matching >= / <=.
    in_range = values.between(float(min_value), float(max_value))
    part = int(in_range.sum())
    # An empty frame has no meaningful ratio; report NaN instead of raising.
    fraction = part / total if total else float('nan')

    # Return the count and the fraction as a two-valued Series.
    label = str(min_value) + ' to ' + str(max_value)
    return pd.Series([part, fraction], index=[label, label + '%'])

In [None]:
# Count and fraction of each group with Age between 0 and 18 (inclusive).
liver.apply(percent_range, column='Age', min_value=0, max_value=18)

In [None]:
# Count and fraction of each group with Total_Bilirubin between 0 and 10 (inclusive).
liver.apply(percent_range, column='Total_Bilirubin', min_value=0, max_value=10)