# Chapter 2 - Exploratory Data Analysis
## Worked examples

In [31]:
import os
import pandas as pd
import numpy as np
from scipy.stats import trim_mean

In [32]:
# Locate the project root (two levels above the current working directory)
# and the data folder that lives directly beneath it.
_cwd = os.getcwd()
ROOT_DIR = os.path.join(_cwd, '../..')
DATA_DIR = os.path.join(_cwd, '../..', 'data')

In [33]:
# Read the state-level dataset from the data folder into a DataFrame.
state_csv_path = os.path.join(DATA_DIR, 'state.csv')
df = pd.read_csv(state_csv_path)

## Example: Location estimates of Population and Murder Rates

### Take the mean of a column

In [34]:
# Arithmetic mean of the 'Population' column.
population_mean = df['Population'].mean()
print('Population mean = {}'.format(population_mean))

Population mean = 6162876.3


### Take the trimmed mean
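Note that this is not the same as winsorizing: trimming discards the most extreme values, whereas winsorizing replaces them with the nearest remaining value.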

In [35]:
# Trimmed mean of 'Population' with proportiontocut=0.1, i.e. a 10% cut
# taken from each end of the sorted values before averaging.
trimmed_population_mean = trim_mean(df['Population'], proportiontocut=0.1)
print('Population trimmed mean = {}'.format(trimmed_population_mean))

Population trimmed mean = 4783697.125


### Take the median of a column

In [36]:
# Median of the 'Population' column.
population_median = df['Population'].median()
print('Population median = {}'.format(population_median))

Population median = 4436369.5


### Unweighted mean

In [37]:
# Plain (unweighted) mean of the 'Murder.Rate' column: every row counts equally.
unweighted_murder_rate = df['Murder.Rate'].mean()
print('Unweighted mean = {}'.format(unweighted_murder_rate))

Unweighted mean = 4.066


### Weighted mean
This will be a little bit more involved since there are no vanilla (pandas) weighted mean functions. However, this is no issue as we can create a short function that does this for us.

It is worth remembering the main motivations for taking a weighted mean:

- Some values are intrinsically more variable than others, and highly variable observations are given a lower weight. (For example, when averaging readings from multiple sensors, an inaccurate sensor is downweighted.)
- The data collected does not equally represent the different groups that we are interested in measuring. (underrepresented groups get a higher weight, in relationship to the overall population)

In [39]:
def wmean(group_col, mean_col, weight_col, data=None):
    """
    Return the weighted mean of the per-group means of ``mean_col``.

    Rows are grouped by ``group_col``. Each group's weight is its share of
    the total of ``weight_col``; each group's value is the plain mean of
    ``mean_col`` within the group. The result is the sum of weight * value,
    divided by the sum of the weights that actually took part.

    Parameters
    ----------
    group_col : label (or list of labels)
        Column(s) to group the rows by.
    mean_col : label
        Column whose per-group mean is being averaged.
    weight_col : label
        Column whose per-group total provides the weights.
    data : pandas.DataFrame, optional
        Frame to operate on. When omitted, the module-level ``df`` is used.

    Returns
    -------
    float
        The weighted mean; NaN when no group has both a usable weight and
        a usable mean.
    """
    if data is None:
        # Look the default frame up at call time rather than binding it in
        # the signature, so a reloaded `df` is honoured and the function can
        # be defined before `df` exists.
        data = df

    grouped = data.groupby(group_col)

    # calculate weights: each group's share of the overall weight total
    weights = grouped[weight_col].sum() / data[weight_col].sum()

    # calculate means
    means = grouped[mean_col].mean()

    # Both Series are indexed by group, so the multiplication pairs
    # weight_i with mean_i.
    weighted = weights * means

    # A group without a usable mean must not contribute at all. Multiplying
    # via DataFrame.prod(axis=1) would skip the NaN and silently count such
    # a group as if its mean were 1.
    usable = weighted.notna()

    # divide by the sum of the participating weights so they add up to 1
    return weighted[usable].sum() / weights[usable].sum()


# Murder rate averaged across states, each state weighted by its population.
weighted_murder_rate = wmean(
    group_col='State',
    mean_col='Murder.Rate',
    weight_col='Population',
    data=df,
)
print('Weighted mean = {}'.format(weighted_murder_rate))

Weighted mean = 4.445833981123392
