In [None]:
from collections import OrderedDict
from pathlib import Path
from typing import Tuple

import bayespy as bp
import numpy as np
import pandas as pd


# Default location of the attribute-spec file read by read_attr_spec().
ATTR_SPEC_PATH = 'data/attr.txt'


def parse_attr_spec_line(line: str) -> Tuple[str, str]:
    """Parse one attribute-spec line of the form ``<prefix>--<name>: <description>``.

    The text before the first ':' must contain exactly one '--' marker; the
    attribute name is whatever follows that marker. Everything after the first
    ':' is the description (it may itself contain ':' or '--').

    Args:
        line: a single raw line from the attribute-spec file.

    Returns:
        A ``(name, description)`` pair, both stripped of surrounding whitespace.

    Raises:
        ValueError: if the line has no ':' separator, or the part before it
            does not contain exactly one '--' marker.
    """
    key, colon, desc = line.partition(':')
    # Same accept/reject rule as a strict two-way unpack of key.split('--'),
    # but failing with a message that shows the offending line.
    if not colon or key.count('--') != 1:
        raise ValueError(
            "Malformed attribute spec line (expected '-- name: description'): {!r}".format(line)
        )
    _, name = key.split('--')
    return name.strip(), desc.strip()


def read_attr_spec(path=ATTR_SPEC_PATH) -> OrderedDict:
    """Load the attribute spec file into an ordered name -> description mapping.

    Only lines containing '--' are treated as attribute entries; each one is
    handed to parse_attr_spec_line(). Entries keep the order in which they
    appear in the file.

    Args:
        path: location of the spec file (str or Path).

    Returns:
        OrderedDict mapping attribute name to its description.
    """
    spec = OrderedDict()
    with Path(path).open() as spec_file:
        for raw_line in spec_file:
            if '--' not in raw_line:
                continue
            attr_name, attr_desc = parse_attr_spec_line(raw_line)
            spec[attr_name] = attr_desc
    return spec


def quantify(data: pd.Series) -> pd.Series:
    """Discretize a numeric series into five ordered levels around its mean.

    Bin edges sit at mean - 1.5*std, mean - 0.5*std, mean + 0.5*std and
    mean + 1.5*std; the resulting intervals (right-closed, the ``pd.cut``
    default) are labelled -2, -1, 0, 1, 2 from lowest to highest.

    Args:
        data: numeric series to discretize.

    Returns:
        An ordered categorical series with the same index as ``data`` and
        categories [-2, -1, 0, 1, 2]. Missing inputs stay missing.
    """
    labels = [-2, -1, 0, 1, 2]
    std = data.std()
    mean = data.mean()
    if not std > 0:
        # Zero or undefined spread (constant, single-element, empty or all-NaN
        # series): the edges below would coincide or be NaN and pd.cut would
        # raise. Every present value equals the mean, so it belongs to the
        # central level; code -1 marks a missing value.
        codes = np.where(data.isnull(), -1, labels.index(0))
        central = pd.Categorical.from_codes(codes, categories=labels, ordered=True)
        return pd.Series(central, index=data.index, name=data.name)
    bins = np.array([-np.inf, -1.5 * std, -0.5 * std, 0.5 * std, 1.5 * std, np.inf]) + mean
    return pd.cut(data, bins, labels=labels)

In [None]:
# Load all data: read the raw table (the file has no header row) and label its
# columns with the attribute names from the spec file, in spec order.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv is the supported replacement.
# NOTE(review): na_values='?' assumes the file marks missing entries with '?'
# (as the UCI Communities and Crime data does) - confirm against the data file.
original_data = pd.read_csv('data/communities.data', header=None, index_col=None, na_values='?')
attr_spec = read_attr_spec()
original_data.columns = list(attr_spec.keys())

In [None]:
# Restrict the table to communities located in the states of interest.
# Keys are state names; values are the numeric codes matched against the
# 'state' column.
# NOTE(review): the codes look like FIPS state codes - confirm against the
# dataset documentation.
STATES = {
    'Indiana': 18,
    'Ohio': 39,
    'West Virginia': 54,
    'Virginia': 51,
    'Kentucky': 21,
    'Tennessee': 47,
}
data = original_data.loc[original_data['state'].isin(list(STATES.values()))]

In [None]:
# Per-column mean and standard deviation of the filtered rows.
# numeric_only=True makes the exclusion of non-numeric columns explicit:
# older pandas dropped them silently, while pandas 2.x raises TypeError
# when the argument is left at its default.
# NOTE(review): this matters only if the frame has non-numeric columns
# (e.g. a community-name column) - confirm against the attribute spec.
means = data.mean(numeric_only=True)
stds = data.std(numeric_only=True)

# Attributes kept for the analysis.
selected_attrs = [
    'LandArea',
    'numbUrban',
    'medIncome',
    'agePct12t21',
    'NumUnderPov',
    'PctUnemployed',
    'ViolentCrimesPerPop'
]
ex_data = data[selected_attrs]

In [None]:
# Discretize every selected attribute column with quantify().
# Built from `data[selected_attrs]` rather than from `ex_data` itself so that
# re-running this cell gives the same result instead of re-binning values that
# were already binned.
ex_data = data[selected_attrs].apply(quantify)

In [None]:
# Show the table produced by applying quantify() to the selected attributes.
ex_data