In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np


import glob
import os

import models

import pickle

from tqdm import tqdm

from census import Census
from us import states

from collections import Counter

%matplotlib inline

DATA_DIR = os.path.join('data')
DATA_DIR

'data'

In [3]:
# Read the raw TEDS CSV from the raw-data folder, parsing every column as int64.
raw_csv_path = os.path.join(DATA_DIR, 'raw', 'tedsa_00_16_puf.csv')
data = pd.read_csv(raw_csv_path, dtype=np.int64)

In [6]:
# Keep only the rows whose CBSA10 code is not the -9 placeholder.
has_cbsa = data.CBSA10 != -9
data = data[has_cbsa]

In [4]:
# Number of distinct CBSA10 codes currently in the frame.
data.CBSA10.nunique()

811

In [8]:
# STFIPS codes to exclude (the notebook's notes list them as the non-contiguous states/territories).
drop_states = [2, 60, 66, 15, 72, 78]
in_dropped_state = data.STFIPS.isin(drop_states)
data = data[~in_dropped_state]

In [9]:
data.shape

(25742826, 62)

# Cleaning Data
- All Data: 31,406,891 data points.

## Cleaning Process
1. Drop all where CBSA10 == -9 (no geographic information, nothing to predict). Remaining data points: 25,940,650

``data = data[data.CBSA10 != -9]``

2. Drop all non-contiguous USA states/territories (STFIPS: 02 Alaska, 60 Samoa, 66 Guam, 15 Hawaii, 72 Puerto Rico, 78 Virgin Islands). Remaining data points: 25,742,826

```
drop_states = [2, 60, 66, 15, 72, 78]
data = data[~data.STFIPS.isin(drop_states)]
```

3. Drop counties that don't occur in all (17) years. Remaining data points: 22,612,190


4. What columns to drop for the patient vector? (CASEID, YEAR)
- Want to keep the county (CBSA10)

5. What columns to drop for the county-mean vector? (CASEID, YEAR, CBSA10, STFIPS)

In [5]:
# Same state filter as the earlier cell; applying it to already-filtered data removes nothing more.
drop_states = [2, 60, 66, 15, 72, 78]
data = data.loc[~data.STFIPS.isin(drop_states)]

# Generate Labels
- Labels are the number of admissions in a county for a given year.
- Labels stored in dictionary, accessed with tuple (CBSA10 code, YEAR)
- NOTE: May want to normalize based on population. 

In [10]:
# Admission counts per CBSA per year, keyed by the tuple (CBSA10, YEAR).

years = sorted(data.YEAR.unique())
cbsa_year_counts = {}

for year in years:
    # value_counts() is indexed by CBSA10 code, with the row count as the value.
    counts = data[data.YEAR == year].CBSA10.value_counts()
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for cbsa, count in counts.items():
        cbsa_year_counts[(cbsa, year)] = count
  

In [11]:

# Count, for every CBSA, how many distinct years it appears in
# (cbsa_year_counts holds one (CBSA, YEAR) key per observed combination).
cbsa_counts = Counter(cbsa for cbsa, _year in cbsa_year_counts)

total_counts = data.CBSA10.value_counts()

# A CBSA is dropped unless it has admissions in every year of the data set.
# len(years) replaces the hard-coded 17 so the check follows the data actually loaded.
n_years = len(years)
drop_cbsas = [cbsa for cbsa, n_seen in cbsa_counts.items() if n_seen != n_years]

In [13]:
# Remove every row belonging to a CBSA listed in drop_cbsas.
in_dropped_cbsa = data.CBSA10.isin(drop_cbsas)
data = data[~in_dropped_cbsa]

In [15]:
# Persist the cleaned frame (without the index column) under the cleaned-data folder.
cleaned_csv_path = os.path.join(DATA_DIR, 'cleaned', 'teds-cleaned.csv')
data.to_csv(cleaned_csv_path, index=False)

In [21]:
# Auxiliary mean features: for each (CBSA, YEAR), the column-wise mean of that
# group's rows, with the identifier columns below removed from the result.
mean_drop_cols = ['CASEID', 'YEAR', 'CBSA10']
cbsa_year_means = {}
for year in years:
    year_df = data[data.YEAR == year]
    # One grouped pass per year instead of re-filtering year_df once per CBSA.
    # sort=False yields the CBSAs in order of first appearance, like unique() did.
    for cbsa, cbsa_df in year_df.groupby('CBSA10', sort=False):
        means = cbsa_df.mean().drop(mean_drop_cols)
        cbsa_year_means[(cbsa, year)] = np.array(means)

### Precompute and Save Admission Counts and Feature Means to Pickle files

In [22]:
# Save the precomputed lookups next to the cleaned CSV. Paths are built from
# DATA_DIR like the other cells so the data location is configured in one place.
with open(os.path.join(DATA_DIR, 'cleaned', 'cbsa-means.pkl'), 'wb') as output:
    pickle.dump(cbsa_year_means, output)

with open(os.path.join(DATA_DIR, 'cleaned', 'admission-counts.pkl'), 'wb') as output:
    pickle.dump(cbsa_year_counts, output)
    

In [49]:
# Reload the per-(CBSA, YEAR) mean features from the pickle written by this notebook.
# The previous target, 'test.pkl', is not produced anywhere in the notebook, so a
# fresh run would either fail or silently load stale data.
with open(os.path.join(DATA_DIR, 'cleaned', 'cbsa-means.pkl'), 'rb') as inp:
    cbsa_year_means = pickle.load(inp)
    
    

# Build Sequence Vectors
- Sequences are constructed as follows: for each patient admission entry, we construct N vectors, each being the admission entry concatenated with the county mean features for one of the N years. A sequence is these N vectors ordered by year.
- Labels are admission counts for the corresponding county in the patient entry.

- Baseline model could just use sequences of county features of each year. 

In [301]:

patient_drop_cols = ['CASEID', 'YEAR']
def feature_vector(pandas_row, year, include_patient=False):
    """Return the feature vector and label for one admission row and one year.

    Parameters
    ----------
    pandas_row : pandas.Series
        A single row of the admissions frame; its ``CBSA10`` value selects the CBSA.
    year :
        Year used, together with the row's CBSA, as the lookup key.
    include_patient : bool, default False
        When True, the row's own values (minus ``patient_drop_cols``) are
        prepended to the CBSA mean features. The default returns the mean
        features alone.

    Returns
    -------
    (feat_vec, label)
        ``feat_vec`` is the entry of ``cbsa_year_means`` for (CBSA10, year),
        optionally preceded by the patient values; ``label`` is the entry of
        ``cbsa_year_counts`` for the same key.

    Raises
    ------
    KeyError
        If (CBSA10, year) is missing from either lookup dictionary.
    """
    # Use the row that was passed in, not whatever global `row` happens to exist.
    cb_year = (pandas_row.CBSA10, year)
    feat_vec = cbsa_year_means[cb_year]

    if include_patient:
        patient_vec = np.array(pandas_row.drop(patient_drop_cols))
        feat_vec = np.concatenate((patient_vec, feat_vec))

    label = cbsa_year_counts[cb_year]
    return feat_vec, label


# Build (sequence, label) pairs from a fixed random sample of admissions.
# random_state pins the sample so re-running the cell reproduces X and Y.
test_df = data.sample(n=5000, random_state=0)
X, Y = [], []
for idx, row in test_df.iterrows():
    # Input sequence: one feature vector per year, for every year except the last.
    seq = [feature_vector(row, year)[0] for year in years[:-1]]
    X.append(np.array(seq))

    # Target: whether the final year's admission count for this row's CBSA exceeds 10.
    final_feats, final_count = feature_vector(row, years[-1])
    Y.append(1 if final_count > 10 else 0)

X = np.array(X)
Y = np.array(Y)
# One-hot encode the integer labels as float32, with max(Y) + 1 classes.
# This is the numpy equivalent of keras.utils.to_categorical(Y); keras is never
# imported in this notebook, so the original call fails on a fresh kernel.
Y = np.eye(Y.max() + 1, dtype='float32')[Y]

In [None]:
# Pull Population data and create dict: CBSA -> Population (Year: 2010)

In [23]:
def fix_fips(state, county):
    """Concatenate a state code and a county code into one string.

    Both arguments are converted with ``str``. The state part is left-padded
    with "0" to at least 2 characters and the county part to at least 3;
    longer values are left unchanged.
    """
    state_part = str(state).rjust(2, "0")
    county_part = str(county).rjust(3, "0")
    return state_part + county_part
        


In [None]:
# Read the CBSA-to-FIPS translation sheet (used for plotting and geolocation).
# NOTE(review): `convert_float` was deprecated in pandas 1.3 and removed in 2.0 —
# confirm the pandas version this notebook is pinned to.
crosswalk_path = os.path.join(DATA_DIR, 'raw', "CBSA-FIPS.xls")
cbsa_fips = pd.read_excel(crosswalk_path, header=2, nrows=1882, convert_float=False, dtype='object')


# CBSA code -> list of combined state+county FIPS strings, restricted to CBSAs present in `data`.
cbsa2fips = {}
all_cbsas = data.CBSA10.unique()
for idx, row in cbsa_fips.iterrows():
    cb = int(row['CBSA Code'])
    if cb not in all_cbsas:
        continue
    fips = fix_fips(row['FIPS State Code'], row['FIPS County Code'])
    # setdefault creates the list on first sight of a CBSA, then appends to it.
    cbsa2fips.setdefault(cb, []).append(fips)
            
            
    

In [None]:
cbsa2fips

In [25]:

# Read the Census API key from the environment rather than embedding the secret
# in the notebook, where it is exposed to anyone who can read the file.
# NOTE(review): the key that was previously hard-coded here should be rotated.
census_api_key = os.environ.get("CENSUS_API_KEY")
if not census_api_key:
    raise RuntimeError("Set the CENSUS_API_KEY environment variable before running this cell")

c = Census(census_api_key, year=2010)
# Request variable P001001 from the SF1 data set for all states and all counties.
pop_results = c.sf1.state_county('P001001', Census.ALL, Census.ALL)

In [26]:
# County population keyed by the concatenated state+county code from each result record.
fips_pops = {rec['state'] + rec['county']: int(rec['P001001']) for rec in pop_results}

# CBSA population is the sum over the counties mapped to that CBSA.
cbsa_pops = {
    cb: sum(fips_pops[county_fips] for county_fips in county_list)
    for cb, county_list in cbsa2fips.items()
}

# Two-column table (CBSA, Pop), written without the index.
pops = pd.DataFrame({'CBSA': list(cbsa_pops.keys()), 'Pop': list(cbsa_pops.values())})
pops.to_csv('data/cleaned/cbsa-populations-2010.csv', index=False)

In [None]:
# Parse the adjacency file into `neighbors`: a list of unique, unordered pairs
# with self-pairs excluded. Each line is decoded as UTF-8 (undecodable bytes are
# replaced), stripped of surrounding whitespace and split on tabs.
with open('data/raw/county_adjacency.txt', 'rb') as f:
    lines = [line.decode('utf-8', errors='replace').strip().split('\t') for line in f]

neighbors = []
# Both orientations of every accepted pair, so the duplicate check is a set
# lookup instead of a scan of the growing `neighbors` list.
seen_pairs = set()
current = None
for line in lines:
    if len(line) == 4:
        # Four fields: a new "current" entry (2nd field) plus a neighbour (4th field).
        current = line[1]
        pair = (current, line[3])
    elif len(line) == 2:
        # Two fields: another neighbour (2nd field) of the current entry.
        pair = (current, line[1])
    elif len(line) == 3:
        # Three fields: the first field becomes the current entry, the third is the neighbour.
        # NOTE(review): presumably a row with a missing column — confirm against the file.
        current = line[0]
        pair = (current, line[2])
    else:
        # Unexpected field count (e.g. a blank line): there is no pair to record.
        continue

    if pair[0] == pair[1] or pair in seen_pairs:
        continue

    seen_pairs.add(pair)
    seen_pairs.add((pair[1], pair[0]))
    neighbors.append(pair)