In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(2340959)

import seaborn as sns
sns.set_style('whitegrid')

In [2]:
file = "../../FoodEnvironmentAtlas.xls"
data = pd.ExcelFile(file)
sheet_dict = {sheet_name: data.parse(sheet_name) for sheet_name in data.sheet_names}

In [3]:
county_census_data = pd.read_csv('county_census_data.csv')
closest_neighboring_counties = pd.read_csv('closest_neighboring_counties.csv')
county_centroid_distances = pd.read_csv('county_centroid_distances.csv')

In [60]:
county_centroid_distances

Unnamed: 0,FIPS,County,DISTANCE_TO_1001,DISTANCE_TO_1003,DISTANCE_TO_1005,DISTANCE_TO_1007,DISTANCE_TO_1009,DISTANCE_TO_1011,DISTANCE_TO_1013,DISTANCE_TO_1015,...,DISTANCE_TO_56027,DISTANCE_TO_56029,DISTANCE_TO_56031,DISTANCE_TO_56033,DISTANCE_TO_56035,DISTANCE_TO_56037,DISTANCE_TO_56039,DISTANCE_TO_56041,DISTANCE_TO_56043,DISTANCE_TO_56045
0,1001,Autauga,0.0,13160.0,6502.0,3034.0,9174.0,4385.0,5002.0,7193.0,...,13946.0,8587.0,18423.0,5061.0,16959.0,12418.0,8580.0,13509.0,12210.0,9127.0
1,1003,Baldwin,13160.0,0.0,15470.0,14979.0,14176.0,14802.0,9324.0,12002.0,...,7974.0,18281.0,5308.0,10042.0,7429.0,3826.0,13523.0,4680.0,7274.0,11442.0
2,1005,Barbour,6502.0,15470.0,0.0,7083.0,10377.0,2220.0,7449.0,11748.0,...,8839.0,5295.0,14201.0,11488.0,11011.0,11715.0,2488.0,19877.0,17230.0,4239.0
3,1007,Bibb,3034.0,14979.0,7083.0,0.0,6154.0,5670.0,8036.0,4954.0,...,15877.0,6592.0,18119.0,5033.0,17611.0,15409.0,9561.0,12859.0,10643.0,10904.0
4,1009,Blount,9174.0,14176.0,10377.0,6154.0,0.0,10518.0,14138.0,3855.0,...,15234.0,5581.0,12070.0,7859.0,12995.0,17852.0,12170.0,9502.0,6908.0,14294.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3024,56037,Sweetwater,12418.0,3826.0,11715.0,15409.0,17852.0,11653.0,7452.0,15429.0,...,5016.0,15973.0,6169.0,12137.0,6185.0,0.0,9706.0,8431.0,11071.0,7615.0
3025,56039,Teton,8580.0,13523.0,2488.0,9561.0,12170.0,4223.0,7797.0,14194.0,...,6350.0,6618.0,11840.0,13638.0,8661.0,9706.0,0.0,17606.0,18583.0,2124.0
3026,56041,Uinta,13509.0,4680.0,19877.0,12859.0,9502.0,17825.0,12669.0,8166.0,...,11258.0,14585.0,5853.0,8506.0,9045.0,8431.0,17606.0,0.0,2657.0,15893.0
3027,56043,Washakie,12210.0,7274.0,17230.0,10643.0,6908.0,16297.0,13628.0,5725.0,...,13485.0,12293.0,7731.0,7279.0,10776.0,11071.0,18583.0,2657.0,0.0,18465.0


In [4]:
MAX_NEIGHBORS = len(closest_neighboring_counties.columns) - 2

In [5]:
access = sheet_dict['ACCESS']
socioeconomic = sheet_dict['SOCIOECONOMIC']

In [6]:
access = access.merge(right = county_census_data.copy(), how='inner')
access = access.merge(right = closest_neighboring_counties.copy(), how='inner')
access = access.merge(right = county_centroid_distances.copy(), how='inner')

In [7]:
access['METRO13'] = socioeconomic['METRO13'].copy()

In [8]:
access = access.dropna()

In [9]:
# ~ 1/3 of counties are metro
access_metro = access.loc[access['METRO13'] == 1].copy()
access_nonmetro = access.loc[access['METRO13'] == 0].copy()
len(access_metro)/(len(access_metro) + len(access_nonmetro))

0.37405084186200066

In [10]:
# Train test split with test size of 20%, the closest neighbors (max 20) are set as training data
training_rows_per_test_row = 4

In [67]:
def remove_county_and_neighbors(df, remaining_indices, train_indices, test_indices):
    # Move next index to test set
    i = remaining_indices[0]
    county = df.iloc[i]
    test_indices.append(i)
    remaining_indices = remaining_indices[1:]
    
    # Move indices of closest neighboring counties to training set
    a = 0
    j = 0
    while a < training_rows_per_test_row and j < MAX_NEIGHBORS:
        neighbor_fips = county['NEAREST_' + str(j + 1)]
        neighbor = df.loc[df['FIPS'] == neighbor_fips]
        if len(neighbor.index > 0):
            ind = neighbor.index[0]
        else:
            ind = -1
        if ind in remaining_indices:
            train_indices.append(ind)
            remaining_indices.remove(ind)
            a += 1
        j += 1
    while a < training_rows_per_test_row and len(remaining_indices) > 0:
        train_indices.append(remaining_indices[0])
        remaining_indices = remaining_indices[1:]
        a += 1
    return remaining_indices, train_indices, test_indices

In [68]:
def distance_preserving_train_test_split(df, slice):
    remaining_indices = list(slice.index)
    remaining_indices = list(np.random.permutation(remaining_indices))
    train_indices = []
    test_indices = []
    
    while len(remaining_indices) > 0:
        remaining_indices, train_indices, test_indices = remove_county_and_neighbors(df, remaining_indices, train_indices, test_indices)
    df_train = df.iloc[train_indices]
    df_test = df.iloc[test_indices]
    return df_train, df_test

In [69]:
metro_train, metro_test = distance_preserving_train_test_split(df=access, slice=access_metro)

In [48]:
nonmetro_train, nonmetro_test = distance_preserving_train_test_split(df=access, slice=access_nonmetro)

In [52]:
access_train = pd.concat([metro_train, nonmetro_train])
access_test = pd.concat([metro_test, nonmetro_test])

In [54]:
# access_train.to_csv('sample_training_set_ACCESS.csv',index=False)
# access_test.to_csv('sample_test_set_ACCESS.csv',index=False)