# Exploring Census Data

In [None]:
from osgeo import ogr, osr
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np

In [None]:
# Load the business/neighbor attribute table and discard its 'Unnamed: 0'
# column (presumably a row index written out by an earlier to_csv call).
biz_att_df = pd.read_csv('./data/business_neighbors_atts.csv').drop('Unnamed: 0', axis=1)

# Extract Counties and Zip Codes

In [None]:
# The points are labelled as EPSG:4326, built in (longitude, latitude) order.
crs = {'init': 'epsg:4326'}
geometry = [
    Point(lon, lat)
    for lon, lat in zip(biz_att_df['longitude'], biz_att_df['latitude'])
]
# One point geometry per business row, attached to the attribute table.
geo_df = gpd.GeoDataFrame(biz_att_df, crs=crs, geometry=geometry)

In [None]:
# Attach the county code ('fipscty') to every business by spatially joining
# its point against the county polygons.
counties = gpd.read_file('data/counties/tl_2017_us_county.shp')
# Assigning `.crs` only relabels a layer; it never moves coordinates. When the
# file declares its own CRS, reproject it into the CRS of the business points.
# Fall back to plain labelling only when the file carries no CRS at all.
if counties.crs:
    counties = counties.to_crs(crs)
else:
    counties.crs = crs
counties.columns = [x.lower() for x in counties.columns]
# Use the same column name as the census tables.
counties.rename(columns={'countyfp':'fipscty'}, inplace=True)
# NOTE(review): only the county code is carried over, not a state code.
# Presumably county codes repeat across states -- confirm that downstream
# joins on 'fipscty' alone are safe for this data.
# Inner join: businesses that fall inside no county polygon are dropped.
geo_df = gpd.sjoin(geo_df, counties[['fipscty', 'geometry']], op='intersects', how='inner')
geo_df['fipscty'] = geo_df['fipscty'].apply(int)
# sjoin adds 'index_right'; remove it so a later sjoin can add its own.
geo_df.drop('index_right', axis=1, inplace=True)

In [None]:
# Attach the zip code and 'pop_sqmi' to every business via a spatial join
# against the Nevada zip-code polygons.
zipcodes = gpd.read_file('data/zipcodes/v104/zip_poly.gdb')
zipcodes = zipcodes.loc[zipcodes['STATE']=='NV']
# Assigning `.crs` only relabels a layer; it never moves coordinates. When the
# file declares its own CRS, reproject it into the CRS of the business points.
# Fall back to plain labelling only when the file carries no CRS at all.
if zipcodes.crs:
    zipcodes = zipcodes.to_crs(crs)
else:
    zipcodes = zipcodes.copy()
    zipcodes.crs = crs
zipcodes.columns = [x.lower() for x in zipcodes.columns]
zipcodes.rename(columns={'zip_code':'zipcode'}, inplace=True)
# Inner join: businesses outside every NV zip polygon are dropped. The
# 'index_right' column added here stays on geo_df after this cell.
geo_df = gpd.sjoin(geo_df, zipcodes[['zipcode', 'geometry', 'pop_sqmi']], op='intersects',how='inner')
geo_df['zipcode'] = geo_df['zipcode'].apply(int)


# Load Census Data

In [None]:
def add_county_data(df, year):
    """Merge one year of county-level census figures onto ``df``.

    Reads ``data/census/cbp{year}co.csv``, keeps the rows whose ``naics`` is
    ``'722///'`` (restaurants, see
    https://www.naics.com/six-digit-naics/?code=72), prefixes every kept
    column except the join key with ``cn_y{year}_`` and inner-merges the
    result onto ``df`` by ``fipscty``.

    Args:
        df: frame with a ``fipscty`` column.
        year: value substituted into the file name and the column prefix
            (the notebook passes two-digit strings such as ``'05'``).

    Returns:
        A new frame. Rows of ``df`` without a matching ``fipscty`` are
        dropped because the merge is an inner join.
    """
    # NOTE(review): the list jumps from n100_249 to n500_999; confirm that
    # leaving out n250_499 is intentional.
    kept = ['fipscty', 'emp', 'qp1', 'ap', 'est', 'n1_4', 'n5_9',
            'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n500_999', 'n1000']
    raw = pd.read_csv('data/census/cbp{}co.csv'.format(year))
    restaurants = raw.loc[raw['naics'] == '722///', kept]
    prefix = 'cn_y{}_'.format(year)
    restaurants = restaurants.rename(
        columns={name: prefix + name for name in kept if name != 'fipscty'}
    )
    # NOTE(review): the join key is the county code alone. If the census file
    # spans several states, matching codes from other states would presumably
    # join too -- verify the input.
    return df.merge(restaurants, on='fipscty')

def add_zip_data(df, year):
    """Merge one year of zip-code-level census figures onto ``df``.

    Reads ``data/census/zbp{year}detail.csv``, keeps the rows whose ``naics``
    is ``'72----'`` (all of hospitality for now, not restaurants only; codes:
    https://www.naics.com/six-digit-naics/?code=72), prefixes the kept
    columns with ``cn_y{year}_zip_``, renames ``zip`` to ``zipcode`` and
    left-merges the result onto ``df``.

    Args:
        df: frame with a ``zipcode`` column.
        year: value substituted into the file name and the column prefix
            (the notebook passes two-digit strings such as ``'05'``).

    Returns:
        A new frame. Not every business has a matching zip in the census
        file; those rows are kept, with missing values in the added columns.
    """
    # NOTE(review): the list jumps from n100_249 to n500_999; confirm that
    # leaving out n250_499 is intentional.
    kept = ['zip', 'est', 'n1_4', 'n5_9',
            'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n500_999', 'n1000']
    detail = pd.read_csv('data/census/zbp{}detail.csv'.format(year))
    hospitality = detail.loc[detail['naics'] == '72----', kept]
    prefix = 'cn_y{}_zip_'.format(year)
    new_names = {
        name: 'zipcode' if name == 'zip' else prefix + name
        for name in kept
    }
    hospitality = hospitality.rename(columns=new_names)
    return df.merge(hospitality, on='zipcode', how='left')



In [None]:
# Two-digit year suffixes '04' .. '16', used in the census file names and
# in the generated column prefixes.
year_list = ['{:02d}'.format(y) for y in range(4, 17)]
# For each year, attach the county-level columns first, then the zip-level ones.
for y in year_list:
    geo_df = add_zip_data(add_county_data(geo_df, y), y)

In [None]:
# Zip codes that occur in geo_df but are absent from the census zip file.
# 89158 holds 62 businesses and 89161 holds 2; neither has any census data.
# Those values will have to be interpolated, or the rows left out.
zip_df = pd.read_csv('data/census/zbp05detail.csv')
seen_zips = geo_df['zipcode'].unique()
seen_zips[~np.isin(seen_zips, zip_df['zip'].unique())]

# Fill missing zip data

In [None]:
# Simple mean imputation: every column whose name starts with 'cn' has its
# missing values replaced by that column's own mean.
census_cols = geo_df.columns.str.startswith('cn')
census_block = geo_df.loc[:, census_cols]
geo_df.loc[:, census_cols] = census_block.fillna(census_block.mean())

# Year-over-year diffs

In [None]:
def add_year_diffs(df, year):
    """Add percentage-change columns comparing ``year`` with the year before.

    For every census suffix a column ``cn_y{year}_diff_{suffix}`` is written
    holding ``100 * (current - previous) / previous``. Where the previous
    value is 0, the divisor is replaced by 1 to avoid dividing by zero.

    Args:
        df: frame holding ``cn_y{year}_*`` and ``cn_y{year - 1}_*`` columns;
            it is modified in place.
        year: int or string; zero-padded to two characters.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    suffixes = (
        'emp', 'qp1', 'ap', 'est', 'n1_4', 'n5_9',
        'n10_19', 'n20_49', 'n50_99', 'n100_249',
        'n500_999', 'n1000', 'zip_est', 'zip_n1_4',
        'zip_n5_9', 'zip_n10_19', 'zip_n20_49', 'zip_n50_99',
        'zip_n100_249', 'zip_n500_999', 'zip_n1000',
    )
    year = str(year).zfill(2)
    prev_year = str(int(year) - 1).zfill(2)
    for suffix in suffixes:
        current = df['cn_y{}_{}'.format(year, suffix)]
        previous = df['cn_y{}_{}'.format(prev_year, suffix)]
        # Only the divisor is adjusted; the numerator uses the raw values.
        divisor = previous.where(previous != 0, 1)
        df['cn_y{}_diff_{}'.format(year, suffix)] = 100 * ((current - previous) / divisor)

    return df

In [None]:
# Add the year-over-year percentage-change columns for every year that has a
# predecessor in year_list; the first year is skipped.
for y in year_list[1:]:
    geo_df = add_year_diffs(geo_df, y)

# Extract year specific data

In [None]:
# Placeholder data for testing: one random opening year per row, drawn from
# 2005..2015 (the upper bound 2016 is exclusive). No seed is set, so the
# values differ between runs.
geo_df['open_year'] = np.random.randint(low=2005, high=2016, size=geo_df.shape[0])

In [None]:
def extract_open_year_data(df):
    """Copy each row's opening-year census values into ``cn_opyear_*`` columns.

    For every census suffix, the value is read from the column
    ``cn_y{YY}_{suffix}``, where ``YY`` is the two-digit form of the row's
    ``open_year``, and stored in ``cn_opyear_{suffix}``.

    Args:
        df: frame with an integer-like ``open_year`` column and
            ``cn_y{YY}_{suffix}`` columns for every opening year present.
            It is modified in place.

    Returns:
        The same ``df`` object, with ``open_year_str`` and the
        ``cn_opyear_*`` columns added.

    Raises:
        KeyError: if a row's opening year has no matching ``cn_y{YY}_*``
            column.
    """
    census_cols_suffixes = ['emp', 'qp1', 'ap', 'est', 'n1_4', 'n5_9',
                            'n10_19', 'n20_49', 'n50_99', 'n100_249',
                            'n500_999', 'n1000', 'zip_est', 'zip_n1_4',
                            'zip_n5_9', 'zip_n10_19', 'zip_n20_49', 'zip_n50_99',
                            'zip_n100_249', 'zip_n500_999', 'zip_n1000']
    # Derive the two-digit year arithmetically. Stripping the text '20' would
    # also remove the trailing '20' of 2020 and leave an empty suffix.
    df['open_year_str'] = (df['open_year'].astype(int) % 100).astype(str).str.zfill(2)
    for col in census_cols_suffixes:
        # Row-wise lookup, because the source column differs from row to row.
        df['cn_opyear_{}'.format(col)] = [
            row['cn_y{}_{}'.format(row['open_year_str'], col)]
            for _, row in df.iterrows()
        ]

    return df

In [None]:
# extract_open_year_data adds its columns to geo_df in place and returns the
# same object, so final_df and geo_df refer to one frame from here on.
final_df = extract_open_year_data(geo_df)

# Cut down and save

In [None]:
# Drop every per-year census column (anything starting with 'cn_y'); the
# 'cn_opyear_*' columns do not match that prefix and are kept.
yearly_cols = final_df.columns[final_df.columns.str.startswith('cn_y')]
final_df.drop(yearly_cols, axis=1, inplace=True)
# Remove the spatial-join leftover and the geometry before writing a plain CSV.
final_df.drop(['index_right', 'geometry'], axis=1, inplace=True)
final_df.to_csv('./data/business_neighbors_census_atts.csv')