In [None]:
from os import path
import geopandas as gpd
import matplotlib
import pandas as pd


# Force matplotlib to plot from notebook
%matplotlib inline
# Increase default plot size
matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)

# Read in GeoJSON data
neighborhoods = gpd.read_file(path.join('raw', 'CPD_Neighborhoods.json'))

# Drop extraneous columns
neighborhoods = neighborhoods.drop([
    'Beat', 
    'District', 
    'Area_SQMI', 
    'Shape_Area', 
    'Shape_Leng', 
    'FID', 
    'Pop_2010'
], axis=1)

# Rename columns
neighborhoods = neighborhoods.rename(columns={
    'NHOOD': 'neighborhood_raw',
})

In [None]:
# Lookup table translating the raw neighborhood labels found in the shape data
# (`neighborhood_raw`) into the cleaned display names (`neighborhood`).
# One (raw, cleaned) pair per row.
neighborhood_name_mappings = pd.DataFrame(
    [
        ('AVONDALE', 'Avondale'),
        ('BONDHILL', 'Bond Hill'),
        ('C.B.D./RIVERFRONT', 'Downtown'),
        ('CALIFORNIA', 'California'),
        ('CAMP WASHINGTON', 'Camp Washington'),
        ('CARTHAGE', 'Carthage'),
        ('CLIFTON', 'Clifton'),
        ('COLLEGE HILL', 'College Hill'),
        ('COLUMBIA TUSCULUM', 'Columbia Tusculum'),
        ('CORRYVILLE', 'Corryville'),
        ('CUF', 'CUF'),
        ('EAST END', 'East End'),
        ('EAST PRICE HILL', 'East Price Hill'),
        ('EAST WALNUT HILLS', 'East Walnut Hills'),
        ('EAST WESTWOOD', 'East Westwood'),
        ('ENGLISH WOODS', 'English Woods'),
        ('EVANSTON', 'Evanston'),
        ('FAY APARTMENTS', 'Villages at Roll Hill'),
        ('HARTWELL', 'Hartwell'),
        ('HYDE PARK', 'Hyde Park'),
        ('KENNEDY HEIGHTS', 'Kennedy Heights'),
        ('LINWOOD', 'Linwood'),
        ('LOWER PRICE HILL', 'Lower Price Hill'),
        ('MADISONVILLE', 'Madisonville'),
        ('MILLVALE', 'Millvale'),
        ('MOUNT ADAMS', 'Mt. Adams'),
        ('MOUNT AIRY', 'Mt. Airy'),
        ('MOUNT AUBURN', 'Mt. Auburn'),
        ('MT. LOOKOUT', 'Mt. Lookout'),
        ('MT. WASHINGTON', 'Mt. Washington'),
        ('NORTH AVONDALE', 'North Avondale'),
        ('NORTH FAIRMOUNT', 'North Fairmount'),
        ('NORTHSIDE', 'Northside'),
        ('OAKLEY', 'Oakley'),
        ('OVER THE RHINE', 'Over-the-Rhine'),
        ('PADDOCK HILLS', 'Paddock Hills'),
        ('PENDLETON', 'Pendleton'),
        ('PLEASANT RIDGE', 'Pleasant Ridge'),
        ('QUEENSGATE', 'Queensgate'),
        ('RIVERSIDE', 'Riverside'),
        ('ROSELAWN', 'Roselawn'),
        ('SAYLER PARK', 'Sayler Park'),
        ('SEDAMSVILLE', 'Sedamsville'),
        ('SOUTH CUMMINSVILLE', 'South Cumminsville'),
        ('SOUTH FAIRMOUNT', 'South Fairmount'),
        ('SPRING GROVE VILLAGE', 'Spring Grove Village'),
        ('WALNUT HILLS', 'Walnut Hills'),
        ('WEST END', 'West End'),
        ('WEST PRICE HILL', 'West Price Hill'),
        ('WESTWOOD', 'Westwood'),
        ('WINTON HILLS', 'Winton Hills'),
    ],
    columns=['neighborhood_raw', 'neighborhood'],
)

In [None]:
# Attach the cleaned display name to each shape, then discard the raw name.
# No key is passed, so the merge joins on whichever columns the two frames share.
neighborhoods = (
    neighborhoods
    .merge(neighborhood_name_mappings)
    .drop(columns=['neighborhood_raw'])
)

In [None]:
# Load the cleaned per-neighborhood crime and population counts.
# `district` is read as text; `crimes` and `population` are read as integers.
crimes_and_pops = pd.read_csv(
    path.join('..', 'data', 'clean', 'crimes_and_pops_counts.csv'),
    dtype={
        'district': str,
        'crimes': int,
        'population': int,
    },
)

# Attach the counts to the shape data. No key is passed, so the merge joins on
# the columns the two frames share (presumably `neighborhood` -- verify), and
# values must match exactly, including letter case, for a row to be kept.
neighborhoods_merged = neighborhoods.merge(crimes_and_pops)

In [None]:
# Load the regional boundary; it is the source of the "unit zero" used to
# check whether a neighborhood sits on the outer edge
region = gpd.read_file(path.join('raw', 'region.json'))

# Keep every merged neighborhood except those in districts "2" and "3".
# This rebinds `neighborhoods` to the filtered set.
# NOTE(review): the reason these two districts are excluded is not recorded here -- confirm
neighborhoods = (
    neighborhoods_merged
    .query('district != "2"')
    .query('district != "3"')
)

# Subtract the neighborhoods from the region to extract the holes and the
# bounding area
border = gpd.overlay(region, neighborhoods, how='difference')

In [None]:
# Drop extraneous columns
border_clean = border.drop(columns=[
    'NHOOD',
    'crimes',
    'district',
    'population',
])

# Name the differenced areas to preserve contiguity in the final region.
# Each name is written with one positional .iloc[row, column] assignment.
# The chained form `border_clean.iloc[0].neighborhood = ...` assigns to a
# temporary row Series; pandas does not guarantee that write reaches
# border_clean, and under Copy-on-Write it never does.
# NOTE(review): assumes the overlay produced at least three rows, in the order
# border, Edgemont, EPSB -- confirm against the installed geopandas version.
neighborhood_col = border_clean.columns.get_loc('neighborhood')
border_clean.iloc[0, neighborhood_col] = 'Border'
border_clean.iloc[1, neighborhood_col] = 'Edgemont'
border_clean.iloc[2, neighborhood_col] = 'EPSB'

In [None]:
# Add the border and "hole" neighborhoods back into the data.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows from zero, as reset_index(drop=True) did.
neighborhoods_final = pd.concat([border_clean, neighborhoods], ignore_index=True)

# Remove Edgemont and Elmwood Place/St. Bernard (EPSB) from the dataset
neighborhoods_final = (
    neighborhoods_final
    .query('neighborhood != "Edgemont"')
    .query('neighborhood != "EPSB"')
)

# Write to file.
# NOTE: shapefile field names are limited to 10 characters, so the
# `neighborhood` column name is shortened in the .shp output.
neighborhoods_final.to_file(path.join('clean', 'neighborhoods.shp'))
# Name the driver explicitly so the .json output is GeoJSON on every geopandas
# version; older releases default to the shapefile driver whatever the extension.
neighborhoods_final.to_file(path.join('clean', 'neighborhoods.json'), driver='GeoJSON')
# The "Border" polygon is written along with the neighborhoods; filter it out
# with query('neighborhood != "Border"') before writing if it is not wanted.

In [None]:
# Display the final table (border plus retained neighborhoods) for a visual check
neighborhoods_final

In [None]:
# Draw the final shapes as a quick visual check of the result
neighborhoods_final.plot()