In [1]:
from __future__ import division, print_function

import numpy as np
import pandas as pd
import geopandas as gpd

### Overlap Method

Each shape is cross joined against all other shapes drawn for the same neighborhood. The join is done in Pandas so that `neighborhood` can be used as the join key.

GeoPandas stores each geometry as a Shapely object in its row, even outside of a GeoSeries, so the area and the overlap can be calculated directly from the row values.

In [2]:
# Load the neighborhood shapes from GeoJSON and preview the first rows
geojson_path = "dna_neighborhoods.geojson"
dna_df = gpd.read_file(geojson_path)

dna_df.head()

Unnamed: 0,geometry,neighborhood
0,POLYGON ((-87.68754959106445 41.92488743920406...,bucktown
1,POLYGON ((-87.68772125244141 41.92693092226143...,bucktown
2,POLYGON ((-87.68763542175293 41.92725021057667...,bucktown
3,POLYGON ((-87.66776561737061 41.91074091042736...,bucktown
4,"POLYGON ((-87.65356063842772 41.9112199638231,...",bucktown


In [3]:
# Remove invalid geometries, warnings refer to the geometries being removed.
# .copy() makes the filtered frame independent of the loaded one, so the column
# assignments below do not raise SettingWithCopyWarning.
dna_df = dna_df[dna_df['geometry'].is_valid].copy()

# Area of each shape, used later as the denominator of the overlap ratio
dna_df["area"] = dna_df["geometry"].area

# Keep the original row label as a column so each shape can be identified after the join
dna_df['col_index'] = dna_df.index

Self-intersection at or near point -87.686438916304667 41.910271695783507
Self-intersection at or near point -87.788491453218157 41.98601286335942
Self-intersection at or near point -87.729263305664062 41.983356261006165
Too few points in geometry component at or near point -87.6424241065979 41.892278647917657
Self-intersection at or near point -87.662614948420355 41.944892143035375
Self-intersection at or near point -87.630632952110147 41.866987407744745
Self-intersection at or near point -87.656278069661781 41.965673971769824
Self-intersection at or near point -87.651886333931159 41.973543484149701
Self-intersection at or near point -87.645185495158671 41.954885875733162
Self-intersection at or near point -87.696653134388384 41.975925779834952
Self-intersection at or near point -87.703229443024071 41.965477682669018
Self-intersection at or near point -87.717500791285502 41.941382570329075
Self-intersection at or near point -87.787422365928435 41.970491745681166
Self-intersection at o

In [4]:
# Create a csv with the counts of each neighborhood.
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# rename_axis / reset_index(name=...) name both columns explicitly, so the result
# does not depend on how a given pandas version labels the counts.
merge_count = (
    dna_df['neighborhood']
    .value_counts()
    .rename_axis('neighborhood')
    .reset_index(name='count')
)
merge_count.to_csv('neighborhood_count.csv', index=False)
merge_count.head()

Unnamed: 0,neighborhood,count
0,ravenswood,78
1,lakeview,77
2,logan-square,75
3,lincoln-square,74
4,uptown,73


In [5]:
# Build the right-hand side of the self join: only the columns needed for matching,
# renamed so they do not collide with the left-hand columns after the merge
dna_match = dna_df[['neighborhood', 'geometry', 'col_index']].rename(
    columns={'geometry': 'geometry_y', 'col_index': 'col_index_y'}
)

# Create cross join on neighborhoods and drop any shapes matched against themselves.
# .copy() materialises the filtered result so that adding columns to it later
# does not raise SettingWithCopyWarning.
dna_merge = pd.merge(dna_df, dna_match, on='neighborhood')
dna_merge = dna_merge[dna_merge['col_index'] != dna_merge['col_index_y']].copy()

In [6]:
# Define function to be applied against each row, more readable than lambda
def get_overlap(row):
    """Return the fraction of a shape's area that is covered by its paired shape.

    Parameters
    ----------
    row : mapping
        One row of the cross-joined frame. It must provide 'geometry' and
        'geometry_y' (objects exposing ``intersection`` and ``area``) and
        'area' (the area of 'geometry').

    Returns
    -------
    float
        Area of the intersection divided by ``row['area']``, or NaN when
        ``row['area']`` is zero, since the ratio is undefined there.
    """
    base_area = row['area']
    # A zero-area shape would otherwise raise ZeroDivisionError (or yield inf) and
    # abort the whole apply; NaN marks the row as undefined so mean() skips it.
    if base_area == 0:
        return float('nan')
    return row['geometry'].intersection(row['geometry_y']).area / base_area

In [7]:
# Score every shape pair with get_overlap (one value per row of the cross join)
dna_merge['overlap'] = dna_merge.apply(get_overlap, axis=1)

# Average the pairwise overlap within each neighborhood
overlap_by_neighborhood = dna_merge.groupby(['neighborhood'], as_index=False)['overlap'].mean()

# Attach the number of shapes per neighborhood, read back from the counts csv
count_dna = pd.read_csv('neighborhood_count.csv')
dna_group = overlap_by_neighborhood.merge(count_dna, on='neighborhood')

# Save the summary and preview it
dna_group.to_csv('neighborhood_overlap.csv', index=False)
dna_group.head()

Unnamed: 0,neighborhood,overlap,count
0,albany-park,0.599146,50
1,altgeld-gardens,0.0,3
2,andersonville,0.551603,70
3,archer-heights,0.614201,6
4,armour-square,0.519605,5
