In [1]:
# import libraries
import numpy as np
from scipy.spatial import cKDTree
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely.geometry import Polygon
import random
import os

In [2]:
def get_geodataframe(precinct_file, district_file, data_path='../data'):
    "Read the precinct and district files from the defined data path and return precinct and district geodataframes"

    # read the geopandas precinct file
    fpath_precinct = os.path.join(data_path, precinct_file)
    precinct_gdf = gpd.read_file(fpath_precinct)

    # read the geopandas district file
    fpath_district = os.path.join(data_path, district_file)
    district_gdf = gpd.read_file(fpath_district)

    return precinct_gdf, district_gdf

In [3]:
def _make_random_points(number, polygon):
    #Generates number of uniformly distributed points in polygon
    points = []
    min_x, min_y, max_x, max_y = polygon.bounds
    i= 0
    while i < number:
        point = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
        if polygon.contains(point):
            points.append(point)
            i += 1
    return points

In [4]:
def random_points_in_polygon(precincts, p=0.01,
                             dem_vote_count_column="D",
                             repub_vote_count_column="R",
                             dem_uniform_swing=0,
                             random_seed=None):
    """
    :param precincts: :class:`geopandas.GeoDataFrame`
                      This is a polygon shapefile with vote totals.
    :param p: Sampling parameter.
              Probability of voter inclusion; inverse of number
              of actual voters represented by each representative
              voter.
    :param dem_vote_count_column: (default="dem")
              Name of column with Democratic vote counts per precinct.
    :param repub_vote_count_column: (default="rep")
              Name of column with Republican vote counts per precinct.
    :param random_seed: (default=None)
              Random state or seed passed to numpy.
    """
    # Make master dataframe
    gf = gpd.GeoDataFrame(columns=['Dem', 'KnnShrDem', 'geometry'])
    election = "P2008_"
    for index, row in precincts.iterrows():
        # Loop over dems and republicans
        for party in [dem_vote_count_column, repub_vote_count_column]:
            points_to_add = np.random.binomial(int(row[election + party]), p)
            points = _make_random_points(points_to_add, row.geometry)
            for point in points:
                if party == "D":
                    dem_value = 1
                else:
                    dem_value = 0

                gf = gf.append({'Dem': dem_value, 'KnnShrDem': None, 'geometry': point}, ignore_index=True)

    gf['Dem'] = gf['Dem'].astype(int)
    gf['KnnShrDem'] = gf['KnnShrDem'].astype(None)

    return gf

In [76]:
def calculate_voter_knn(voter_points, k, target_column='Dem'):
    """
        Calculation composition of nearest neigbhors.

        :param voter_points: :class:`geopandas.GeoDataFrame`.
              GeoDataFrame of voter points
        :param k: Num nearest neighbors to consider.
        :param target_column: Feature to average
    """

    tree = cKDTree(list(zip(voter_points['geometry'].x, voter_points['geometry'].y)))

    dd, ii = tree.query(list(zip(voter_points['geometry'].x, voter_points['geometry'].y)), k=k)

    its = 0
    for index, row in voter_points.iterrows():
        voter_points.at[index, f'KnnShr{target_column}'] = sum(voter_points[target_column][ii[its]]) / k
        its += 1

    return voter_points

In [6]:
def calculate_dislocation(voter_points, district, knn_column='KnnShrDem', dem_column='dem'):
    """
        Calculation difference between knn dem share
        and dem share of assigned district

        :param voter_points: :class:`geopandas.GeoDataFrame`.
              GeoDataFrame of voter points.
        :param voter_points: :class:`geopandas.GeoDataFrame`.
              GeoDataFrame of district polygons.
        :param knn_column: Column of `voter_points` with kNN scores
    """

    # Put both geodataframes in common projection
    districts = district.to_crs(voter_points.crs)

    # Calculate district dem share
    dist_voter_point = gpd.sjoin(districts, voter_points, how='inner')

    # Calculate democrat share for each district
    dist1 = dist_voter_point.groupby(['NAMELSAD']).agg(district_demshare=pd.NamedAgg(column='Dem', aggfunc=np.mean))
    dist1 = dist1.reset_index()

    # merge the dataframe with democrat share in each district to obtain district democrat share for each voter
    final_df = dist_voter_point.merge(dist1, how='left')

    # Calculate dislocation score
    final_df['dislocation'] = final_df['KnnShrDem'] - final_df['district_demshare']

    # Select relevant columns
    dislocation_score_df = final_df[['NAMELSAD', 'Dem', 'KnnShrDem', 'district_demshare', 'dislocation', 'geometry']]

    # final dataframe with dislocation score calculated for each voter
    return dislocation_score_df


In [7]:
df = gpd.GeoDataFrame({'P2008_D': [0, 1], 'P2008_R': [1, 0],
                               'geometry': [Polygon([(0, 0), (1, 1), (0, 1)]),
                                            Polygon([(0, 0), (1, 1), (0, 1)])]})
result = random_points_in_polygon(df, p=1)
benchmark = pd.Series([0, 1], name='Dem')
print(pd.Series.equals(result['Dem'], benchmark))

False


In [8]:
precinct_gdf, district_gdf = get_geodataframe("2008_presidential_precinct_counts.shp", "US_cd114th_2014.shp")

In [9]:
nc = precinct_gdf[precinct_gdf.STATE == "37"]

In [10]:
nc_random_points = random_points_in_polygon(precincts=nc)

In [78]:
nc_voter_knn = calculate_voter_knn(voter_points=nc_random_points, k=2)

In [79]:
nc_voter_knn.crs = nc.crs  # Set crs attribute

In [80]:
nc_dislocation = calculate_dislocation(voter_points=nc_voter_knn, district=district_gdf)

In [81]:
nc_dislocation.head()

Unnamed: 0,NAMELSAD,Dem,KnnShrDem,district_demshare,dislocation,geometry
0,Congressional District 1,1,1.0,0.715023,0.284977,"MULTIPOLYGON (((3576299.204 8107248.105, 35763..."
1,Congressional District 1,1,1.0,0.715023,0.284977,"MULTIPOLYGON (((3576299.204 8107248.105, 35763..."
2,Congressional District 1,0,0.0,0.715023,-0.715023,"MULTIPOLYGON (((3576299.204 8107248.105, 35763..."
3,Congressional District 1,0,0.5,0.715023,-0.215023,"MULTIPOLYGON (((3576299.204 8107248.105, 35763..."
4,Congressional District 1,1,1.0,0.715023,0.284977,"MULTIPOLYGON (((3576299.204 8107248.105, 35763..."


In [42]:
tree = cKDTree(list(zip(test['geometry'].x, test['geometry'].y)))

dd, ii = tree.query(list(zip(test['geometry'].x, test['geometry'].y)), k=2)
its = 0

In [58]:
for index, row in test.iterrows():
    row['KnnShrDem'] = 2

In [46]:
test

Unnamed: 0,Dem,KnnShrDem,geometry
0,1,,POINT (3428778.550 8008995.104)
1,1,,POINT (3431511.812 8007804.096)
2,1,,POINT (3432839.395 8009441.985)
3,0,,POINT (3437220.267 8003770.772)
4,0,,POINT (3429809.226 8006203.248)


In [21]:
type(test['Dem'])

pandas.core.series.Series

In [53]:
test.fillna(1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [62]:
test

Unnamed: 0,Dem,KnnShrDem,geometry
0,1,1.0,POINT (3428778.550 8008995.104)
1,1,1.0,POINT (3431511.812 8007804.096)
2,1,1.0,POINT (3432839.395 8009441.985)
3,0,1.0,POINT (3437220.267 8003770.772)
4,0,1.0,POINT (3429809.226 8006203.248)


In [74]:
for index, row in test.iterrows():
    test.at[index, 'KnnShrDem'] = -2

In [75]:
test

Unnamed: 0,Dem,KnnShrDem,geometry
0,1,-2.0,POINT (3428778.550 8008995.104)
1,1,-2.0,POINT (3431511.812 8007804.096)
2,1,-2.0,POINT (3432839.395 8009441.985)
3,0,-2.0,POINT (3437220.267 8003770.772)
4,0,-2.0,POINT (3429809.226 8006203.248)


In [69]:
tree = cKD
for index, row in test.iterrows():
    row['KnnShrDem'] = sum(test['Dem'][ii[its]]) / k

NameError: name 'cKD' is not defined