In [None]:
# Set of helper functions used in notebooks to interact with and visualize the series data

In [2]:
# import libraries
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shapefile as shp
from shapely.geometry import Polygon, Point
from shapely.wkb import loads

In [3]:
# Prepare the data for use. Note that this process is specific to the columns selected for this analysis.

def clean_data(file):
    """
    Load the demographic CSV and reduce it to the columns used in this analysis.

    The first row after the header is discarded (presumably a second,
    descriptive header row in the raw export -- confirm against the raw file).
    The selected columns are renamed to readable names, the geography label is
    split on commas into ``census_tract`` (first piece) and ``boro`` (second
    piece), and the count columns are converted to integers.

    Argument:
    file -- path or file-like object accepted by ``pd.read_csv``

    Returns:
    demog_data -- DataFrame with columns ``GEO_display_label``, ``total_pop``,
                  ``tot_whites``, ``tot_blacks``, ``tot_natives``,
                  ``tot_asians``, ``tot_hawaiians``, ``tot_other``,
                  ``tot_twoplus``, ``census_tract`` and ``boro``. The row index
                  of the raw file is preserved (it starts at 1 because the
                  first row is dropped).

    Raises:
    KeyError -- if an expected column is missing, or a label has no comma
    ValueError -- if a count value cannot be converted to an integer
    """

    # raw column name -> readable name; insertion order is the output column order
    column_names = {'GEO.display-label': 'GEO_display_label',
                    'HD01_VD01': 'total_pop',
                    'HD01_VD02': 'tot_whites',
                    'HD01_VD03': 'tot_blacks',
                    'HD01_VD04': 'tot_natives',
                    'HD01_VD05': 'tot_asians',
                    'HD01_VD06': 'tot_hawaiians',
                    'HD01_VD07': 'tot_other',
                    'HD01_VD08': 'tot_twoplus'}
    count_cols = [name for name in column_names.values() if name != 'GEO_display_label']

    # read data file (csv) and skip the first row after the header
    demog = pd.read_csv(file).iloc[1:]

    # Build an independent frame: the non-inplace rename plus .copy() means the
    # column assignments below never write into a slice of the raw data
    # (the original pattern triggered SettingWithCopyWarning).
    demog_data = demog[list(column_names)].rename(columns=column_names).copy()

    # pull apart the location label so the census tract and the boro county
    # end up in two separate columns (at most 3 pieces: tract, county, rest)
    label_parts = demog_data['GEO_display_label'].str.split(',', n=2, expand=True)
    demog_data['census_tract'] = label_parts[0].str.strip()
    demog_data['boro'] = label_parts[1].str.strip()

    # convert the count data to integers in one pass
    demog_data = demog_data.astype({col: int for col in count_cols})

    return demog_data

In [4]:
## Many of the following functions were informed by the shapefile tutorial here: https://towardsdatascience.com/mapping-geograph-data-in-python-610a963d2d7f

In [5]:
# read in the shapefiles

def read_shapefile(sf):
    """
    Convert a shapefile reader into a pandas DataFrame.

    Argument:
    sf -- object exposing ``fields``, ``records()`` and ``shapes()``
          (presumably a pyshp ``shapefile.Reader`` -- confirm with callers)

    Returns:
    DataFrame with one row per record, one column per field (the first entry
    of ``sf.fields`` is skipped) and an extra ``coords`` column holding the
    ``points`` of the matching shape.
    """
    # first element of each field descriptor is its name; drop the leading entry
    column_names = [descriptor[0] for descriptor in sf.fields]
    column_names = column_names[1:]

    attribute_rows = sf.records()
    outlines = [geometry.points for geometry in sf.shapes()]

    table = pd.DataFrame(data=attribute_rows, columns=column_names)
    table['coords'] = outlines
    return table

In [6]:
# fill a single census tract

def plot_map_fill(id, sf, x_lim=None, y_lim=None, figsize=(11, 9), color='r'):
    """
    Draw the outline of every shape in ``sf`` and fill one of them.

    Arguments:
    id -- index passed to ``sf.shape`` to select the shape to fill
    sf -- object exposing ``shapeRecords()`` and ``shape(i)``
          (presumably a pyshp ``shapefile.Reader`` -- confirm with callers)
    x_lim, y_lim -- optional axis limits; they are applied only when BOTH are
                    given (passing just one leaves the default limits)
    figsize -- size of the figure
    color -- fill color of the selected shape

    Returns:
    None; the map is drawn on a newly created matplotlib figure.
    """
    # One subplots() call creates the figure and its axes. The previous extra
    # plt.figure() call left an empty figure behind on every invocation.
    fig, ax = plt.subplots(figsize=figsize)

    # NOTE(review): each shape's points are drawn as a single path; shapes made
    # of several parts are not split here.
    for shape_record in sf.shapeRecords():
        outline = shape_record.shape.points
        ax.plot([pt[0] for pt in outline], [pt[1] for pt in outline], 'k', linewidth=0.5)

    selected = sf.shape(id).points
    ax.fill([pt[0] for pt in selected], [pt[1] for pt in selected], color)

    # Identity test + `and`: `!= None` combined with `&` is evaluated
    # element-wise for array-like limits and raises inside an `if`.
    if x_lim is not None and y_lim is not None:
        ax.set_xlim(x_lim)
        ax.set_ylim(y_lim)

In [7]:
# fill in multiple census tracts on same map

def plot_map_fill_multiples_ids(title, comuna, sf, x_lim = None, y_lim = None, figsize = (11,9), color = 'r'):
    """
    Draw the outline of every shape in ``sf`` and fill several of them.

    Arguments:
    title -- figure title (set with ``fig.suptitle``)
    comuna -- iterable of indices; each one is passed to ``sf.shape`` and the
              resulting shape is filled
    sf -- object exposing ``shapeRecords()`` and ``shape(i)``
          (presumably a pyshp ``shapefile.Reader`` -- confirm with callers)
    x_lim, y_lim -- optional axis limits; they are applied only when BOTH are
                    given (passing just one leaves the default limits)
    figsize -- size of the figure
    color -- fill color used for every selected shape

    Returns:
    None; the map is drawn on a newly created matplotlib figure.
    """
    # One subplots() call creates the figure and its axes. The previous extra
    # plt.figure() call left an empty figure behind on every invocation.
    fig, ax = plt.subplots(figsize=figsize)
    fig.suptitle(title, fontsize=16)

    # NOTE(review): each shape's points are drawn as a single path; shapes made
    # of several parts are not split here.
    for shape_record in sf.shapeRecords():
        outline = shape_record.shape.points
        ax.plot([pt[0] for pt in outline], [pt[1] for pt in outline], 'k', linewidth=0.5)

    for shape_index in comuna:
        selected = sf.shape(shape_index).points
        ax.fill([pt[0] for pt in selected], [pt[1] for pt in selected], color)

    # Identity test + `and`: `!= None` combined with `&` is evaluated
    # element-wise for array-like limits and raises inside an `if`.
    if x_lim is not None and y_lim is not None:
        ax.set_xlim(x_lim)
        ax.set_ylim(y_lim)

In [8]:
# plot the outline of every shape in the full map (this function does not apply any demographic color coding)

def plot_map(sf, x_lim = None, y_lim = None, figsize = (11,9)):
    """
    Draw the outline of every shape in ``sf`` on a new figure.

    Arguments:
    sf -- object exposing ``shapeRecords()``
          (presumably a pyshp ``shapefile.Reader`` -- confirm with callers)
    x_lim, y_lim -- optional axis limits; they are applied only when BOTH are
                    given (passing just one leaves the default limits)
    figsize -- size of the figure

    Returns:
    None; the map is drawn on a newly created matplotlib figure.
    """
    plt.figure(figsize=figsize)

    # The per-shape counter and centroid that used to be computed here were
    # never used, so they are gone (saves one np.mean pair per shape).
    for shape_record in sf.shapeRecords():
        outline = shape_record.shape.points
        plt.plot([pt[0] for pt in outline], [pt[1] for pt in outline], 'k', linewidth=0.5)

    # Identity test + `and`: `!= None` combined with `&` is evaluated
    # element-wise for array-like limits and raises inside an `if`.
    if x_lim is not None and y_lim is not None:
        plt.xlim(x_lim)
        plt.ylim(y_lim)

In [9]:
# plot a single polygon shape

def plot_shape(id, sf, s=None):
    """
    Plot the outline of a single shape and label it at the mean of its points.

    Arguments:
    id -- index passed to ``sf.shape`` to select the shape
    sf -- object exposing ``shape(i)``; the returned shape must provide
          ``points`` and ``bbox``
    s -- text drawn at the mean x / mean y of the shape's points

    Returns:
    (x0, y0) -- mean x and mean y of the shape's points
    """
    plt.figure()
    axes = plt.axes()
    axes.set_aspect('equal')

    target = sf.shape(id)
    vertices = target.points

    # column vectors of float coordinates, one row per vertex
    x_lon = np.array([vertex[0] for vertex in vertices], dtype=float).reshape(-1, 1)
    y_lat = np.array([vertex[1] for vertex in vertices], dtype=float).reshape(-1, 1)

    plt.plot(x_lon, y_lat)

    x0, y0 = np.mean(x_lon), np.mean(y_lat)
    plt.text(x0, y0, s, fontsize=10)

    # only the x-range is clamped, using bbox entries 0 and 2
    left, right = target.bbox[0], target.bbox[2]
    plt.xlim(left, right)
    return x0, y0

In [10]:
# generate random points within a particular spatial polygon

def random_points_within(poly, num_points):
    """
    Generate random points that lie inside a polygon.

    Candidates are drawn uniformly from the polygon's bounding box
    (``poly.bounds``) with the module-level ``random`` generator and kept only
    when ``candidate.within(poly)`` is true, until ``num_points`` points have
    been collected. Seed ``random`` beforehand for reproducible output.

    Arguments:
    poly -- geometry exposing ``bounds``, ``is_empty`` and ``area``
            (presumably a shapely Polygon -- confirm with callers)
    num_points -- number of points to generate; values <= 0 yield an empty list

    Returns:
    list of ``Point`` objects located within ``poly``

    Raises:
    ValueError -- if points are requested from an empty or zero-area geometry
    """
    # No candidate can ever be accepted for an empty or zero-area geometry, so
    # the sampling loop below would never terminate. Fail loudly instead.
    if num_points > 0 and (poly.is_empty or poly.area == 0):
        raise ValueError("cannot sample points within an empty or zero-area geometry")

    min_x, min_y, max_x, max_y = poly.bounds
    points = []
    while len(points) < num_points:
        candidate = Point([random.uniform(min_x, max_x), random.uniform(min_y, max_y)])
        if candidate.within(poly):
            points.append(candidate)
    return points

In [11]:
# generate a list of individuals where the number of people of each race is based on the census data from each tract, and race is randomly assigned
def generate_people(shape, demographics, boro, tract):
    """
    Build a shuffled list of race codes for one census tract.

    Arguments:
    shape -- DataFrame with ``countyfp``, ``namelsad`` and ``coords`` columns
    demographics -- DataFrame with ``boro``, ``census_tract``, ``total_pop``
                    and the ``tot_*`` count columns
    boro -- county name; one of the keys of the conversion table below
    tract -- tract name, compared against ``shape.namelsad`` and
             ``demographics.census_tract``

    Returns:
    coordinates -- ``coords`` value of the first matching row in ``shape``
    population -- ``total_pop`` of the first matching row in ``demographics``
    codes -- list containing one race label per counted person, in random
             order (shuffled with the module-level ``random`` generator; seed
             it beforehand for reproducible output)

    Raises:
    KeyError -- if ``boro`` is not in the conversion table
    IndexError -- if no row matches the boro / tract combination
    """
    # county name -> county code as stored in shape.countyfp
    boro_conversion = {'Bronx County' : '005',
             'Kings County' : '047',
             'New York County' : '061',
             'Queens County' : '081',
             'Richmond County' : '085'}

    # race label -> demographics column holding its head count; the order
    # fixes the pre-shuffle layout, so seeded results stay the same
    race_columns = (('white', 'tot_whites'),
                    ('black', 'tot_blacks'),
                    ('native', 'tot_natives'),
                    ('asian', 'tot_asians'),
                    ('hawaiian', 'tot_hawaiians'),
                    ('other', 'tot_other'),
                    ('two_plus', 'tot_twoplus'))

    boro_num = boro_conversion[boro]

    coordinates = shape.loc[(shape.countyfp==boro_num) & (shape.namelsad == tract),'coords'].iloc[0]

    # select the tract's rows once instead of rebuilding the same mask per column
    tract_rows = demographics.loc[(demographics.boro==boro) & (demographics.census_tract==tract)]

    population = tract_rows['total_pop'].iloc[0]

    # one label per person of each race, then randomize the order
    codes = []
    for label, column in race_columns:
        codes.extend([label] * tract_rows[column].iloc[0])
    random.shuffle(codes)

    return coordinates, population, codes

In [12]:
# calculate the Euclidean distance between a point and the centroid

def calc_euclidean(x,y,x_ref,y_ref):
    """
    Return the Euclidean distance between (x, y) and (x_ref, y_ref).

    Arguments:
    x, y -- coordinates of the point
    x_ref, y_ref -- coordinates of the reference point (e.g. a centroid)

    Returns:
    distance -- straight-line distance between the two points, as a float
    """
    # math.hypot computes sqrt(dx**2 + dy**2); the `math` module is imported
    # with the other libraries at the top of the notebook (it was previously
    # missing, so every call raised NameError).
    distance = math.hypot(x - x_ref, y - y_ref)
    return distance