In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import dask

from dask.distributed import Client
from datetime import datetime

# Reduce the number of workers, because we need lots of RAM for loading the CBG shapefile
# client = Client(n_workers=2)

# CADES configuration
from dask_jobqueue import SLURMCluster
# Keep the per-job process count modest: using too many processes will
# exhaust the available file descriptors.
cluster = SLURMCluster(
    project='birthright',
    queue='high_mem_cd',
    cores=32,
    memory='300 GB',
    processes=10,
    walltime="4:00:00",
    job_extra=["-N 1"],
    interface="ib0",
)
# Request ten SLURM jobs, then attach a distributed client to the cluster.
cluster.scale(jobs=10)
client = Client(cluster)

In [2]:
def to_date(d):
    """Drop the colon found three characters from the end of ``d``, if any.

    For example ``"...T00:00:00-05:00"`` becomes ``"...T00:00:00-0500"``.
    Strings without a colon in that position are returned unchanged.

    Parameters
    ----------
    d : str
        Timestamp text.

    Returns
    -------
    str
        ``d`` with that single colon removed, or ``d`` itself.
    """
    colon_in_offset = d[-3:-2] == ":"
    return d[:-3] + d[-2:] if colon_in_offset else d

In [3]:
# To experiment with a single week, swap the glob below for one file, e.g.
# "data/safegraph/weekly-patterns/2020-03-22-weekly-patterns.csv".
# The glob loads the entire dataset.
patterns = (
    dd.read_csv(
        "data/safegraph/weekly-patterns/*.csv",
        dtype={'naics_code': 'float64'},
    )
    .set_index('safegraph_place_id')
)

# Calendar date on which each pattern's date range starts.
patterns['date'] = patterns['date_range_start'].apply(
    lambda x: datetime.strptime(to_date(x), '%Y-%m-%dT%H:%M:%S%z').date(),
    meta=('date_range_start', 'object'),
)

# First four characters of the NAICS code's string form.
patterns['naics_4'] = patterns['naics_code'].apply(
    lambda x: str(x)[0:4],
    meta=('naics_code', 'str'),
)


In [4]:
# Core POI table keyed by place id; only the coordinates are kept.
poi = (
    dd.read_csv("data/safegraph/core/core_poi*.csv")
    .set_index('safegraph_place_id')
)[['latitude', 'longitude']]

In [5]:
def compute_location_cbg(df):
    """Find the census block group (CBG) that contains each row's coordinates.

    Written to be applied to one pandas partition at a time (it is passed to
    ``map_partitions``). The block-group shapefile is read from disk on every
    call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latitude`` and ``longitude`` columns.

    Returns
    -------
    pandas.Series
        The ``GEOID`` of the block group each point lies within, named
        ``location_cbg``. A left spatial join is used, so points that match no
        block group are kept with a null value.
    """
    import inspect

    import geopandas as gpd
    from shapely.geometry import Point

    localdf = df[['latitude', 'longitude']].copy()

    # Label the block groups as EPSG:4269, then reproject to EPSG:4326 so they
    # share a CRS with the points built below.
    cbgs = gpd.read_file("data/reference/census/block_groups.shp")
    cbgs.crs = 'epsg:4269'
    cbgs = cbgs.to_crs('epsg:4326')

    points = [Point(xy) for xy in zip(localdf['longitude'], localdf['latitude'])]
    local_gdf = gpd.GeoDataFrame(localdf, crs="epsg:4326", geometry=points)

    # geopandas renamed sjoin's `op` keyword to `predicate` and later removed
    # `op`; use whichever name the installed version accepts.
    sjoin_params = inspect.signature(gpd.sjoin).parameters
    predicate_kw = 'predicate' if 'predicate' in sjoin_params else 'op'
    local_gdf = gpd.sjoin(local_gdf, cbgs, how='left', **{predicate_kw: 'within'})

    return local_gdf.GEOID.rename('location_cbg')
# Tag every POI with the block group it falls in, one partition at a time.
poi['location_cbg'] = poi.map_partitions(
    compute_location_cbg,
    meta=('location_cbg', 'str'),
)

In [6]:
# Move safegraph_place_id out of the index and back into a regular column.
df = patterns.reset_index()
# Optional state filter for experiments:
# df = df[(df.region == 'GA') | (df.region == 'CA')]

In [7]:
def strings_to_int(lists):
    """Convert an iterable of values to an ``int32`` array, best effort.

    Each element is passed through ``int()``; elements that cannot be
    converted (empty strings, non-numeric text, ``None``, infinities, ...)
    become ``0`` instead of raising.

    Parameters
    ----------
    lists : iterable
        Values to convert, typically strings.

    Returns
    -------
    numpy.ndarray
        One ``int32`` entry per input element.
    """
    def _to_int(value):
        try:
            return int(value)
        # Only conversion failures are treated as "unparseable"; a bare
        # `except:` would also swallow KeyboardInterrupt and SystemExit.
        except (ValueError, TypeError, OverflowError):
            return 0

    return np.array([_to_int(x) for x in lists]).astype(np.int32)

def get_cbgs(df):
    """Expand ``visitor_home_cbgs`` into one row per (place, home CBG) entry.

    ``visitor_home_cbgs`` is treated as text of the form
    ``{"<cbg>":<count>,...}``: braces and quotes are stripped, the remainder
    is split on commas, and every ``<cbg>:<count>`` entry becomes its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        One partition containing ``safegraph_place_id``, ``date``,
        ``visitor_home_cbgs``, ``visits_by_day`` and ``naics_4``.

    Returns
    -------
    pandas.DataFrame
        Columns ``cbgs``, ``safegraph_place_id``, ``date``, ``visits_by_day``,
        ``naics_4``, ``party`` and ``total_visits``. ``cbgs`` is the text
        before the first ``:`` of an entry, ``party`` the text after the last
        ``:`` as ``int32``, and ``total_visits`` the sum of ``visits_by_day``
        multiplied by ``party``. Rows with any null value or an empty entry
        are dropped; the index repeats the source row label for each entry.
    """
    df = df[['safegraph_place_id', 'date', 'visitor_home_cbgs', 'visits_by_day', 'naics_4']]

    # explode() yields one row per list element while repeating the source
    # index label. Unlike apply(pd.Series).stack() it does not build a Series
    # per row and it also works on an empty partition.
    entries = (
        df.visitor_home_cbgs
        .str.translate(str.maketrans({'{': '', '}': '', '"': ''}))
        .str.split(',')
        .explode()
    )
    df = (
        entries.to_frame('cbgs')
        .merge(df, left_index=True, right_index=True)
        .drop(['visitor_home_cbgs'], axis=1)
        .dropna()
    )

    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than a slice of the merged one.
    df = df[df.cbgs != ""].copy()

    # Split once; both derived columns come from the original entry text.
    parts = df.cbgs.str.split(':')
    df['party'] = parts.str[-1].astype(np.int32)
    df['cbgs'] = parts.str[0].astype('str')

    visits_total = (
        df['visits_by_day']
        .str.translate(str.maketrans({'[': '', ']': '', '"': ''}))
        .str.split(',')
        .apply(strings_to_int)
        .apply(np.sum)
        .astype(np.int32)
    )
    df['total_visits'] = visits_total * df.party

    return df

# meta must match what get_cbgs returns. naics_4 is built with str(x)[0:4],
# so it is a string column, not int32.
df = df.map_partitions(
    get_cbgs,
    meta={
        'cbgs': 'str',
        'safegraph_place_id': 'str',
        'date': 'object',
        'visits_by_day': 'object',
        'naics_4': 'str',
        'party': 'int32',
        'total_visits': 'int32',
    },
)

In [8]:
# Bring in each place's latitude, longitude and location_cbg; places missing
# from the POI table are kept (left join).
df = df.join(
    poi,
    on='safegraph_place_id',
    how='left',
)

In [9]:
def compute_cbg_distances(df):
    """Add a ``distance`` column: home-CBG centroid to the place's coordinates.

    Written to be applied to one pandas partition at a time (it is passed to
    ``map_partitions``). The block-group shapefile is read from disk on every
    call. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``cbgs`` (matched against the shapefile's ``GEOID``),
        ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus a float ``distance``. The merge produces a
        fresh default integer index. ``distance`` is ``0.0`` when the
        coordinates are missing or zero, or when no block-group geometry
        matched.

    Notes
    -----
    NOTE(review): both geometries are in EPSG:4326 when the distance is
    taken, so the value is in degrees rather than a length unit. Confirm
    this is what downstream consumers expect.
    """
    import geopandas as gpd
    from shapely.geometry import Point

    def loc_and_distance(geometry, longitude, latitude):
        # NaN is truthy, so missing coordinates need an explicit check to
        # reach the 0.0 fallback that the falsy test below is meant to give.
        if pd.isna(longitude) or pd.isna(latitude):
            return 0.0
        if not (longitude and latitude and geometry):
            return 0.0
        return geometry.centroid.distance(Point(longitude, latitude))

    block_groups = gpd.read_file("data/reference/census/block_groups.shp")
    block_groups.crs = 'epsg:4269'
    block_groups = block_groups.to_crs('epsg:4326')

    # assign() builds a new frame, so the caller's partition is left as is.
    df = df.assign(cbgs=df.cbgs.fillna(""))

    df = df.merge(
        block_groups[['GEOID', 'geometry']],
        left_on='cbgs',
        right_on='GEOID',
        how='left',
    ).drop(['GEOID'], axis=1)

    # Build the column explicitly: DataFrame.apply(axis=1) returns a
    # DataFrame, not a Series, when the partition is empty.
    distances = [
        loc_and_distance(geometry, longitude, latitude)
        for geometry, longitude, latitude
        in zip(df['geometry'], df['longitude'], df['latitude'])
    ]
    df['distance'] = pd.Series(distances, index=df.index, dtype='float64')

    return df.drop(['geometry'], axis=1)

# meta must match what compute_cbg_distances returns. naics_4 is a string
# column (built with str(x)[0:4]), and latitude/longitude come from read_csv
# without a dtype override, i.e. float64.
df = df.map_partitions(
    compute_cbg_distances,
    meta={
        'cbgs': 'str',
        'safegraph_place_id': 'str',
        'date': 'object',
        'visits_by_day': 'object',
        'naics_4': 'str',
        'party': 'int32',
        'total_visits': 'int32',
        'latitude': 'float64',
        'longitude': 'float64',
        'location_cbg': 'str',
        'distance': 'float64',
    },
)

In [None]:
# Write the resulting frame to Parquet; the %time magic reports how long the
# statement takes. NOTE(review): the Dask operations above are presumably
# lazy, so most of the pipeline's work likely runs here - confirm.
%time df.to_parquet("data/output/SG-distance-matrix.parquet")