In [17]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from shapely.geometry import Point
import geopandas as gpd
import numpy as np

### Load data

In [18]:
# Lazily read the relocation records. FIPS and census-tract codes are read as
# strings so they are treated as identifiers, not numbers.
moves = dd.read_csv(
    '../data/moves_w_dists.csv',
    dtype={
        'PROP_CENSUSTRACT_from': str,
        'PROP_FIPSCD_from': str,
        'PROP_CENSUSTRACT_to': str,
        'PROP_FIPSCD_to': str,
    },
    blocksize=25e6,
)
# Index by person ID so the table can be joined to demographics on its index.
moves = moves.set_index('pid')

In [19]:
# Preview the first rows of `moves` to sanity-check the load and the index.
moves.head()

Unnamed: 0_level_0,from_addrid,from_lat,from_lon,PROP_FIPSCD_from,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,PROP_UNVBLDSQFT_from,...,PROP_FIPSCD_to,PROP_CENSUSTRACT_to,PROP_MUNINAME_to,PROP_OWNEROCC_to,PROP_QLTY_to,PROP_VALCALC_to,PROP_UNVBLDSQFT_to,PROP_BEDRMS_to,to_effdate,distance
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y39394000021058,65593649.0,37.8882,-122.545,6041,1282002004,MILL VALLEY SCHOOL AREA TAMAL,A,QGO,612199.0,2297.0,...,6041,1261002011,MILL VALLEY,O,QGO,784444.0,1300.0,2.0,201303.0,1382.667421
Y39394000021058,213725788.0,37.8971,-122.534,6041,1261002011,MILL VALLEY,O,QGO,784444.0,1300.0,...,6041,1241001017,MILL VALLEY SCHOOL AREA ALTO,,QGO,592507.0,2890.0,5.0,201602.0,2039.66458
Y39394000054683,117484545.0,37.9001,-122.274,6001,4212004007,BERKELEY INCORP,A,QAV,85981.0,1466.0,...,6001,4212001004,BERKELEY INCORP,O,QAV,811687.0,1078.0,2.0,201304.0,441.693361
Y39394000108630,65353580.0,37.775108,-122.48973,6075,478013005,,,,130942.0,3400.0,...,6001,4014002010,OAKLAND INCORP,A,QAV,326197.0,1269.0,2.0,201808.0,19625.242557
Y39394000231102,124297011.0,37.9828,-121.837,6013,3551072003,,O,QGO,316975.0,2522.0,...,6013,3040052164,,O,,271688.0,2723.0,3.0,201007.0,20241.509295


In [20]:
# Show how many partitions the moves table was split into.
moves.npartitions

38

In [21]:
# Columns to pull from the demographic files: the person ID used as the join
# key, followed by the demographic attributes kept for each mover.
demog_cols = [
    'PID',
    'AGE',
    'LOR',
    'HOMEOWNERCD',
    'EHI',
    'PCTB',
    'PCTW',
    'PCTA',
    'PCTH',
]

In [22]:
# Parse AGE and LOR as floats (AGE has blanks in the data, which an integer
# column could not hold); every other requested column is read as a string.
demog_dtypes = {
    col: (float if col in ('AGE', 'LOR') else str)
    for col in demog_cols
}

In [23]:
# Lazily read the selected demographic columns from every file matching the
# glob, using the explicit dtypes defined for those columns.
demog = dd.read_csv(
    '/home/data/infutor/NARC3/bay_area_mover*.csv',
    dtype=demog_dtypes,
    usecols=demog_cols,
    assume_missing=True,
)
# Index by person ID so the table can be joined to the moves on its index.
demog = demog.set_index('PID')

In [24]:
# Preview the first rows of `demog` to sanity-check the load and the dtypes.
demog.head()

Unnamed: 0_level_0,AGE,LOR,HOMEOWNERCD,EHI,PCTB,PCTW,PCTH,PCTA
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Y39394000101840,,9.0,H,D,S,I,O,R
Y39394000112368,78.0,13.0,H,D,T,C,R,T
Y39394000159895,,1.0,,H,T,E,S,S
Y39394000190759,52.0,8.0,H,H,T,I,Q,O
Y39394000215765,73.0,15.0,R,E,S,M,P,N


### Merge data

In [25]:
# Join each relocation record to the mover's demographic profile on the
# person-ID index. This is an inner join, so moves without a matching profile
# are dropped from the result.
moves_w_demog = moves.merge(
    demog,
    how='inner',
    left_index=True,
    right_index=True,
)

In [26]:
# Execute the merge, showing a progress bar while it runs.
# NOTE(review): this rebinds `moves_w_demog` to the computed result, so
# re-running this cell on its own calls .compute() on that result instead of
# on the merge — re-run the merge cell first.
with ProgressBar():
    moves_w_demog = moves_w_demog.compute()

[########################################] | 100% Completed |  2min 22.0s


In [27]:
# Preview the merged table: move columns followed by the demographic columns.
moves_w_demog.head()

Unnamed: 0,from_addrid,from_lat,from_lon,PROP_FIPSCD_from,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,PROP_UNVBLDSQFT_from,...,to_effdate,distance,AGE,LOR,HOMEOWNERCD,EHI,PCTB,PCTW,PCTH,PCTA
Y39394000231102,124297011.0,37.9828,-121.837,6013,3551072003,,O,QGO,316975.0,2522.0,...,201007.0,20241.509295,93.0,3.0,,A,S,F,P,S
Y39394000242239,52245208.0,37.9266,-122.389,6013,3780001182,,A,,331019.0,1697.0,...,201610.0,8068.106654,93.0,15.0,H,D,S,F,Q,S
Y39394000314991,186863857.0,37.774,-122.26,6001,4279004000,ALAMEDA INCORP,A,QAV,309321.0,1621.0,...,201706.0,1601.150776,48.0,8.0,H,G,R,P,R,L
Y39394000381839,47811725.0,37.5774,-121.993,6001,4412002003,FREMONT INCORP,O,QAV,355445.0,1140.0,...,201304.0,21687.900364,76.0,6.0,R,G,T,H,R,P
Y39394000381839,83668792.0,37.751273,-122.43469,6075,212002008,,,,1009257.0,3175.0,...,200310.0,43484.264229,76.0,6.0,R,G,T,H,R,P


In [28]:
# Row counts after and before the join, used to report the match rate.
num_moves_w_demog, num_total_moves = len(moves_w_demog), len(moves)

In [29]:
# Report how many relocation records found a demographic profile, with the
# share expressed as a percentage rounded to one decimal place.
print(
    'Matched movers to demographic profiles '
    'for {0} of {1} ({2}%) of relocation records.'.format(
        num_moves_w_demog,
        num_total_moves,
        round(100 * num_moves_w_demog / num_total_moves, 1),
    )
)

Matched movers to demographic profiles for 2832194 of 4610235 (61.4%) of relocation records.


### Save results

In [30]:
# Write the merged table to disk. The merged frame's index holds the person
# IDs but has no name (the merged preview shows a blank index header), so it
# is labelled explicitly here; otherwise the ID column would be written with
# an empty header. 'pid' matches the index key of the moves table.
moves_w_demog.to_csv('../data/movers.csv', index_label='pid')