In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from shapely.geometry import Point
import geopandas as gpd
import numpy as np

### Load data

In [2]:
# Open the long-format moves table with dask, reading only the columns that
# the rest of the notebook uses.
move_columns = ['pid', 'from_addrid', 'from_effdate', 'to_addrid', 'to_effdate']
moves = dd.read_parquet('../data/moves_long.parquet', columns=move_columns)

In [3]:
# Preview the first rows to check that the selected columns loaded.
moves.head()

Unnamed: 0,pid,from_addrid,from_effdate,to_addrid,to_effdate
0,Y39394398562992,202102955.0,201211.0,29500074.0,201406.0
0,Y39394652594475,119078295.0,197806.0,45967481.0,199304.0
0,Y39394886301278,32888015.0,199304.0,8971746.0,199406.0
0,Y39394652594475,45967481.0,199304.0,224561475.0,200002.0
0,Y39394445944928,187599562.0,200404.0,9920094.0,200912.0


In [4]:
# Read only the property columns used later in this notebook. The FIPS and
# census-tract codes are read as strings; the county filter further down
# compares PROP_FIPSCD against zero-padded strings such as '06001'.
property_columns = [
    'ADDRID', 'ADDRID2', 'DPV', 'PROP_LATITUDE', 'PROP_LONGITUDE', 'PROP_FIPSCD',
    'PROP_CENSUSTRACT', 'PROP_MUNINAME', 'PROP_OWNEROCC', 'PROP_QLTY',
    'PROP_VALCALC', 'PROP_UNVBLDSQFT', 'PROP_BEDRMS',
]
property_dtypes = {'PROP_CENSUSTRACT': str, 'PROP_FIPSCD': str}
properties = dd.read_csv(
    '/home/data/infutor/Property/bay_area_properties*',
    usecols=property_columns,
    dtype=property_dtypes,
)

In [5]:
# Preview the first property rows.
properties.head()

Unnamed: 0,DPV,PROP_FIPSCD,PROP_CENSUSTRACT,PROP_MUNINAME,PROP_LATITUDE,PROP_LONGITUDE,PROP_OWNEROCC,PROP_QLTY,PROP_VALCALC,PROP_UNVBLDSQFT,PROP_BEDRMS,ADDRID,ADDRID2
0,,6041,1011001023,NOVATO UNIFIED SCHOOL AREA,,,,,,,,248051559,239867607
1,,6095,2501052020,VALLEJO,,,A,,,,,660643601,213239347
2,,6095,2524022097,UNINCORPORATED,,,,,,,,248051559,137069087
3,,6095,2522022025,FAIRFIELD,,,,,,,,248051559,83681844
4,,6095,2523051142,UNINCORPORATED,,,,,,,,248051559,140602177


### Filter data

In [6]:
# Keep property records that have both a latitude and a longitude, a DPV code
# of D, S or Y, and a county FIPS code in the nine-code list below.
county_fips = ['06001', '06013', '06041', '06055', '06075', '06081', '06085', '06095', '06097']
has_coords = properties['PROP_LATITUDE'].notnull() & properties['PROP_LONGITUDE'].notnull()
dpv_ok = properties['DPV'].isin(['D', 'S', 'Y'])
in_counties = properties['PROP_FIPSCD'].isin(county_fips)
prop_w_geog = properties[has_coords & dpv_ok & in_counties]

### Merge w/ properties

#### Merge on origin address (`from_addrid` → `ADDRID`)

In [7]:
# Index the moves by origin address id so they can be joined to the property
# table on the index.
# NOTE(review): this rebinds `moves` to the re-indexed frame, so the cell is
# not safely re-runnable without re-running the load cell first — confirm.
moves = moves.set_index('from_addrid')

In [8]:
# Index the filtered properties by ADDRID, the key both address joins use.
prop_w_geog = prop_w_geog.set_index('ADDRID')

In [9]:
# Preview the filtered, ADDRID-indexed property table.
prop_w_geog.head()

Unnamed: 0_level_0,DPV,PROP_FIPSCD,PROP_CENSUSTRACT,PROP_MUNINAME,PROP_LATITUDE,PROP_LONGITUDE,PROP_OWNEROCC,PROP_QLTY,PROP_VALCALC,PROP_UNVBLDSQFT,PROP_BEDRMS,ADDRID2
ADDRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1838,Y,6081,6057002008,,37.5408,-122.347,O,,3927917.0,5670.0,5.0,1838
1840,Y,6081,6009001019,,37.7048,-122.48,A,,93198.0,1330.0,2.0,20814887
1843,Y,6075,113002003,,37.793109,-122.40902,O,,1159096.0,1225.0,2.0,1843
1845,Y,6075,131012005,,37.791416,-122.42425,O,QEX,776701.0,1500.0,2.0,1845
1846,Y,6075,214002003,,37.749214,-122.42833,O,,3774000.0,1250.0,,1846


In [10]:
# Index-on-index join of the moves (indexed by from_addrid) with the filtered
# properties (indexed by ADDRID). merge defaults to an inner join, so moves
# whose origin address has no matching property row are dropped.
merged_1 = moves.merge(prop_w_geog, left_index=True, right_index=True)

In [11]:
# Label the joined coordinates as belonging to the origin address, then run
# the dask graph; after this cell merged_1 is an in-memory pandas DataFrame.
origin_coord_names = {'PROP_LATITUDE': 'from_lat', 'PROP_LONGITUDE': 'from_lon'}
with ProgressBar():
    merged_1 = merged_1.rename(columns=origin_coord_names).compute()

[########################################] | 100% Completed | 54.6s


In [12]:
# Move the join key out of the index and back into a regular column. The
# rename only has an effect if the index came back unnamed, in which case
# reset_index calls the new column 'index'.
moves_w_from_geog = merged_1.reset_index().rename(columns={'index': 'from_addrid'})

In [13]:
# Preview the moves with origin-property attributes attached.
moves_w_from_geog.head()

Unnamed: 0,from_addrid,pid,from_effdate,to_addrid,to_effdate,DPV,PROP_FIPSCD,PROP_CENSUSTRACT,PROP_MUNINAME,from_lat,from_lon,PROP_OWNEROCC,PROP_QLTY,PROP_VALCALC,PROP_UNVBLDSQFT,PROP_BEDRMS,ADDRID2
0,1838.0,Y39394800924986,200212.0,201439665.0,200509.0,Y,6081,6057002008,,37.5408,-122.347,O,,3927917.0,5670.0,5.0,1838
1,1840.0,Y39394431814516,198007.0,20814887.0,199912.0,Y,6081,6009001019,,37.7048,-122.48,A,,93198.0,1330.0,2.0,20814887
2,1840.0,Y39394830916146,199606.0,84669265.0,200002.0,Y,6081,6009001019,,37.7048,-122.48,A,,93198.0,1330.0,2.0,20814887
3,1843.0,Y39394615847176,199901.0,56523799.0,200004.0,Y,6075,113002003,,37.793109,-122.40902,O,,1159096.0,1225.0,2.0,1843
4,1845.0,Y39394668254684,198007.0,71161461.0,198503.0,Y,6075,131012005,,37.791416,-122.42425,O,QEX,776701.0,1500.0,2.0,1845


In [14]:
# Number of move rows whose origin address matched a property record.
len(moves_w_from_geog)

5769536

#### Merge on destination address (`to_addrid` → `ADDRID`)

In [15]:
# Re-index the (now pandas) origin-matched moves by destination address id,
# ready for the second index-on-index join against the property table.
moves_w_from_geog = moves_w_from_geog.set_index('to_addrid')

In [16]:
# Join the properties (left, indexed by ADDRID) to the origin-matched moves
# (right, indexed by to_addrid). Column names present on both sides get the
# '_to' suffix for the property (left) side and '_from' for the moves (right)
# side, whose property columns came from the origin join.
merged_2 = prop_w_geog.merge(moves_w_from_geog, left_index=True, right_index=True, suffixes=('_to', '_from'))

In [17]:
# Label the newly joined coordinates as belonging to the destination address,
# then run the dask graph and collect the result into memory.
dest_coord_names = {'PROP_LATITUDE': 'to_lat', 'PROP_LONGITUDE': 'to_lon'}
with ProgressBar():
    merged_2 = merged_2.rename(columns=dest_coord_names).compute()

[########################################] | 100% Completed |  1min 21.4s


In [18]:
# Move the join key out of the index and back into a regular column. The
# rename only has an effect if the index came back unnamed, in which case
# reset_index calls the new column 'index'.
moves_w_geog = merged_2.reset_index().rename(columns={'index': 'to_addrid'})

In [19]:
# Preview the moves with both origin and destination property attributes.
moves_w_geog.head()

Unnamed: 0,to_addrid,DPV_to,PROP_FIPSCD_to,PROP_CENSUSTRACT_to,PROP_MUNINAME_to,to_lat,to_lon,PROP_OWNEROCC_to,PROP_QLTY_to,PROP_VALCALC_to,...,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,from_lat,from_lon,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,PROP_UNVBLDSQFT_from,PROP_BEDRMS_from,ADDRID2_from
0,1838,Y,6081,6057002008,,37.5408,-122.347,O,,3927917.0,...,6057004012,,37.5563,-122.353,O,,4767552.0,5290.0,4.0,70211557
1,1838,Y,6081,6057002008,,37.5408,-122.347,O,,3927917.0,...,6028003002,,37.6557,-122.485,O,,439898.0,2790.0,3.0,201439665
2,1840,Y,6081,6009001019,,37.7048,-122.48,A,,93198.0,...,5113021024,PALO ALTO CITY,37.4452,-122.165,A,QAV,409681.0,1307.0,4.0,138608104
3,1851,Y,6075,216001001,,37.747154,-122.43502,A,QAV,407914.0,...,4046002008,OAKLAND INCORP,37.8277,-122.187,O,QAV,630299.0,3047.0,4.0,222110933
4,1852,Y,6075,307002001,,37.735242,-122.44147,O,,699320.0,...,1522023002,,38.464289,-122.67699,O,QGO,361090.0,2136.0,4.0,145806123


#### Drop duplicates

In [20]:
# Keep one row per (person, origin address, destination address); with the
# default keep='first', the first occurrence of each combination is retained.
uniq_moves_w_geog = moves_w_geog.drop_duplicates(['pid', 'from_addrid', 'to_addrid'])

In [21]:
# Report how many relocation records kept both endpoints after the two
# property joins. `moves` is still a dask frame here, so len() triggers a
# computation over it.
num_total_moves = len(moves)
num_matched_moves = len(uniq_moves_w_geog)
pct_matched_moves = round(100 * num_matched_moves / num_total_moves, 1)
geog_match_msg = (
    'Matched both properties to property records with geographies '
    'for {0} of {1} ({2}%) of relocation records.'
)
print(geog_match_msg.format(num_matched_moves, num_total_moves, pct_matched_moves))

Matched both properties to property records with geographies for 3641224 of 11221531 (32.4%) of relocation records.


#### Compute move distances

In [22]:
# Narrow the frame to the person id plus matching sets of origin ("from") and
# destination ("to") columns, in that order.
origin_cols = [
    'from_addrid', 'from_lat', 'from_lon', 'PROP_FIPSCD_from', 'PROP_CENSUSTRACT_from',
    'PROP_MUNINAME_from', 'PROP_OWNEROCC_from', 'PROP_QLTY_from', 'PROP_VALCALC_from',
    'PROP_UNVBLDSQFT_from', 'PROP_BEDRMS_from', 'from_effdate',
]
dest_cols = [
    'to_addrid', 'to_lat', 'to_lon', 'PROP_FIPSCD_to', 'PROP_CENSUSTRACT_to',
    'PROP_MUNINAME_to', 'PROP_OWNEROCC_to', 'PROP_QLTY_to', 'PROP_VALCALC_to',
    'PROP_UNVBLDSQFT_to', 'PROP_BEDRMS_to', 'to_effdate',
]
uniq_moves_w_geog2 = uniq_moves_w_geog[['pid'] + origin_cols + dest_cols]

In [23]:
def get_dist(df, src_crs=None, dst_epsg='2768'):
    """Return a copy of ``df`` with a ``distance`` column appended.

    For every row, one point is built from (from_lon, from_lat) and another
    from (to_lon, to_lat). Both sets of points are declared to be in
    ``src_crs``, reprojected to ``dst_epsg``, and the row-wise distance between
    them is stored in ``distance``, in the units of the projected CRS.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns from_lon, from_lat, to_lon and to_lat.
    src_crs : optional
        CRS of the input coordinates, in any form accepted by
        ``geopandas.GeoSeries(crs=...)``. Defaults to ``{'init': 'epsg:4326'}``.
    dst_epsg : optional
        EPSG code of the CRS in which distances are measured.
        Defaults to '2768'.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` in their original order followed by
        ``distance`` (an existing ``distance`` column is overwritten in
        place). ``df`` itself is not modified.
    """
    if src_crs is None:
        # NOTE(review): the {'init': ...} dict form is deprecated in newer
        # pyproj/geopandas releases; kept as the default so results match the
        # original run. Confirm the installed versions before switching to
        # 'EPSG:4326'.
        src_crs = {'init': 'epsg:4326'}

    def _projected_points(lon, lat):
        # One shapely Point per row, in (x=lon, y=lat) order, carrying df's
        # index so that distance() below pairs origin and destination by row.
        points = [Point(xy) for xy in zip(lon, lat)]
        return gpd.GeoSeries(points, index=df.index, crs=src_crs).to_crs(epsg=dst_epsg)

    # NOTE(review): EPSG:2768 looks like a California state-plane zone in
    # metres — confirm it is appropriate for the study area.
    from_gs = _projected_points(df.from_lon, df.from_lat)
    to_gs = _projected_points(df.to_lon, df.to_lat)

    # assign() returns a new frame, so no temporary geometry columns are ever
    # added to (and then stripped from) a copy of the partition.
    return df.assign(distance=from_gs.distance(to_gs))

In [24]:
%%time
# Split the in-memory frame into dask partitions so that get_dist can be
# applied one partition at a time.
# NOTE(review): 10000 partitions over ~3.6M rows is only a few hundred rows
# per partition, and the distance step below took ~50 minutes — worth checking
# whether far fewer partitions would be faster.
ddf = dd.from_pandas(
    uniq_moves_w_geog2,
    npartitions=10000)

CPU times: user 14.7 s, sys: 966 ms, total: 15.6 s
Wall time: 15.4 s


In [25]:
# Preview the partitioned frame before distances are added.
ddf.head()

Unnamed: 0,pid,from_addrid,from_lat,from_lon,PROP_FIPSCD_from,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,...,to_lon,PROP_FIPSCD_to,PROP_CENSUSTRACT_to,PROP_MUNINAME_to,PROP_OWNEROCC_to,PROP_QLTY_to,PROP_VALCALC_to,PROP_UNVBLDSQFT_to,PROP_BEDRMS_to,to_effdate
0,Y39394867335546,70211557.0,37.5563,-122.353,6081,6057004012,,O,,4767552.0,...,-122.347,6081,6057002008,,O,,3927917.0,5670.0,5.0,200002.0
1,Y39394800924986,201439665.0,37.6557,-122.485,6081,6028003002,,O,,439898.0,...,-122.347,6081,6057002008,,O,,3927917.0,5670.0,5.0,201003.0
2,Y39394610202108,145654720.0,37.4452,-122.165,6085,5113021024,PALO ALTO CITY,A,QAV,409681.0,...,-122.48,6081,6009001019,,A,,93198.0,1330.0,2.0,201003.0
3,Y39394843362955,222110933.0,37.8277,-122.187,6001,4046002008,OAKLAND INCORP,O,QAV,630299.0,...,-122.43502,6075,216001001,,A,QAV,407914.0,1145.0,2.0,200204.0
4,Y39394415231837,145806123.0,38.464289,-122.67699,6097,1522023002,,O,QGO,361090.0,...,-122.44147,6075,307002001,,O,,699320.0,2642.0,3.0,199207.0


In [26]:
# Describe get_dist's output for dask: every input column keeps its dtype
# (recorded by name), and a float 'distance' column is appended at the end.
dtypes = {column: dtype.name for column, dtype in uniq_moves_w_geog2.dtypes.items()}
dtypes['distance'] = 'float'

In [27]:
# Schedule get_dist on every partition. `dtypes` is passed as meta so dask
# knows the output columns and dtypes; the work itself runs at compute().
ddf = ddf.map_partitions(get_dist, meta=dtypes)

In [28]:
# Run the per-partition distance computation and gather the result into a
# single in-memory pandas DataFrame, showing progress while it runs.
with ProgressBar():
    moves_w_dists = ddf.compute()

[########################################] | 100% Completed | 50min 40.9s


In [29]:
# Preview the moves with the new 'distance' column.
moves_w_dists.head()

Unnamed: 0,pid,from_addrid,from_lat,from_lon,PROP_FIPSCD_from,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,...,PROP_FIPSCD_to,PROP_CENSUSTRACT_to,PROP_MUNINAME_to,PROP_OWNEROCC_to,PROP_QLTY_to,PROP_VALCALC_to,PROP_UNVBLDSQFT_to,PROP_BEDRMS_to,to_effdate,distance
0,Y39394867335546,70211557.0,37.5563,-122.353,6081,6057004012,,O,,4767552.0,...,6081,6057002008,,O,,3927917.0,5670.0,5.0,200002.0,1800.050142
1,Y39394800924986,201439665.0,37.6557,-122.485,6081,6028003002,,O,,439898.0,...,6081,6057002008,,O,,3927917.0,5670.0,5.0,201003.0,17638.119062
2,Y39394610202108,145654720.0,37.4452,-122.165,6085,5113021024,PALO ALTO CITY,A,QAV,409681.0,...,6081,6009001019,,A,,93198.0,1330.0,2.0,201003.0,40053.03682
3,Y39394843362955,222110933.0,37.8277,-122.187,6001,4046002008,OAKLAND INCORP,O,QAV,630299.0,...,6075,216001001,,A,QAV,407914.0,1145.0,2.0,200204.0,23603.668469
4,Y39394415231837,145806123.0,38.464289,-122.67699,6097,1522023002,,O,QGO,361090.0,...,6075,307002001,,O,,699320.0,2642.0,3.0,199207.0,83514.351216


In [30]:
# Sanity check: row count should equal the de-duplicated move count, and the
# column count should be the 25 selected columns plus 'distance'.
moves_w_dists.shape

(3641224, 26)

### Merge demographics

In [31]:
%%time
# Put the distance-annotated moves back into dask (38 partitions) and index
# them by person id, ready for the index-on-index demographics join.
ddf = dd.from_pandas(moves_w_dists, npartitions=38)
ddf = ddf.set_index('pid')

CPU times: user 26.2 s, sys: 0 ns, total: 26.2 s
Wall time: 26 s


In [32]:
# Preview the pid-indexed moves.
ddf.head()

Unnamed: 0_level_0,from_addrid,from_lat,from_lon,PROP_FIPSCD_from,PROP_CENSUSTRACT_from,PROP_MUNINAME_from,PROP_OWNEROCC_from,PROP_QLTY_from,PROP_VALCALC_from,PROP_UNVBLDSQFT_from,...,PROP_FIPSCD_to,PROP_CENSUSTRACT_to,PROP_MUNINAME_to,PROP_OWNEROCC_to,PROP_QLTY_to,PROP_VALCALC_to,PROP_UNVBLDSQFT_to,PROP_BEDRMS_to,to_effdate,distance
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y39394000021058,213725788.0,37.8971,-122.534,6041,1261002011,MILL VALLEY,O,QGO,784444.0,1300.0,...,6041,1241001017,MILL VALLEY SCHOOL AREA ALTO,,QGO,592507.0,2890.0,5.0,201602.0,2039.66458
Y39394000021058,65593649.0,37.8882,-122.545,6041,1282002004,MILL VALLEY SCHOOL AREA TAMAL,A,QGO,612199.0,2297.0,...,6041,1261002011,MILL VALLEY,O,QGO,784444.0,1300.0,2.0,201303.0,1382.667421
Y39394000054683,117484545.0,37.9001,-122.274,6001,4212004007,BERKELEY INCORP,A,QAV,85981.0,1466.0,...,6001,4212001004,BERKELEY INCORP,O,QAV,811687.0,1078.0,2.0,201304.0,441.693361
Y39394000108630,65353580.0,37.775108,-122.48973,6075,478013005,,,,130942.0,3400.0,...,6001,4014002010,OAKLAND INCORP,A,QAV,326197.0,1269.0,2.0,201808.0,19625.242557
Y39394000314991,186863857.0,37.774,-122.26,6001,4279004000,ALAMEDA INCORP,A,QAV,309321.0,1621.0,...,6001,4276002010,ALAMEDA INCORP,O,QAV,205785.0,1128.0,2.0,201706.0,1601.150776


In [33]:
demog_cols = ['PID', 'AGE', 'LOR', 'HOMEOWNERCD', 'EHI', 'PCTB', 'PCTW', 'PCTA', 'PCTH']

# Every selected column is read as a string except AGE and LOR, which are
# parsed as floats.
demog_dtypes = dict.fromkeys(demog_cols, str)
demog_dtypes.update({'AGE': float, 'LOR': float})

# Load the demographic files with dask and index them by person id so they
# can be joined to the pid-indexed moves.
demog = dd.read_csv(
    '/home/data/infutor/NARC3/bay_area_mover*.csv',
    usecols=demog_cols,
    assume_missing=True,
    dtype=demog_dtypes,
)
demog = demog.set_index('PID')

In [34]:
# Index-on-index join of moves (indexed by pid) with demographics (indexed by
# PID). merge defaults to an inner join, so moves without a demographic
# record are dropped.
# NOTE(review): if PID is not unique in `demog`, a move would be repeated once
# per matching demographic row and inflate the counts below — confirm.
moves_w_demog = ddf.merge(demog, left_index=True, right_index=True)

In [35]:
# Run the demographics join and collect the result into memory; after this
# cell moves_w_demog is a pandas DataFrame rather than a dask one.
with ProgressBar():
    moves_w_demog = moves_w_demog.compute()

[########################################] | 100% Completed |  1min 54.3s


In [36]:
# Row counts before and after the demographics join. num_total_moves is
# rebound here: it now counts the geography-matched moves, not all moves.
num_total_moves = len(moves_w_dists)
num_moves_w_demog = len(moves_w_demog)

In [37]:
# Report the share of geography-matched moves that also found a demographic
# profile.
pct_moves_w_demog = round(100 * num_moves_w_demog / num_total_moves, 1)
demog_match_msg = (
    'Matched movers to demographic profiles '
    'for {0} of {1} ({2}%) of relocation records.'
)
print(demog_match_msg.format(num_moves_w_demog, num_total_moves, pct_moves_w_demog))

Matched movers to demographic profiles for 2212813 of 3641224 (60.8%) of relocation records.


### Save outputs

In [38]:
# Turn the person-id index into an ordinary 'pid' column so that it is kept
# when the frame is written out with index=False.
# The step is guarded and non-inplace so the cell can be re-run safely: once
# 'pid' is already a column, naming the index 'pid' and resetting it again
# would raise because the column already exists.
if 'pid' not in moves_w_demog.columns:
    moves_w_demog = moves_w_demog.rename_axis('pid').reset_index()

In [39]:
# Write the final table (moves + distances + demographics) to CSV. The index
# is omitted; the person id is expected to be a regular column by this point.
moves_w_demog.to_csv('../data/moves_w_dists.csv', index=False)