In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get
import numpy as np
import pyarrow
import time

### Clean up movers

In [7]:
# Lazily read every Bay Area movers CSV shard into a single Dask DataFrame.
# The ten county_seq_* columns are forced to str; assume_missing=True is passed
# so that (per the Dask read_csv docs) integer columns not listed in dtype are
# read as floats and can hold missing values.
# NOTE(review): this is an absolute, machine-specific path, while the other
# cells in this notebook use the relative '../data' directory.
movers = dd.read_csv(
    '/home/ubuntu/projects/infutor_wip/data/bay_area_movers*.csv',
    dtype=dict.fromkeys(('county_seq_{}'.format(n) for n in range(1, 11)), str),
    assume_missing=True,
)

In [8]:
def _missing_effdate_mask(frame, n_slots=10):
    """Flag rows that have an address ID without a matching effective date.

    For each slot i in 1..n_slots a row is flagged when ``addrid_seq_i`` is
    present but ``effdate_seq_i`` is missing. The per-slot flags are OR-ed
    together from slot 1 upward, so one bad slot marks the whole row.
    """
    mask = None
    for slot in range(1, n_slots + 1):
        slot_flag = (
            frame['addrid_seq_{}'.format(slot)].notnull()
            & frame['effdate_seq_{}'.format(slot)].isna()
        )
        mask = slot_flag if mask is None else (mask | slot_flag)
    return mask


movers['not_valid'] = _missing_effdate_mask(movers)

In [9]:
# Execute the lazy Dask graph (CSV read + not_valid flag) and collect the
# result into memory; ProgressBar reports task progress while it runs.
with ProgressBar():
    validated_movers = movers.compute()

[########################################] | 100% Completed | 45.3s


In [10]:
# Keep only the records that were not flagged as invalid.
cleaned_movers = validated_movers.loc[~validated_movers['not_valid']]

In [11]:
# Persist the cleaned wide table; the wide-to-long stage reloads it from this path.
cleaned_movers.to_parquet('../data/cleaned_movers.parquet', engine='pyarrow')

### Process movers wide to long

In [3]:
# Reload the cleaned movers as a Dask DataFrame (rebinds cleaned_movers).
cleaned_movers = dd.read_parquet('../data/cleaned_movers.parquet', engine='pyarrow')

In [4]:
# Split the table into 10,000 partitions; process_df is later mapped over each one.
# NOTE(review): the partition count is a hard-coded magic number — confirm it is
# appropriate for the data size.
cleaned_movers = cleaned_movers.repartition(npartitions=10000)

In [5]:
def process_df(df, n_seq=10):
    """Reshape one wide movers frame into long "from -> to" move records.

    For every consecutive pair of sequence slots ``(x, x + 1)`` with ``x`` in
    ``1 .. n_seq - 1``, the address ID, effective date and county columns of
    both slots are extracted together with ``pid_a``. The per-pair frames are
    stacked vertically, so each input row yields ``n_seq - 1`` output rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame containing ``pid_a`` plus ``addrid_seq_i``,
        ``effdate_seq_i`` and ``county_seq_i`` for every ``i`` in ``1..n_seq``.
        It is not modified.
    n_seq : int, default 10
        Number of sequence slots present in ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid, from_addrid, to_addrid, from_effdate, to_effdate,
        from_county, to_county, seq``, where ``seq`` is the slot number of the
        "from" side. Rows keep their original index labels and are grouped by
        ascending ``seq``.

    Raises
    ------
    KeyError
        If any of the expected wide columns is missing from ``df``.
    """
    out_cols = ['pid', 'from_addrid', 'to_addrid', 'from_effdate', 'to_effdate', 'from_county', 'to_county', 'seq']

    pieces = []
    for x in range(1, n_seq):
        wide_cols = [
            'pid_a',
            'addrid_seq_{}'.format(x), 'addrid_seq_{}'.format(x + 1),
            'effdate_seq_{}'.format(x), 'effdate_seq_{}'.format(x + 1),
            'county_seq_{}'.format(x), 'county_seq_{}'.format(x + 1),
        ]
        # assign() returns a new frame, so the caller's df is never mutated.
        piece = df[wide_cols].assign(seq=x)
        # Positional rename: wide_cols + 'seq' line up one-to-one with out_cols.
        piece.columns = out_cols
        pieces.append(piece)

    if not pieces:
        # n_seq < 2 leaves no slot pairs; pd.concat would raise on an empty list.
        return pd.DataFrame(columns=out_cols)

    # Concatenate once instead of growing a frame inside the loop; this avoids
    # quadratic copying and seeding the concat with an empty placeholder frame.
    return pd.concat(pieces)

In [6]:
# Column layout of the long-format moves table; used below to build the
# meta frame passed to map_partitions.
out_cols = [
    'pid',
    'from_addrid', 'to_addrid',
    'from_effdate', 'to_effdate',
    'from_county', 'to_county',
    'seq',
]

In [7]:
# Lazily apply process_df to every partition; meta supplies the output schema
# (an empty frame with the out_cols columns) up front.
# NOTE(review): meta declares every column with dtype=str, but process_df writes
# an integer into 'seq' — confirm the declared dtypes match the real output.
long_movers = cleaned_movers.map_partitions(process_df, meta=pd.DataFrame(columns=out_cols, dtype=str))

In [None]:
# Run the wide-to-long reshape over all partitions and collect the result
# into memory; ProgressBar reports task progress while it runs.
with ProgressBar():
    all_moves = long_movers.compute()

[###################                     ] | 48% Completed |  7min 21.8s

In [9]:
# Sanity check: process_df emits one row per consecutive slot pair (1->2 ... 9->10),
# so the long table should have exactly 9 rows per input record.
len(all_moves) == len(cleaned_movers) * 9

True

### Drop rows without full to/from data

In [10]:
# Wrap the in-memory result back into a Dask DataFrame with 10,000 partitions.
# Note: this rebinds all_moves from the computed result to a lazy Dask object.
all_moves = dd.from_pandas(all_moves, npartitions=10000)

In [11]:
# Keep only moves that have both a "from" and a "to" effective date.
# NOTE(review): only the effdate columns are checked here; address IDs and
# counties may still be missing — confirm that is intended.
moves_not_null = all_moves.dropna(subset=['from_effdate', 'to_effdate'])

In [12]:
# Execute the null filter and collect the surviving rows into memory
# (rebinds moves_not_null from the lazy Dask object to the computed result).
with ProgressBar():
    moves_not_null = moves_not_null.compute()

[########################################] | 100% Completed | 36.0s


In [13]:
# Row count after dropping moves with a missing from/to effective date.
len(moves_not_null)

16713668

### Drop rows where move is between the same address ID

In [14]:
# Discard records whose origin and destination address IDs are identical,
# i.e. rows that do not represent an actual change of address.
moves_not_dupe = moves_not_null.loc[
    moves_not_null['from_addrid'].ne(moves_not_null['to_addrid'])
]

In [15]:
# Row count after removing same-address "moves".
len(moves_not_dupe)

14768635

### Save results

In [16]:
# Write the final long-format moves table to Parquet.
moves_not_dupe.to_parquet('../data/moves_long.parquet', engine='pyarrow')