In [1]:
import pandas as pd
import glob
import simplejson as json
import multiprocessing as mp

In [2]:
def parse_dcbgs(row):
    """Expand one row's ``destination_cbgs`` JSON object into long format.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'destination_cbgs'`` (a JSON object encoded as a
        string), ``'origin_census_block_group'`` and ``'date'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['date', 'origin_cbg', 'destination_cbg', 'n']`` with one
        row per key of the JSON object: ``destination_cbg`` is the key and
        ``n`` the value stored under it. An empty JSON object yields an empty
        frame with the same four columns instead of raising.
    """
    visits = json.loads(row['destination_cbgs'])
    # Build both columns directly so the frame keeps its shape even when the
    # JSON object has no keys (the from_dict/reset_index route yields a single
    # column in that case and the rename to two columns fails).
    temp = pd.DataFrame({'destination_cbg': list(visits.keys()),
                         'n': list(visits.values())})
    if temp.empty:
        # Keep ``n`` numeric so concatenating with non-empty frames does not
        # degrade the column to object dtype.
        temp = temp.astype({'n': 'int64'})
    temp['origin_cbg'] = row['origin_census_block_group']
    temp['date'] = row['date']
    return temp[['date', 'origin_cbg', 'destination_cbg', 'n']]

In [5]:
def _bucket_total(json_col, buckets):
    """Per-row sum of the given keys of a column of JSON-object strings.

    Keys that are missing from a row (or from every row) count as 0.
    """
    parsed = pd.DataFrame.from_records(json_col.map(json.loads))
    # reindex() guarantees every requested bucket exists as a column, so a
    # bucket that never occurs in this file does not raise KeyError.
    return parsed.reindex(columns = buckets).fillna(0).sum(axis = 1)


def mapper(fname, dyad_dir = '/pool001/mfzhao/safegraph/dyadic2019/'):
    """Aggregate one CSV file of CBG-level rows to county level.

    Side effect: writes the county-to-county dyad counts for the file to
    ``<dyad_dir>/<date>.csv``, where ``<date>`` is the first ten characters of
    the first row's ``date_range_start``.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read. The columns used are
        ``origin_census_block_group``, ``date_range_start``,
        ``destination_cbgs``, ``device_count``,
        ``completely_home_device_count``, ``bucketed_away_from_home_time``,
        ``bucketed_distance_traveled``, ``bucketed_percentage_time_home``,
        ``mean_home_dwell_time``, ``mean_non_home_dwell_time`` and
        ``mean_distance_traveled_from_home``.
    dyad_dir : str, optional
        Directory that receives the per-date dyad CSV.

    Returns
    -------
    pandas.DataFrame
        One row per (date, origin_county) with ``device_count`` and these
        ratios, each divided by the summed ``device_count``:
        ``pchd`` (completely_home_device_count), ``mcbgv`` (sum of all
        destination counts), ``plt1hafh`` (away-from-home buckets '<20',
        '21-45', '46-60'), ``ppthgt75`` (percentage-time-home bucket
        '76-100'), ``plt2kmt`` (distance buckets '0', '1-1000', '1001-2000'),
        and the device-weighted means ``mhdt``, ``mnhdt``, ``mdtfh``.

    Raises
    ------
    ValueError
        If the file contains no rows.
    """
    import os  # local import: not provided by the notebook's import cell

    df = pd.read_csv(fname, dtype = {'origin_census_block_group': object})
    if df.empty:
        raise ValueError('no rows in ' + str(fname))
    # The county id is the first five characters of the block-group id.
    df['origin_county'] = df.origin_census_block_group.str.slice(0, 5)
    df['date'] = df.date_range_start.str.slice(0, 10)
    # NOTE(review): assumes every row of a file shares one date -- confirm.
    date = df['date'].iloc[0]

    dyads = pd.concat([parse_dcbgs(row) for _, row in df.iterrows()])
    dyads['origin_county'] = dyads.origin_cbg.str.slice(0, 5)
    dyads['destination_county'] = dyads.destination_cbg.str.slice(0, 5)
    # Select 'n' explicitly: with numeric_only=False (the pandas >= 2.0
    # default) a bare .sum() would also "sum" (concatenate) the string id
    # columns instead of dropping them.
    ncbgv = dyads.groupby(['date', 'origin_county'])[['n']].sum()
    ncbgv.columns = ['ncbgv']

    dyads = (dyads.groupby(['date', 'origin_county', 'destination_county'])[['n']]
                  .sum()
                  .reset_index())
    dyads.to_csv(os.path.join(dyad_dir, date + '.csv'), index = False)

    df['nlt1hafh'] = _bucket_total(df.bucketed_away_from_home_time,
                                   ['<20', '21-45', '46-60'])
    df['nlt2kmt'] = _bucket_total(df.bucketed_distance_traveled,
                                  ['0', '1-1000', '1001-2000'])
    df['npthgt75'] = _bucket_total(df.bucketed_percentage_time_home, ['76-100'])

    # Turn per-CBG means back into totals so they can be summed per county
    # and re-averaged with device_count as the weight.
    df['total_home_dwell_time'] = df.device_count * df.mean_home_dwell_time
    df['total_non_home_dwell_time'] = df.device_count * df.mean_non_home_dwell_time
    df['total_distance_traveled_from_home'] = df.device_count * df.mean_distance_traveled_from_home

    df = df[['date', 'origin_county', 'device_count', 'completely_home_device_count', 'nlt1hafh', 'npthgt75',
             'nlt2kmt', 'total_home_dwell_time', 'total_non_home_dwell_time', 'total_distance_traveled_from_home']]
    df = df.groupby(['date', 'origin_county']).sum()
    df = pd.merge(df, ncbgv, left_index = True, right_index = True)

    df['mcbgv'] = df.ncbgv / df.device_count
    df['pchd'] = df.completely_home_device_count / df.device_count
    df['plt1hafh'] = df.nlt1hafh / df.device_count
    df['ppthgt75'] = df.npthgt75 / df.device_count
    df['plt2kmt'] = df.nlt2kmt / df.device_count
    df['mhdt'] = df.total_home_dwell_time / df.device_count
    df['mnhdt'] = df.total_non_home_dwell_time / df.device_count
    df['mdtfh'] = df.total_distance_traveled_from_home / df.device_count

    df = df.reset_index()
    return df[['date', 'origin_county', 'device_count', 'pchd', 'mcbgv', 'plt1hafh', 'ppthgt75', 'plt2kmt',
               'mhdt', 'mnhdt', 'mdtfh']]

In [None]:
# Aggregate every matched daily file to county level in parallel and write
# the combined table.
flist = sorted(glob.glob('/nfs/sloanlab001/data/SafeGraph/social-distancing/v2/2019/0[1234567]/*/*'))
# The context manager shuts the worker processes down once map() has
# returned, so they do not linger in the kernel after this cell finishes.
with mp.Pool(processes = 24) as pool:
    df = pd.concat(pool.map(mapper, flist))
# NOTE(review): unlike the other to_csv calls in this notebook, the index is
# written here (it repeats per input file after concat) -- confirm whether
# downstream readers rely on that extra column before dropping it.
df.to_csv('/pool001/mfzhao/safegraph/us_mobility_2019.csv')

In [None]:
# Stack the per-date dyad files from the dyadic2019 directory into one CSV.
flist = sorted(glob.glob('/pool001/mfzhao/safegraph/dyadic2019/*'))
dyads = pd.concat([pd.read_csv(f) for f in flist])
# The inputs are the 2019 per-day files, so the combined file is named 2019;
# the previous name ('dyadic2020.csv') mislabelled the data and would
# overwrite a 2020 file of the same name.
dyads.to_csv('/pool001/mfzhao/safegraph/dyadic2019.csv', index = False)