## Grab SafeGraph data from AWS 

#### code from ryan@safegraph.com

In [1]:
from bs4 import BeautifulSoup as bs
from bs4 import Tag, NavigableString
import re
import requests as r
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import os
import sys
from state_cleaner import *
from selenium import webdriver
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
os.chdir('../..')
os.chdir('CovidMobile')
os.getcwd()

'/Users/samismalling/Documents/mobility-report-data-extractor-master/CovidMobile'

In [10]:
#Configure the aws profile **check the slack for new access keys weekly!
#! aws configure --profile safegraph

/Users/samismalling/Documents/mobility-report-data-extractor-master/CovidMobile/notebooks


In [None]:
#Downloads all current available data social distancing data: *takes a while to run
#! aws s3 sync s3://sg-c19-response/social-distancing/v2/ /Users/samismalling/Documents/mobility-report-data-extractor-master/CovidMobile/SafeGraph_data --profile safegraph

In [3]:
full_df = pd.read_csv('./data/compiled_2020-05-16.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
print(full_df['date'].min(), full_df['date'].max())

2020-02-15 2020-05-09


In [8]:
#Create list of dates to loop through
dates_obj = list(pd.date_range(start=full_df['date'].min(),end=full_df['date'].max()))
dates_slashes = [x.strftime("%Y/%m/%d") for x in dates_obj]
dates_dashes = [x.strftime("%Y-%m-%d") for x in dates_obj]

d_s = dates_slashes[round(len(dates_slashes)/2):]
d_d = dates_dashes[round(len(dates_dashes)/2):]

In [6]:
sdm_df = pd.DataFrame()

In [9]:
#Read in all of the csvs for each date:
for date1, date2 in zip(d_s, d_d):
    df = pd.read_csv("/Users/samismalling/Documents/SafeGraph_data/{}/{}-social-distancing.csv.gz".format(date1,date2))
    df = df[["origin_census_block_group", "date_range_start", "date_range_end", 
                                 "device_count", "completely_home_device_count", "part_time_work_behavior_devices", 
                                 "full_time_work_behavior_devices"]]
    
    df = df.astype({'origin_census_block_group':'str'})
    df['FIPS'] = df.origin_census_block_group.str.slice(start=0, stop=5) # county is the first 5 digits of the CBG
    df['date'] = df.date_range_start.str.slice(start=0, stop=10)
    
    sdm_columns = ['device_count', 'completely_home_device_count','part_time_work_behavior_devices', 'full_time_work_behavior_devices']
    df_by_county = df.groupby(['FIPS', 'date'])[sdm_columns].sum().reset_index()
    
    sdm_df = sdm_df.append(df_by_county)
    print(date1)

2020/03/28
2020/03/29
2020/03/30
2020/03/31
2020/04/01
2020/04/02
2020/04/03
2020/04/04
2020/04/05
2020/04/06
2020/04/07
2020/04/08
2020/04/09
2020/04/10
2020/04/11
2020/04/12
2020/04/13
2020/04/14
2020/04/15
2020/04/16
2020/04/17
2020/04/18
2020/04/19
2020/04/20
2020/04/21
2020/04/22
2020/04/23
2020/04/24
2020/04/25
2020/04/26
2020/04/27
2020/04/28
2020/04/29
2020/04/30
2020/05/01
2020/05/02
2020/05/03
2020/05/04
2020/05/05
2020/05/06
2020/05/07
2020/05/08
2020/05/09


In [10]:
# convert numerical columns
sdm_df= sdm_df.astype({'device_count':'int','completely_home_device_count':'int'})

In [11]:
sdm_df.head()

Unnamed: 0,FIPS,date,device_count,completely_home_device_count,part_time_work_behavior_devices,full_time_work_behavior_devices
0,10001,2020-02-15,12286,3445,676,196
1,10003,2020-02-15,31356,8501,1695,658
2,10005,2020-02-15,14123,3694,834,278
3,10010,2020-02-15,5708,1152,622,180
4,10030,2020-02-15,23838,5358,2407,805


In [12]:
# compute new metrics
sdm_df['leaving_home'] = sdm_df['device_count'] - sdm_df['completely_home_device_count']
sdm_df['pct_leaving_home'] = sdm_df['leaving_home'] / sdm_df['device_count'] 

sdm_df['pct_social_distancing'] = sdm_df['completely_home_device_count']/(sdm_df['device_count']-sdm_df['full_time_work_behavior_devices']-sdm_df['part_time_work_behavior_devices'])
  
sdm_df.head()

Unnamed: 0,FIPS,date,device_count,completely_home_device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,leaving_home,pct_leaving_home,pct_social_distancing
0,10001,2020-02-15,12286,3445,676,196,8841,0.7196,0.301822
1,10003,2020-02-15,31356,8501,1695,658,22855,0.728888,0.293108
2,10005,2020-02-15,14123,3694,834,278,10429,0.738441,0.283914
3,10010,2020-02-15,5708,1152,622,180,4556,0.798178,0.234815
4,10030,2020-02-15,23838,5358,2407,805,18480,0.775233,0.259769


In [15]:
sdm_df = sdm_df.sort_values(by='FIPS')

In [16]:
# Merge with the full data set:
sdm_df = sdm_df.astype({'FIPS':'float64'})
combined_df = full_df.merge(sdm_df,on = ['FIPS'])
combined_df

Unnamed: 0,country_region_code,country_region,state_x,county_x,date_x,retail,food_drugs,park,transit,work,...,first_case_date,_merge,date_y,device_count,completely_home_device_count,part_time_work_behavior_devices,full_time_work_behavior_devices,leaving_home,pct_leaving_home,pct_social_distancing
0,US,United States,Delaware,Kent County,2020-02-15,5.0,4.0,3.0,,0.0,...,,left_only,2020-02-15,12286,3445,676,196,8841,0.719600,0.301822
1,US,United States,Delaware,Kent County,2020-02-15,5.0,4.0,3.0,,0.0,...,,left_only,2020-02-27,12203,2692,1604,1020,9511,0.779399,0.281031
2,US,United States,Delaware,Kent County,2020-02-15,5.0,4.0,3.0,,0.0,...,,left_only,2020-04-27,10037,3854,552,343,6183,0.616021,0.421571
3,US,United States,Delaware,Kent County,2020-02-15,5.0,4.0,3.0,,0.0,...,,left_only,2020-02-28,11216,2522,1446,988,8694,0.775143,0.287178
4,US,United States,Delaware,Kent County,2020-02-15,5.0,4.0,3.0,,0.0,...,,left_only,2020-04-26,9880,4602,230,112,5278,0.534211,0.482491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179260,US,United States,Wyoming,Weston County,2020-05-08,,,,,-35.0,...,,left_only,2020-05-06,345,129,27,12,216,0.626087,0.421569
17179261,US,United States,Wyoming,Weston County,2020-05-08,,,,,-35.0,...,,left_only,2020-04-17,289,100,22,12,189,0.653979,0.392157
17179262,US,United States,Wyoming,Weston County,2020-05-08,,,,,-35.0,...,,left_only,2020-02-19,326,96,30,29,230,0.705521,0.359551
17179263,US,United States,Wyoming,Weston County,2020-05-08,,,,,-35.0,...,,left_only,2020-02-15,333,80,22,10,253,0.759760,0.265781


In [29]:
sdm_df.to_csv('/Users/samismalling/Documents/SafeGraph_data/sg_mobility_updated_{}.csv'.format(str(datetime.now().date())))

In [None]:
full_df.to_csv('/Users/samismalling/Documents/SafeGraph_data/compiled+sg_{}.csv'.format(str(datetime.now().date())))