# Step 0:
# Data Preprocessing
* 1, Road Network Preprocessing
* 2, Work Commute Data

## 0, Census

In [None]:
import geopandas as gpd

In [None]:
# Load the 2010 Census DP1 tract shapefile into a GeoDataFrame and preview it.
wcensus = gpd.read_file('Data/Tract_2010Census_DP1/Tract_2010Census_DP1.shp')
wcensus.head()

In [None]:
# Keep the tracts whose GEOID10 starts with state code '36', order them by
# GEOID10, and renumber the rows from zero.
ny = (
    wcensus.loc[wcensus['GEOID10'].str.startswith('36')]
    .sort_values(by='GEOID10')
    .reset_index(drop=True)
)
ny.head()

In [None]:
# Persist the state-level tract subset as its own shapefile.
ny.to_file('Data/NY_census_tract.shp')

In [None]:
# Load the Erie County tract shapefile and preview it.
# NOTE(review): this cell reads from 'data/' while the cells above use 'Data/';
# the two only coincide on a case-insensitive filesystem — confirm the folder name.
df = gpd.read_file('data/Erie_County_Census_Tract.shp')
df.head()

In [None]:
# Write the first five tract rows to a small CSV sample.
df.head(5).to_csv('test_census10.csv')

## 1, Road Networks

In [None]:
!pip install requests

In [None]:
!pip install wget

In [None]:
!pip install zipfile36

### 1.1 Download Road shp

In [None]:
import os
import requests

In [None]:
import wget

# Download one TIGER/Line 2020 roads archive per odd three-digit code from
# 001 through 123, each appended to the state prefix 36.
url = 'https://www2.census.gov/geo/tiger/TIGER2020/ROADS/'
for i in range(1, 124, 2):
    add = f"{url}tl_2020_36{i:03d}_roads.zip"
    wget.download(add)

### 1.2 Extract files from .zip

In [None]:
# Remember the current working directory and list what it contains.
loc = os.getcwd()
os.listdir(loc)

In [None]:
import os, zipfile

extension = ".zip"

# The original cell passed an undefined `dir_name` to extractall (its assignment
# was commented out), which raised NameError. Extract next to the archives in
# `loc`, which is where the "*.shp" glob in section 1.3 looks for shapefiles.
dir_name = loc

for item in os.listdir(loc):  # every entry of the download directory
    if item.endswith(extension):  # only the zip archives
        # Build the path from `loc` itself, so it stays correct even if the
        # working directory has changed since `loc` was captured.
        file_name = os.path.join(loc, item)
        # The context manager closes the archive even when extraction fails.
        with zipfile.ZipFile(file_name) as zip_ref:
            zip_ref.extractall(dir_name)
        os.remove(file_name)  # reached only after a successful extraction

### 1.3 Read in multiple .shp file and Combine Road

In [None]:
import glob
import os
# Collect every shapefile sitting in the current working directory.
allFiles = glob.glob("*.shp")

In [None]:
# Number of shapefiles found by the glob above.
len(allFiles)

In [None]:
# pandas is needed for pd.concat, but the notebook first imports it in a later
# cell; import it here so this cell runs on a fresh kernel.
import pandas as pd

# Read each shapefile and stack them into a single table.
list_ = []
for file_ in allFiles:
    # `index_col` / `header` are pandas.read_csv options with no meaning for a
    # shapefile reader, so they are no longer passed. A dedicated loop name
    # avoids overwriting the `df` created in the census section.
    county_roads = gpd.read_file(file_)
    print(county_roads.shape)
    list_.append(county_roads)
frame = pd.concat(list_, axis=0, ignore_index=True)

### 1.4 Clean Road

Three steps to clean and get the giant connected component from the road shapefile.

- Run GRASS `v.clean.advanced` tools `snap,break,rmdupl,rmsa` with tolerance values `0.0001,0.0,0.0,0.0`, save the result to `cleaned.shp`
- Run GRASS `v.net.components` tool (`weak` or `strong` does not matter since the network is undirected), save the result as `giant_component.csv`
- Using GeoPandas, combine the two files (shp and csv), keep only the roads in the giant component, and save the result as `gcc.shp`:

In [None]:
# IPython automagic: show the current working directory.
pwd 

In [None]:
# Move to the project root so the relative 'Data/...' paths used below resolve.
# The root can be overridden with the SPOP_PROJECT_ROOT environment variable;
# the fallback is the original machine-specific location, so existing setups
# keep working unchanged.
project_root = os.environ.get(
    'SPOP_PROJECT_ROOT',
    '/Users/richardjiang/Desktop/Spop_data/Create_Synthetic_Population-master/',
)
os.chdir(project_root)

In [None]:
#components = pd.read_csv('../nWMDmap2/giant_component.csv', usecols=[0])
#cleaned = gpd.read_file('../nWMDmap2/cleaned.shp')
import geopandas as gpd
import pandas as pd
#os.chdir(owd)

# Load the two GRASS outputs described above: the first column of the
# component CSV and the cleaned road shapefile.
components = pd.read_csv('Data/Road_Clean/gaint_component/giant_component.csv', usecols=[0])
cleaned = gpd.read_file('Data/Road_Clean/cleaned/cleaned.shp')

In [None]:
# Preview the component table.
components.head()

In [None]:
# Preview the cleaned road layer.
cleaned.head()

In [None]:
# List the attribute columns available on the cleaned road layer.
cleaned.columns

In [None]:
# Columns of the cleaned road layer carried forward into `roads`.
col_list = [
    'LINEARID',
    'MTFCC',
    'geometry',
]

In [None]:
# Keep the selected road columns and attach the component table by row index.
roads = cleaned[col_list].join(components)
roads.head()

In [None]:
# NOTE(review): the giant-component filter below is commented out, so every
# joined road row is written, not only the giant component — confirm this is intended.
#roads[roads.cat == 1610]
#roads = roads[roads.comp == 1610].drop('comp',axis=1)

# Save the joined road layer as a shapefile.
roads.to_file('Data/ny_road_cleaned.shp')

## 2, Work Commute Data

To get inter-tract commuting data at census-tract level:

- Download the datasets (6*2 = 12 files in total)
- Aggregate them at tract level (original data is at block level, i.e. more granular)
- Remove unincluded tracts


In [None]:
import pandas as pd
import geopandas as gpd
# Load the Erie County tract layer; its GEOID10 column is used below to pick the tracts of interest.
census = gpd.read_file('data/Erie_County_Census_Tract.shp')
census.head()

In [None]:
# Look up a single tract by its GEOID10.
census.loc[census['GEOID10'] == '36029001900']

In [None]:
# Build tract-level origin-destination pairs.
# Tract GEOID layout: state(2) + county(3) + tract(6), e.g. 09-001-030300.
# Load the main OD file, then keep only its first six columns.
work_home_in_ny = pd.read_csv('data/ny_od_main_JT00_2019.csv')
work_home_in_ny = work_home_in_ny.iloc[:, :6]
len(work_home_in_ny)

In [None]:
# Derive tract ids: the first 11 characters of each block geocode.
work_home_in_ny['work'] = work_home_in_ny['w_geocode'].astype(str).str.slice(0, 11)
work_home_in_ny['home'] = work_home_in_ny['h_geocode'].astype(str).str.slice(0, 11)
work_home_in_ny.head()

In [None]:
# Inspect the first fifty rows.
work_home_in_ny.head(50)

In [None]:
# Display the home geocode column.
work_home_in_ny.h_geocode

In [None]:
# Keep the rows whose work tract appears among the census GEOID10 values,
# then report how many remain.
od_in_ny = work_home_in_ny.loc[work_home_in_ny['work'].isin(census['GEOID10'])]
print(od_in_ny.shape[0])

In [None]:
# Sum of the S000 column over the filtered rows.
od_in_ny.S000.sum()

In [None]:
# Reduce to the tract pair plus S000, total S000 per (work, home) pair,
# and show the distinct totals that occur.
temp = od_in_ny[['work', 'home', 'S000']]
temp_group = temp.groupby(['work', 'home']).sum()
temp_group['S000'].unique()

In [None]:
# `temp_group` is the summed DataFrame indexed by (work, home), not a GroupBy
# object, so `get_group` does not exist on it. The tract ids are strings, so
# select on the first index level with a string key to list every home tract
# recorded for this work tract.
temp_group.loc['36029000110']

We are interested in these columns only, alongside the two geocode columns (the rest are dropped by keeping only the first six columns with `.iloc[:, 0:6]`):

- S000: Total number of jobs
- SA01: Number of jobs of workers age 29 or younger
- SA02: Number of jobs for workers age 30 to 54
- SA03: Number of jobs for workers age 55 or older

In [None]:
import pandas as pd

# Geocodes are identifiers, not numbers. Parsed as integers, any code that
# begins with '0' (state FIPS 01-09) loses its leading digit, and a later
# fixed-width slice such as `.str[:11]` would then cut the tract id at the
# wrong position. Read both geocode columns as text to keep them intact.
geocode_dtypes = {'w_geocode': str, 'h_geocode': str}

# Keep only the first six columns of each file.
out_county = pd.read_csv('data/ny_od_aux_JT00_2019.csv', dtype=geocode_dtypes).iloc[:,0:6]
in_county = pd.read_csv('data/ny_od_main_JT00_2019.csv', dtype=geocode_dtypes).iloc[:,0:6]

In [None]:
# Stack the two OD tables under a fresh 0..n-1 index, then derive tract ids
# from the first 11 characters of each block geocode.
work_home_in_out_ny = pd.concat([out_county, in_county], ignore_index=True)
work_home_in_out_ny['work'] = work_home_in_out_ny['w_geocode'].astype(str).str.slice(0, 11)
work_home_in_out_ny['home'] = work_home_in_out_ny['h_geocode'].astype(str).str.slice(0, 11)
work_home_in_out_ny.head()

In [None]:
# Keep the rows whose work tract appears among the census GEOID10 values,
# then report how many remain.
od_in_out_ny = work_home_in_out_ny.loc[work_home_in_out_ny['work'].isin(census['GEOID10'])]
print(od_in_out_ny.shape[0])

In [None]:
# Rows where the work tract and the home tract are the same.
od_in_out_ny[od_in_out_ny.work == od_in_out_ny.home]

In [None]:
# Sum of the S000 column over the filtered rows.
od_in_out_ny.S000.sum()

In [None]:
# Preview the filtered rows.
od_in_out_ny.head()

In [None]:
# Reduce to the tract pair plus S000 and total S000 per (work, home) pair.
temp = od_in_out_ny[['work', 'home', 'S000']]
od_final = temp.groupby(['work', 'home']).sum()

In [None]:
# Preview the aggregated tract-pair totals.
od_final.head()

In [None]:
# Display the aggregated table itself.
od_final

In [None]:
# Number of distinct (work, home) tract pairs.
len(od_final)

In [None]:
# Flatten the (work, home) index back into columns and write the result to CSV.
od_final.reset_index().to_csv('erie-tract-od.csv',index=False)

In [None]:
# Writes the same flattened table to a second file.
# NOTE(review): looks like a scratch copy of 'erie-tract-od.csv' — confirm it is still needed.
od_final.reset_index().to_csv('test.csv',index=False)

### Commute Flow County

In [None]:
import pandas as pd
import geopandas as gpd
# Reload the Erie County tract layer for this section and preview it.
census = gpd.read_file('data/Erie_County_Census_Tract.shp')
census.head()

In [None]:
# Build tract-level origin-destination pairs.
# Tract GEOID layout: state(2) + county(3) + tract(6), e.g. 09-001-030300.
# Load the main OD file, then keep only its first six columns.
work_home_in_ny = pd.read_csv('data/ny_od_main_JT00_2019.csv')
work_home_in_ny = work_home_in_ny.iloc[:, :6]
len(work_home_in_ny)

In [None]:
        #.head()
od_in_ny = work_home_in_ny[(work_home_in_ny.work.isin(census.GEOID10))]
print(len(od_in_ny))