### Step 2: Data preprocessing

__Step goal__: Prepare the raw census data and geometries for use in later steps.

__Step overview__:
1. Load the data;
2. Select the city and prepare the data;
3. Save the output.

In [1]:
# Reload imported modules automatically before each cell runs, so that edits to the
# project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from src.data.census.canada.prepare_ca_data import prepare_ca_data
from src.data.census.usa.prepare_us_data import prepare_us_data

## 1. Canada

#### 1.1 Load the data

In [4]:
%%time
# Read the Canadian inputs: one province's census table and the boundary shapefile.

# Province to load; its name is part of the census CSV file name.
province = 'ONTARIO'

# Census table for the selected province.
directory = "../data/raw/census/canada/"
census_csv = directory + f'98-401-X2016044_{province}_English_CSV_data.csv'
df = pd.read_csv(census_csv, low_memory=False)

# Boundary shapefile that is passed to prepare_ca_data together with the census table.
gdf = gpd.read_file('../data/raw/geometry/lda_000b16a_e.shp')

Wall time: 2min 12s


#### 1.2 Prepare the data

In [7]:
# City to extract from the province-level inputs.
city = 'Toronto'

# Geo code associated with each supported city. Deriving `city_geo` from `city`
# keeps the two in sync; an unsupported city raises KeyError here, before the
# (slow) preparation step runs.
CITY_GEO_CODES = {
    'Vancouver': '59150701',
    'Toronto': '35201588',
    'Montreal': '24661006',
}
city_geo = CITY_GEO_CODES[city]

# Build the city-level census table; the helper prints the polygon and variable counts.
data = prepare_ca_data(df, gdf, city=city)

The number of polygons in the shape file = 3702
The number of polygons in data file = 3702
Total number of variables = 2247


#### 1.3 Save geometry, variable names and data

In [11]:
# Save geometry
# Geometries of the shapefile rows whose CCSNAME equals the selected city, indexed by DAUID.
geometry = gdf[gdf['CCSNAME'] == city].set_index('DAUID')['geometry']
directory = "../data/processed/geometry/canada/"
# Report the new directory only after it has actually been created; exist_ok=True
# tolerates the directory appearing between the check and the call.
if not os.path.exists(directory):
    os.makedirs(directory, exist_ok=True)
    print(f'Succefully created new directory {directory}')
geometry.to_file(directory + f'{city.lower()}.json', driver='GeoJSON')

In [9]:
# TODO: create a hexgrid (this cell is currently an empty placeholder)


In [10]:
# Save variable names and ids
directory = "../data/interim/census/"
# Report the new directory only after it has actually been created; exist_ok=True
# tolerates the directory appearing between the check and the call.
if not os.path.exists(directory):
    os.makedirs(directory, exist_ok=True)
    print(f'Succefully created new directory {directory}')

# Column holding the human-readable variable descriptions.
dim_col = 'DIM: Profile of Dissemination Areas (2247)'

# The rows of a single GEO_NAME (the one equal to city_geo) are used to write the
# description -> variable_id lookup table.
variable_names = data.loc[data['GEO_NAME'] == city_geo, [dim_col, 'variable_id']]
variable_names.to_csv(directory + f'{city.lower()}_variable_names.csv', index=False)

# The descriptions now live in the lookup file, so drop them from the data.
# NOTE: this rebinds `data` without `dim_col`; re-running this cell requires
# re-running the "Prepare the data" cell first, otherwise the selection above
# raises KeyError.
data = data.drop(columns=dim_col)

In [11]:
# Save the data
directory = "../data/processed/census/canada/"
# Report the new directory only after it has actually been created; exist_ok=True
# tolerates the directory appearing between the check and the call.
if not os.path.exists(directory):
    os.makedirs(directory, exist_ok=True)
    print(f'Succefully created new directory {directory}')

# Transform and save the data: reshape from long to wide format, one row per
# GEO_NAME and one column per variable_id, filled from the "Total - Sex" column.
wide = data.pivot_table(
    'Dim: Sex (3): Member ID: [1]: Total - Sex',
    index='GEO_NAME',
    columns='variable_id',
)
wide.to_csv(directory + f'{city.lower()}.csv')

## 2. United States

#### 2.1 Load the data

In [87]:
%%time
# NOTE(review): this section resolves paths from "../../data/", while the Canada
# section uses "../data/" -- confirm which working directory the notebook is run from.
directory = "../../data/raw/census/united_states/"
city = "san_francisco"
# Load the data
# `low_memory` is a pandas.read_csv option and is not a documented parameter of
# geopandas.read_file, so it is no longer passed here.
gdf = gpd.read_file(directory + f'{city}.geojson')

Wall time: 719 ms


#### 2.2 Prepare the data

In [88]:
# Index labels of the rows that lie outside each city's limits.
ROWS_OUTSIDE_CITY_LIMITS = {
    # The original note listed rows 267, 549, 581 for these labels -- presumably
    # 1-based row numbers of the 0-based labels below; TODO confirm.
    "san_francisco": [266, 548, 580],
    "miami": [218, 219],
    "seattle": [425],
    "los_angeles": [2501],
}

# Cities without an entry are left untouched. errors="ignore" makes the cell safe
# to re-run after the rows are already gone; note it also stays silent if a label
# is missing from a freshly loaded file.
gdf = gdf.drop(index=ROWS_OUTSIDE_CITY_LIMITS.get(city, []), errors="ignore")

In [89]:
# Build the census table from the filtered polygons; the helper prints the polygon
# and variable counts shown in the cell output.
# NOTE(review): the schema of `data` is defined by prepare_us_data -- confirm there.
data = prepare_us_data(gdf)  

The number of polygons in the shape file = 579
The number of polygons in data file = 579
Total number of variables = 65


#### 2.3 Save geometry and data

In [90]:
# Save geometry
geometry = gdf.set_index('GEOID')['geometry']
directory = "../../data/processed/geometry/united_states/"
if not os.path.exists(directory):
    print(f'Succefully created new directory {directory}')
    os.makedirs(directory)
geometry.to_file(directory + f'{city.lower()}.geojson', driver='GeoJSON')

In [91]:
# Save the data
directory = "../../data/processed/census/united_states/"
if not os.path.exists(directory):
    print(f'Succefully created new directory {directory}')
    os.makedirs(directory)

# Transform and save the data
data.to_csv(directory + f'{city.lower()}.csv')