# Environment Setting

In [1]:
!pip install 'Fiona==1.8.18'
!pip install 'Shapely==1.7.1'
!pip install 'pyproj==3.0.0.post1'
!pip install folium


Collecting Fiona==1.8.18
  Downloading Fiona-1.8.18-cp36-cp36m-manylinux1_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 20.7 MB/s eta 0:00:01
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Installing collected packages: munch, cligj, click-plugins, Fiona
Successfully installed Fiona-1.8.18 click-plugins-1.1.1 cligj-0.7.2 munch-2.5.0
Collecting Shapely==1.7.1
  Downloading Shapely-1.7.1-cp36-cp36m-manylinux1_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 18.1 MB/s eta 0:00:01
[?25hInstalling collected packages: Shapely
Successfully installed Shapely-1.7.1
Collecting pyproj==3.0.0.post1
  Downloading pyproj-3.0.0.post1-cp36-cp36m-manylinux2010_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 13.9 MB/s eta 0:00:01
Installing coll

In [1]:
import boto3
from branca.colormap import linear
import fiona
import folium
import pandas as pd
import numpy as np
from pyproj import Proj, transform
import os


import warnings
warnings.filterwarnings('ignore')

In [2]:
from shapely.geometry import shape

In [3]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("..")

from src.utils import S3Utils, athena_to_pandas
from src.pipelines.shapefile_processor import ItalianCensusAreas

# Data Sources

Data for each italian region was downloaded from https://www.istat.it/it/archivio/104317 and saved to my personal S3. The data refers to 2011 census. 

I download data at individual "cella censuria" level, this is the most granular piece of geographical information made available within an administrative area. Those are usually towns and villages. Which in turn are part of on the 20 italian regions. 

Istat mades this data available in a set of files for each region. Therefore for each of those I downloaded:

1. {region_code}_indicatori_2011_sezioni.csv'. Which are the "celle censuarie" for each region + a set of features.
2. '{region_code}_11_WGS84.zip. The shapefiles in WGS84 format

### Process Census Geographies

In [4]:
%%time

# Input files

S3 = S3Utils('gimi-data', region = 'eu-south-1')
directory = 'in/italy/census-areas/2011'
paths = S3.bucket_content_keys(directory = directory)

CPU times: user 160 ms, sys: 21 ms, total: 181 ms
Wall time: 341 ms


In [None]:
%%time


for path in paths[7:]:
    
    if path.split(".")[-1] == 'zip':
        
        s3bucket = 'gimi-data'
        input_path = path 
        output_path = 'out/italy/census-areas-2011/'
        
        # Download and unzip file
        fname = path.split('/')[-1]
        fname_folder = fname.split('.')[0]
        S3.download_file(path, fname)
        
        # Unzip using a Bash command
        !unzip {fname}
        %cd {fname_folder}
    
        shapes = fiona.open("{}.shp".format(fname_folder))
        
        # After loading data in memory for processing remove from local
        %cd ..
        !rm -rf {fname}
        !rm -rf {fname_folder}
        
        # Convert shapes to a dataframe and combine with features
        sezioni = pd.DataFrame(shapes)
        
        # Convert Geometries
        print('Start WSG to Lat Long geometry conversion')
        ic = ItalianCensusAreas()
        shapes_df = ic.get_section_features(sezioni)
        
        # Drop columns not be load to db
        cols_to_drop = ['properties', 'geometry', 'geometry_ll', 'centroid_ll']
        shapes_df = shapes_df.drop(cols_to_drop, 1)
        
        # Create ouptup parquet file
        reg = path.split("/")[-1].split("_")[0]
        fname = "region={}.parquet".format(reg)
        shapes_df.to_parquet(fname)
        
        # Save to S3
        outpath = 'out/italy/census-areas-2011/'
        S3 = S3Utils('gimi-data', region = 'eu-south-1')
        S3.upload_file(fname, os.path.join(outpath, fname))
        os.remove(fname)
        
    else:
        pass


Archive:  R07_11_WGS84.zip
  inflating: R07_11_WGS84/R07_11.xls  
  inflating: R07_11_WGS84/R07_11_WGS84.dbf  
  inflating: R07_11_WGS84/R07_11_WGS84.prj  
  inflating: R07_11_WGS84/R07_11_WGS84.shp  
  inflating: R07_11_WGS84/R07_11_WGS84.shx  
/home/ec2-user/SageMaker/gimi/notebooks/R07_11_WGS84
/home/ec2-user/SageMaker/gimi/notebooks
Start WSG to Lat Long geometry conversion
0
1000
2000
3000
4000
5000
6000
7000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000


# Functions

# Save Data to S3

In [8]:
import os
import boto3
import pandas as pd

outdir = 'out'
outfiles = os.listdir(outdir)

In [68]:
csv_files = list(filter(lambda x: x.find('.csv') != -1, outfiles))

bucket = "istat-sezioni"

temps = []

for f in csv_files:
    print(f)
    
    temp = pd.read_csv(os.path.join(outdir, f))
    temp['centoid_lat'] = [float(x.split(',')[0].split('(')[1]) for x in temp['centroid_ll']]
    temp['centoid_long'] = [float(x.split(',')[1].split(')')[0].lstrip()) for x in temp['centroid_ll']]
    temp = temp.drop(['geometry', 'properties', 'centroid_ll'], 1)
    
    temps.append(temp)


fdf = pd.concat(temps)

R16_istat_census.csv
R09_istat_census.csv
R19_istat_census.csv
R17_istat_census.csv
R07_istat_census.csv
R10_istat_census.csv
R06_istat_census.csv
R02_istat_census.csv
R05_istat_census.csv
R11_istat_census.csv
R08_istat_census.csv
R15_istat_census.csv
R20_istat_census.csv
R04_istat_census.csv
R12_istat_census.csv
R18_istat_census.csv
R03_istat_census.csv
R14_istat_census.csv
R13_istat_census.csv
R01_istat_census.csv


In [69]:
# Save file sezioni
fname = 'sezioni-istat-2011.csv'

fdf.to_csv(fname, index=False)

s3_client = boto3.client('s3')
s3_client.upload_file(fname, 
                      bucket, 
                      'out/sezioni_istat/{}'.format(fname))

In [18]:
# Save pickeled files
csv_files = list(filter(lambda x: x.find('.pickle') != -1, outfiles))

bucket = "istat-sezioni"

for f in csv_files:
    print(f)
    s3_client = boto3.client('s3')
    s3_client.upload_file(os.path.join(outdir, f), 
                          bucket, 
                          'out/pickle/{}'.format(f))

R14_istat_census.pickle
R15_istat_census.pickle
R13_istat_census.pickle
R20_istat_census.pickle
R08_istat_census.pickle
R06_istat_census.pickle
R19_istat_census.pickle
R16_istat_census.pickle
R10_istat_census.pickle
R02_istat_census.pickle
R03_istat_census.pickle
R05_istat_census.pickle
R07_istat_census.pickle
R17_istat_census.pickle
R12_istat_census.pickle
R01_istat_census.pickle
R18_istat_census.pickle
R04_istat_census.pickle
R09_istat_census.pickle
R11_istat_census.pickle


In [43]:
# Save variables definition

fname = 'docs/tracciato_2011_sezioni.csv'

schema = pd.read_csv(fname, sep = ';', encoding = 'iso-8859-1')
schema.to_csv('tracciato_2011_sezioni.csv', encoding = 'utf-8', index = False)

s3_client = boto3.client('s3')
s3_client.upload_file('tracciato_2011_sezioni.csv', 
                      bucket, 
                      'out/schema_sezioni/tracciato_2011_sezioni.csv'.format(fname))