# CLEANING GEOJSON: CRITICAL FLOOD AREAS IN RIO DE JANEIRO - IPP (INSTITUTO PEREIRA PASSOS)

In [1]:
cd ../

C:\Users\luisr\Desktop\Repositories\Data Science Projects\Hackaton COR IV - Centro de Operações do RJ\INCUBAÇÃO


In [3]:
import json, requests, pandas as pd, numpy as np
from Modulos.geojson_conversion import geojson_obj

### Load ipp geojson

In [4]:
path_ipp = 'Dados/IPP/Áreas_críticas_de_alagamento.geojson'

ipp = json.loads(open(path_ipp, 'rb').read())

print('Features:', len(ipp['features']))
print('Keys:', list(ipp.keys()))
print('Header:', {**ipp, 'features': []})

Features: 5397
Keys: ['type', 'crs', 'features']
Header: {'type': 'FeatureCollection', 'crs': {'type': 'name', 'properties': {'name': 'EPSG:4326'}}, 'features': []}


## Data cleaning

#### Drop crs notation

In [5]:
del ipp['crs']

#### Convert to dataframe

In [6]:
metadata = {**ipp, 'features': []}

def geojson_to_df(geojson):
    feature_metadata = pd.DataFrame(list(map(lambda feat: {'metadata': {key: feat[key] for key in feat.keys() if key != 'properties'}}, geojson['features'])))
    df = pd.DataFrame(list(map(lambda feat: feat['properties'], geojson['features'])))
    return df.join(feature_metadata)

ipp_df = geojson_to_df(ipp)
ipp_df.head()

Unnamed: 0,oid,name,popupinfo,objectid,folderpath,db_hrnsh.hsu_zf8og.áreas_críticas_de_alagamento_pds_rio_aguas_nov19_s.entity,layer,color,linetype,elevation,...,id_unico,pop,dom_ocu,peso_cetrio,peso_pdmap,peso_pds,peso_comando,soma_peso,target_fid,metadata
0,1.0,15,Rua Barão de Tefé (Saúde - S. Centro) - Rua Vi...,,,,,,,,...,,,,,,,,,,"{'type': 'Feature', 'id': 1, 'geometry': {'typ..."
1,2.0,173,Av. Abelardo Bueno - PL (Parque Olímpico - S. ...,,,,,,,,...,,,,,,,,,,"{'type': 'Feature', 'id': 2, 'geometry': {'typ..."
2,3.0,177,Av. Abelardo Bueno - PL (Parque Olímpico - S. ...,,,,,,,,...,,,,,,,,,,"{'type': 'Feature', 'id': 3, 'geometry': {'typ..."
3,4.0,219,Estr. do Catonho (Taquara - S. Sulacap) - Estr...,,,,,,,,...,,,,,,,,,,"{'type': 'Feature', 'id': 4, 'geometry': {'typ..."
4,5.0,231,Estr. dos Bandeirantes (Merck - S. Recreio) - ...,,,,,,,,...,,,,,,,,,,"{'type': 'Feature', 'id': 5, 'geometry': {'typ..."


#### Columns

In [7]:
ipp_df.columns

Index(['oid', 'name', 'popupinfo', 'objectid', 'folderpath',
       'db_hrnsh.hsu_zf8og.áreas_críticas_de_alagamento_pds_rio_aguas_nov19_s.entity',
       'layer', 'color', 'linetype', 'elevation', 'linewt', 'refname',
       'id_unico', 'pop', 'dom_ocu', 'peso_cetrio', 'peso_pdmap', 'peso_pds',
       'peso_comando', 'soma_peso', 'target_fid', 'metadata'],
      dtype='object')

#### Drop features with empty coordinates

In [8]:
not_point = lambda feat: feat['type'] != 'Point'
not_empty = lambda feat: len(feat['geometry']['coordinates']) != 0

ipp_df = ipp_df[ipp_df['metadata'].apply(not_point) & ipp_df['metadata'].apply(not_empty)]

print('Features:', len(ipp_df))

Features: 5137


#### Get features by geometry type

In [9]:
geometries = np.unique(list(map(lambda feat: feat['geometry']['type'], ipp['features'])))

dfs = {geometry: ipp_df[ipp_df['metadata'].apply(lambda data: data['geometry']['type'] == geometry)] for geometry in geometries}

display(pd.Series({key: len(df) for key, df in dfs.items()}, name='geometry type count'))

LineString      1246
MultiPolygon      14
Point           1301
Polygon         2576
Name: geometry type count, dtype: int64

#### Columns by geometry type

In [10]:
print('Empty columns:', {key: (df.isna().sum() != 0).sum() for key, df in dfs.items()}); print()
print('Columns left:\n')
display({key: list(df.dropna(how='all', axis=1).columns) for key, df in dfs.items()})

Empty columns: {'LineString': 21, 'MultiPolygon': 18, 'Point': 19, 'Polygon': 20}

Columns left:



{'LineString': ['oid',
  'name',
  'popupinfo',
  'objectid',
  'db_hrnsh.hsu_zf8og.áreas_críticas_de_alagamento_pds_rio_aguas_nov19_s.entity',
  'layer',
  'color',
  'linetype',
  'elevation',
  'linewt',
  'refname',
  'metadata'],
 'MultiPolygon': ['name', 'objectid', 'folderpath', 'metadata'],
 'Point': ['oid', 'name', 'popupinfo', 'metadata'],
 'Polygon': ['name',
  'objectid',
  'folderpath',
  'id_unico',
  'pop',
  'dom_ocu',
  'peso_cetrio',
  'peso_pdmap',
  'peso_pds',
  'peso_comando',
  'soma_peso',
  'target_fid',
  'metadata']}

#### Break LineString dataframe into the two original collections

In [11]:
lines = dfs['LineString']

print('oid field empty:', lines['oid'].isna().sum())
print('objectid field not empty:', lines['objectid'].notna().sum())

dfs['drenagem_LineString'] = lines[lines['objectid'].notna()]
dfs['trechos_cetrio_LineString'] = lines[lines['oid'].notna()]

del dfs['LineString']

oid field empty: 153
objectid field not empty: 153


#### New collections' columns

In [12]:
new_coll = ['drenagem_LineString', 'trechos_cetrio_LineString']

print('Empty columns:', {key: (dfs[key].isna().sum() != 0).sum() for key in new_coll}); print()
print('Columns left:\n')
display({key: list(dfs[key].dropna(how='all', axis=1).columns) for key in new_coll})

Empty columns: {'drenagem_LineString': 13, 'trechos_cetrio_LineString': 18}

Columns left:



{'drenagem_LineString': ['objectid',
  'db_hrnsh.hsu_zf8og.áreas_críticas_de_alagamento_pds_rio_aguas_nov19_s.entity',
  'layer',
  'color',
  'linetype',
  'elevation',
  'linewt',
  'refname',
  'metadata'],
 'trechos_cetrio_LineString': ['oid', 'name', 'popupinfo', 'metadata']}

### Convert back to geojson

In [13]:
def df_to_geojson(df, metadata='metadata'):   
    features = []
    for idx, row in df.iterrows():
        features.append({**row[metadata], 'properties': row.drop(metadata).dropna().to_dict()})
    return geojson_obj(features)

ipp_geojson = df_to_geojson(ipp_df, metadata='metadata')
geometries = {key: df_to_geojson(df) for key, df in dfs.items()} # drops empty property fields

## Save clean data

In [14]:
ipp_json = json.dumps(ipp_geojson)

path_ipp = 'Dados/IPP/areas_alagamento.geojson'    
with open(path_ipp, 'w') as file:
    file.write(ipp_json)

## Save by geometry type

In [15]:
path_ipp = 'Dados/IPP/Geometrias/areas_alagamento.geojson'    

for key, geojson in geometries.items():
    file_path, ext = path_ipp.split('.')
    path = file_path + '_' + key + '.' + ext
    with open(path, 'w') as file:
        file.write(json.dumps(geojson))

## Data exploration

#### Point categories

In [251]:
dfs['Point']['name'].value_counts()

Bolsão d'água em via       1251
Alagamento                   34
Alagamentos e enchentes      16
Name: name, dtype: int64

#### Feature size

In [247]:
feat_size = pd.DataFrame(list(map(lambda feat: {'type': feat['geometry']['type'], 'size': len(feat['geometry']['coordinates']), 'poly_size': len(feat['geometry']['coordinates'][0]) if feat['geometry']['type'] == 'Polygon' else np.nan}, ipp['features'])))

display(feat_size['size'].value_counts().sort_index().to_frame('size_count').T)
display(feat_size.groupby('type')['size'].value_counts().sort_index().to_frame('count').T)
display(feat_size.groupby('type')['poly_size'].value_counts().sort_index().to_frame('count').T)

Unnamed: 0,0,1,2,34,38,45,51,52,53,54,...,501,511,520,527,555,590,603,672,706,718
size_count,260,2576,1315,1,1,1,1,2,2,1,...,1,1,1,1,1,1,1,1,1,1


type,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,LineString,MultiPolygon,Point,Point,Polygon
size,34,38,45,51,52,53,54,55,56,57,...,555,590,603,672,706,718,2,0,2,1
count,1,1,1,1,2,2,1,4,2,2,...,1,1,1,1,1,1,14,260,1301,2576


type,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon,Polygon
poly_size,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,...,118.0,121.0,128.0,131.0,140.0,172.0,173.0,180.0,496.0,2996.0
count,4,2174,21,14,18,15,27,21,11,13,...,1,1,1,1,1,4,4,1,1,1
