# Initial Configs


In [None]:
# (Re)load the autoreload extension and use mode 2, so imported modules are
# reloaded before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

#import plotly for visualization
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import sys
from IPython.core.display import display, HTML
# Add the parent of the working directory to the module search path.
sys.path.append('..')
# Initialise plotly's offline notebook mode (used later through pyoff.iplot).
pyoff.init_notebook_mode()

import os
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import json

import geopandas as gpd
from shapely.geometry.polygon import Polygon
from shapely.geometry import shape,Point

from bokeh.resources import INLINE
import bokeh.io
from bokeh import *

# Package configs

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Inject CSS so the notebook container uses the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Use pandas_bokeh as the pandas plotting backend and send bokeh output to the
# notebook using inline resources.
pd.options.plotting.backend = 'pandas_bokeh'
bokeh.io.output_notebook(INLINE)

# Functions

In [None]:
# Quick print of a DataFrame's dimensions
def SZ(df):
    """Print the number of rows and columns of ``df``.

    Args:
        df: Any object exposing a two-element ``shape`` (rows, columns).

    Returns:
        None. The report is written to standard output.
    """
    report_lines = [
        "",
        "--- Dimensão ---",
        f"Linhas:  {df.shape[0]}",
        f"Colunas: {df.shape[1]}",
    ]
    print("\n".join(report_lines))

In [None]:
# Quick print of a DataFrame's dtypes
def DT(df):
    """Print the ``dtypes`` attribute of ``df`` under a header line.

    Args:
        df: Any object exposing a ``dtypes`` attribute.

    Returns:
        None. The report is written to standard output.
    """
    header = "--- DataTypes ---"
    # The trailing "\n    " reproduces the indented closing line of the report.
    print("\n" + header + "\n" + f"{df.dtypes}" + "\n    ")

# Read Files

In [None]:
years = range(2009,2021)


def _decimal_comma_to_point(value):
    """Replace decimal commas with points in strings; pass other values through."""
    return value.replace(',', '.') if isinstance(value, str) else value


# One DataFrame per yearly file, read with the same delimiter and encoding.
list_datatran = [
    pd.read_csv(f"../data/raw/datatran{year}.csv", delimiter=";", encoding='iso-8859-1')
    for year in years
]

# ignore_index gives every row a unique label instead of repeating each file's
# own 0..n-1 index once per year.
df_datatran = pd.concat(list_datatran, ignore_index=True)

# Coordinates may arrive as text with a decimal comma. Only string values are
# rewritten: the `.str` accessor would turn values that are already numeric
# into NaN (or fail when the whole column is numeric), discarding valid data.
for coord_col in ('latitude', 'longitude'):
    df_datatran[coord_col] = (
        df_datatran[coord_col]
        .map(_decimal_comma_to_point)
        .astype('float64')
    )

SZ(df_datatran)
DT(df_datatran)

# Transform KML to Dataframe

In [None]:
# UF codes (27 entries); each one is also the base name of a KML file in ../data/raw/.
ufs = (
    'AC AL AM AP BA CE DF ES GO MA MG MS MT PA '
    'PB PE PI PR RJ RN RO RR RS SC SE SP TO'
).split()

In [None]:
# Empty frame that will hold one row per UF with its boundary coordinates.
estados_columns = ['uf', 'latitude', 'longitude']
df_estados = pd.DataFrame(columns=estados_columns)

In [None]:
# Build one row per UF holding the boundary vertices read from its KML file.
#
# NOTE(review): the KML specification orders each vertex as "lon,lat[,alt]", so
# the 'latitude' column below presumably holds longitudes (and vice versa). The
# names are kept because later cells read these columns by name — confirm
# before renaming.
estados_rows = []

for uf in ufs:
    with open(f'../data/raw/{uf}.kml', 'r', encoding="utf8") as f:
        soup = BeautifulSoup(f, 'html.parser')

    all_coordinates = soup.find_all("coordinates")
    if not all_coordinates:
        # Without this guard the lists of the previous UF would silently be
        # reused for this one (or a NameError raised for the first UF).
        raise ValueError(f"No <coordinates> element found in ../data/raw/{uf}.kml")

    # Only the LAST <coordinates> element of the file is used, as before.
    # NOTE(review): files with several polygons lose all but the last one —
    # confirm this is intended.
    # split() with no argument tolerates spaces, tabs and newlines between
    # vertices as well as leading/trailing whitespace.
    latitude = []
    longitude = []
    for lat_long in all_coordinates[-1].text.split():
        components = lat_long.split(",")
        latitude.append(float(components[0]))
        longitude.append(float(components[1]))

    estados_rows.append({'uf': uf, 'latitude': latitude, 'longitude': longitude})

# Create the frame once instead of concatenating inside the loop.
df_estados = pd.DataFrame(estados_rows, columns=['uf', 'latitude', 'longitude'])

# Find UF for each accident

Transform each UF's coordinates into a polygon

In [None]:
def create_polygon(x):
    """Build a Polygon from the coordinate sequences stored in one row.

    Args:
        x: Mapping-like row with 'latitude' and 'longitude' entries, each an
            iterable of numbers.

    Returns:
        ``Polygon`` built from the pairs (latitude[i], longitude[i]), in order.
    """
    vertices = [(first, second) for first, second in zip(x['latitude'], x['longitude'])]
    return Polygon(vertices)

# One polygon per UF row, stored in a 'poly' column and also used as geometry.
df_estados['poly'] = df_estados.apply(create_polygon, axis=1)
df_estados_geo = gpd.GeoDataFrame(df_estados, geometry=df_estados['poly'])

# Area in the coordinates' own units (no CRS is set here).
df_estados_geo['area'] = df_estados_geo['geometry'].area

# Keep only the columns needed downstream and index the frame by UF code.
estados_geo_cols = ['uf', 'poly', 'geometry', 'area']
df_estados_geo = df_estados_geo[estados_geo_cols].set_index('uf')

In [None]:
# Visual sanity check: draw the polygon stored for 'MG'.
mg_rows = df_estados_geo[df_estados_geo.index == 'MG']
mg_rows['geometry'].plot()

In [None]:
# Keep only the accidents that have both a latitude and a longitude.
coord_cols = ['latitude', 'longitude']
df_datatran_coord = df_datatran.dropna(subset=coord_cols)

In [None]:
# Point geometry for every accident: x is the longitude, y is the latitude.
accident_points = gpd.points_from_xy(
    df_datatran_coord['longitude'],
    df_datatran_coord['latitude'],
)
df_datatran_geo = gpd.GeoDataFrame(df_datatran_coord, geometry=accident_points)

In [None]:
%%time
df_within = gpd.sjoin(df_datatran_geo, df_estados_geo, op='within')

In [None]:
# Keep the accident id/coordinates plus the matched polygon's data, and expose
# the right-hand index produced by the join ('index_right') as 'uf'.
within_cols = ['id', 'latitude', 'longitude', 'index_right', 'poly', 'area']
df_within = df_within[within_cols].rename(columns={'index_right': 'uf'})

In [None]:
# If an accident matched more than one UF polygon, keep a single match per id:
# the one whose polygon has the SMALLEST area (that is what `min()` selects).
# NOTE(review): this comment used to say "largest" — confirm which is intended.
dfWithinGeoUnique = df_within.groupby('id')['area'].min().reset_index()

# Re-attach the remaining columns of the selected (id, area) match.
dfWithinGeoUniqueInfos = pd.merge(
    dfWithinGeoUnique,
    df_within,
    how='left',
    on=['id', 'area'],
)

# Keep only the rows that have an area value.
has_area = dfWithinGeoUniqueInfos['area'].notna()
dfFinal = dfWithinGeoUniqueInfos[has_area]

# Columns carried forward to the final id -> uf table.
keep_cols = ['id', 'latitude', 'longitude', 'uf']

In [None]:
# Reduce to the kept columns, drop repeated rows, then report size and dtypes.
dfFinal = dfFinal.loc[:, keep_cols].drop_duplicates()
for report in (SZ, DT):
    report(dfFinal)

In [None]:
# Rename 'uf' to 'uf_kml' so it does not clash with the 'uf' column of df_datatran.
dfFinal = dfFinal.rename(columns={'uf': 'uf_kml'})

In [None]:
# Left join: every accident is kept; 'uf_kml' stays missing when the id has no
# entry in dfFinal.
uf_kml_by_id = dfFinal[['id', 'uf_kml']]
df_datatran = pd.merge(df_datatran, uf_kml_by_id, how='left', on='id')

In [None]:
# Accidents recorded in 'PE' that have coordinates are always labelled 'PE' in
# 'uf_kml', regardless of the polygon match.
# NOTE(review): presumably the PE polygon does not cover every valid point of
# the state — confirm the reason for this exception.
#
# The assignment goes through .loc on the frame itself: the previous chained
# form (df[mask]['uf_kml'] = ...) wrote to a temporary copy and changed nothing.
pe_with_coords = (
    df_datatran['latitude'].notna()
    & df_datatran['longitude'].notna()
    & (df_datatran['uf'] == 'PE')
)
df_datatran.loc[pe_with_coords, 'uf_kml'] = 'PE'

In [None]:
# If the accident did not fall inside any UF, clear its latitude and longitude,
# because they are incorrect.
#
# The assignment goes through .loc on the frame itself: the previous chained
# form (df[mask][cols] = ...) wrote to a temporary copy and changed nothing.
without_uf_kml = df_datatran['uf_kml'].isna()
df_datatran.loc[without_uf_kml, ['latitude', 'longitude']] = np.nan

In [None]:
# The helper column is no longer needed.
df_datatran = df_datatran.drop(columns='uf_kml')

In [None]:
# Write the processed accidents table, without the index column.
processed_csv_path = '../data/processed/df_datatran_2009_2020.csv'
df_datatran.to_csv(processed_csv_path, index=False)

In [None]:
# Histogram of the recorded 'uf' for accidents that have coordinates but whose
# id is absent from dfFinal, i.e. that were not placed inside any UF polygon.
# The previous comment and title ("recency", "POC Age [Days]") did not describe
# this data and have been replaced.
unmatched = ~df_datatran_geo['id'].isin(dfFinal['id'])

plot_data = [
    go.Histogram(
        x=df_datatran_geo[unmatched]['uf']
    )
]

plot_layout = go.Layout(
        title='Accidents with coordinates outside every UF polygon, by recorded UF'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)