# Initial Configs


In [None]:
# Load (or reload) the autoreload extension; mode 2 re-imports all modules
# before each cell runs, so edits to local .py files are picked up.
%reload_ext autoreload
%autoreload 2
# Render Matplotlib figures inline in the notebook output.
%matplotlib inline

# Imports

In [None]:
import pandas as pd
import numpy as np

#import plotly for visualization
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import sys
from IPython.core.display import display, HTML
sys.path.append('..')
pyoff.init_notebook_mode()

from bokeh.resources import INLINE
import bokeh.io
from bokeh import *

# Package configs

In [None]:
# Show every DataFrame column instead of truncating wide frames.
pd.options.display.max_columns = None

# Inject CSS so the notebook container uses the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Route DataFrame.plot() through pandas_bokeh and load BokehJS from inline resources.
pd.options.plotting.backend = 'pandas_bokeh'
bokeh.io.output_notebook(INLINE)

# Functions

In [None]:
def SZ(df):
    """Print a short report with the number of rows and columns of ``df``.

    Args:
        df: any object exposing a two-element ``shape`` (e.g. a DataFrame).
    """
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    # The leading empty item reproduces the blank line printed before the header.
    report_lines = [
        "",
        "--- Dimensão ---",
        f"Linhas:  {n_rows}",
        f"Colunas: {n_cols}",
    ]
    print("\n".join(report_lines))

In [None]:
def DT(df):
    """Print a short report with the dtypes of ``df``.

    Args:
        df: any object exposing a ``dtypes`` attribute (e.g. a DataFrame).
    """
    # The final four-space item reproduces the indented trailing line of the
    # original template, so the printed text is unchanged.
    report_lines = [
        "",
        "--- DataTypes ---",
        f"{df.dtypes}",
        "    ",
    ]
    print("\n".join(report_lines))

# Reading Files

In [None]:
def _read_raw_csv(csv_path):
    """Read one semicolon-delimited, ISO-8859-1 encoded yearly CSV file."""
    return pd.read_csv(csv_path, delimiter=";", encoding='iso-8859-1')


# NOTE: the 2020 file ("../dataset/datatran2020.csv") is not loaded; its read is disabled.
df_datatran2019 = _read_raw_csv("../data/raw/datatran2019.csv")
df_datatran2018 = _read_raw_csv("../data/raw/datatran2018.csv")
df_datatran2017 = _read_raw_csv("../data/raw/datatran2017.csv")
df_datatran2016 = _read_raw_csv("../data/raw/datatran2016.csv")
df_datatran2015 = _read_raw_csv("../data/raw/datatran2015.csv")

In [None]:
# Stack the yearly frames into a single one. ignore_index=True gives every row
# a unique label; without it each year's own row labels repeat in the result,
# which makes label-based selection and alignment ambiguous.
df_datatran = pd.concat(
    [df_datatran2015, df_datatran2016, df_datatran2017, df_datatran2018, df_datatran2019],
    ignore_index=True,
)

# NOTE(review): the date format is inferred by pandas — confirm that every
# yearly file stores data_inversa in the same format.
df_datatran['data_inversa'] = pd.to_datetime(df_datatran['data_inversa'])

# Convert the coordinates to float, accepting a decimal comma.
# Values are cast to str first so rows that are already numeric are kept:
# the .str accessor alone returns NaN for non-string values and raises when
# the whole column is numeric. Missing values are restored as NaN afterwards.
for coord in ('latitude', 'longitude'):
    raw_coord = df_datatran[coord]
    df_datatran[coord] = (
        raw_coord.astype(str)
        .str.replace(',', '.', regex=False)
        .where(raw_coord.notna())
        .astype('float64')
    )

SZ(df_datatran)
DT(df_datatran)

In [None]:
# Preview the first rows of the combined frame.
df_datatran.head()

In [None]:
# (rows, columns) of the combined frame.
df_datatran.shape

# Data Processing

Normalização de strings

In [None]:
# Text columns that receive the same normalisation.
string_columns = ['dia_semana',
                  'municipio',
                  'causa_acidente',
                  'tipo_acidente',
                  'classificacao_acidente',
                  'fase_dia',
                  'sentido_via',
                  'condicao_metereologica',
                  'tipo_pista',
                  'tracado_via',
                  'uso_solo',
                  'regional',
                  'delegacia',
                  'uop']

# Per column: decompose accented characters (NFKD), drop the non-ASCII bytes,
# remove punctuation, lowercase and trim surrounding whitespace.
df_datatran[string_columns] = df_datatran[string_columns].apply(
    lambda col: col.str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which silently removed nothing here.
    # The raw string avoids the invalid "\w" / "\s" escape warnings.
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
    .str.strip())

Transformar data em dado numérico ordinal

In [None]:
# Earliest value of data_inversa; it becomes day 1 of the ordinal scale.
# Series.min() is vectorised and skips NaT, unlike the builtin min().
min_data = df_datatran['data_inversa'].min()

# 'd' = whole days elapsed since min_data, 1-based.
# .dt.days replaces astype('timedelta64[D]'), which pandas >= 2.0 rejects
# (only s/ms/us/ns resolutions are supported). The previous standalone
# division by pd.Timedelta discarded its result and has been removed.
# Rows whose date is NaT get NaN.
df_datatran['d'] = (df_datatran['data_inversa'] - min_data).dt.days + 1

Completar nulos

In [None]:
# Normalise 'br' and 'km' to dot-decimal strings while keeping missing values
# missing: a plain astype(str) would turn NaN into the literal text 'nan',
# which fillna can no longer detect.
br_km = df_datatran[['br', 'km']].apply(
    lambda col: col.astype(str).str.replace(',', '.', regex=False).where(col.notna())
)

# Fill the gaps with each column's most frequent value. The result is assigned
# back explicitly: fillna(inplace=True) on a df[[...]] selection only changes
# a temporary copy and leaves df_datatran untouched.
most_frequent = br_km.mode()
if not most_frequent.empty:  # mode() is empty when both columns are entirely missing
    br_km = br_km.fillna(most_frequent.iloc[0])

df_datatran[['br', 'km']] = br_km.astype(str)

In [None]:
# Replace missing coordinates with the median of each coordinate column.
coord_columns = ['latitude', 'longitude']
coord_medians = df_datatran[coord_columns].median()
df_datatran[coord_columns] = df_datatran[coord_columns].fillna(coord_medians)

In [None]:
# Represent missing text as an empty string: first real NaN values, then
# cells whose whole content is the literal text 'null'.
df_datatran[string_columns] = (
    df_datatran[string_columns]
    .fillna('')
    .replace('null', '')
)

Transformar dia da semana em dado categórico numérico

In [None]:
def weekday_process(df, weekday_column, default=-1):
    """Map weekday names in ``df[weekday_column]`` to integer codes.

    A value is matched when it contains one of the lowercase, accent-free
    fragments 'seg', 'ter', 'qua', 'qui', 'sex', 'sab', 'dom', which map to
    0..6 in that order. The first matching fragment wins.

    Args:
        df: DataFrame holding the weekday column.
        weekday_column: name of a text column with weekday names.
        default: code for values that match no fragment, including missing
            values. It is numeric (-1) so the result keeps an integer dtype;
            the previous ``''`` default forced NumPy to return the codes as
            strings and raises ``TypeError`` on recent NumPy versions, which
            refuse to mix integers and strings.

    Returns:
        numpy.ndarray with one code per row of ``df``.
    """
    weekday_fragments = ('seg', 'ter', 'qua', 'qui', 'sex', 'sab', 'dom')
    names = df[weekday_column]
    # na=False makes missing entries count as "no match"; otherwise they would
    # leak non-boolean values into the condition list.
    conditions = [
        names.str.contains(fragment, regex=False, na=False).to_numpy(dtype=bool)
        for fragment in weekday_fragments
    ]
    codes = list(range(len(weekday_fragments)))
    return np.select(conditions, codes, default=default)

In [None]:
# Overwrite dia_semana with the codes returned by weekday_process.
# The column is replaced in place, so re-running this cell without reloading
# the data will not reproduce the mapping (it no longer holds weekday names).
df_datatran['dia_semana'] = weekday_process(df_datatran, weekday_column='dia_semana')

Transformar target em variável binária

In [None]:
# Binary target: 1 when 'mortos' differs from 0, else 0.
# The vectorised comparison replaces a row-wise apply(axis=1), which calls a
# Python function for every row, and yields the same values.
# NOTE(review): a missing 'mortos' compares as "not equal to 0" and is
# therefore flagged 1 (same as before) — confirm this is intended.
df_datatran['contem_vitima_fatal'] = df_datatran['mortos'].ne(0).astype('int64')

In [None]:
# Persist the processed frame as CSV (ISO-8859-1) and as Parquet, both
# without the row index.
df_datatran.to_csv(
    path_or_buf='../data/processed/dataset_v1.csv',
    encoding='iso-8859-1',
    index=False,
)
df_datatran.to_parquet(
    path='../data/processed/dataset_v1.parquet',
    index=False,
)

# Exploratory Analysis

In [None]:
# Summary statistics for the per-record count columns.
count_columns = [
    'pessoas',
    'mortos',
    'feridos_leves',
    'feridos_graves',
    'ilesos',
    'ignorados',
    'feridos',
    'veiculos',
]
df_datatran[count_columns].describe()

In [None]:
# Preview the first rows after processing.
df_datatran.head()

In [None]:
# Number of non-null ids per causa_acidente, as a one-column frame ('id')
# indexed by causa_acidente.
df_datatran_causa_acidente_count = (
    df_datatran
    .groupby(['causa_acidente'])['id']
    .count()
    .reset_index()
    .set_index('causa_acidente')
)

In [None]:
# Horizontal bar chart of the counts per cause, sorted ascending by count.
causa_acidente_sorted = df_datatran_causa_acidente_count.sort_values('id')
causa_acidente_sorted.plot(kind='barh', figsize=(10,10))