In [1]:
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
accidents_processed_df = pd.read_csv('../data/processed/accidents_categorized.csv', sep=';',
                                    index_col='Unnamed: 0')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
list(accidents_processed_df.columns)

['km',
 'ano',
 'ilesos',
 'data',
 'feridos_leves',
 'feridos_graves',
 'mortos',
 'pessoas',
 'veiculos',
 'infracoes',
 'ignorados',
 'br',
 'agressão externa',
 'avarias e/ou desgaste excessivo no pneu',
 'carga excessiva e/ou mal acondicionada',
 'condutor dormindo',
 'defeito mecânico em veículo',
 'defeito mecânico no veículo',
 'defeito na via',
 'deficiência ou não acionamento do sistema de iluminação/sinalização do veículo',
 'desobediência à sinalização',
 'desobediência às normas de trânsito pelo condutor',
 'desobediência às normas de trânsito pelo pedestre',
 'dormindo',
 'falta de atenção',
 'falta de atenção do pedestre',
 'falta de atenção à condução',
 'fenômenos da natureza',
 'ingestão de substâncias psicoativas',
 'ingestão de álcool',
 'ingestão de álcool e/ou substâncias psicoativas pelo pedestre',
 'mal súbito',
 'não guardar distância de segurança',
 'objeto estático sobre o leito carroçável',
 'outras',
 'pista escorregadia',
 'restrição de visibilidade',
 'si

## Scaling numeric data

In [4]:
numeric = ['km','br','ignorados','pessoas', 'feridos_leves',
           'infracoes', 'ilesos','feridos_graves','mortos', 'veiculos']
categorical = list(set(accidents_processed_df.columns) - set(numeric) - set(['data', 'ano', 'classificacao_acidente']))

In [5]:
accidents_processed_df[numeric].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35804 entries, 0 to 35803
Data columns (total 10 columns):
km                35804 non-null object
br                35804 non-null float64
ignorados         35804 non-null int64
pessoas           35804 non-null int64
feridos_leves     35804 non-null int64
infracoes         35804 non-null int64
ilesos            35804 non-null int64
feridos_graves    35804 non-null int64
mortos            35804 non-null int64
veiculos          35804 non-null int64
dtypes: float64(1), int64(8), object(1)
memory usage: 3.0+ MB


In [6]:
accidents_processed_df.km = accidents_processed_df.km.astype(str)
accidents_processed_df.km.unique()

array(['75.0', '284.2', '82.1', ..., '30,2', '36,2', '179,9'],
      dtype=object)

In [7]:
# Transform km to int
accidents_processed_df.km = accidents_processed_df.km.str.replace(',', '.').astype(float)

In [8]:
scaler = StandardScaler()
data = scaler.fit_transform(accidents_processed_df[numeric].values)

In [9]:
data[:,:]

array([[-4.33686444e-01, -7.83859732e-01, -3.53698714e-01, ...,
        -3.67393405e-01, -1.98963164e-01, -1.34726030e+00],
       [ 2.49746721e+00,  9.79874593e-01, -3.53698714e-01, ...,
         1.53817714e+00, -1.98963164e-01, -1.34726030e+00],
       [-3.34206563e-01, -7.83859732e-01, -3.53698714e-01, ...,
        -3.67393405e-01, -1.98963164e-01, -1.34726030e+00],
       ...,
       [-9.04108005e-02, -7.83859732e-01, -3.53698714e-01, ...,
        -3.67393405e-01, -1.98963164e-01,  1.77596114e-01],
       [ 6.62329402e-04, -7.83859732e-01, -3.53698714e-01, ...,
        -3.67393405e-01, -1.98963164e-01,  1.77596114e-01],
       [ 1.03609376e+00,  9.79874593e-01, -3.53698714e-01, ...,
        -3.67393405e-01, -1.98963164e-01, -1.34726030e+00]])

In [10]:
# transform to df agagin
data_df = pd.DataFrame(data)
data_df.columns = numeric

# add another data
final_df = accidents_processed_df[categorical].join(data_df)

In [11]:
final_df.head()

Unnamed: 0,sol,falta de atenção,colisão com objeto fixo,defeito na via,falta de atenção à condução,retorno regulamentado,atropelamento de pessoa,nevoeiro/neblina,objeto estático sobre o leito carroçável,derramamento de carga,...,km,br,ignorados,pessoas,feridos_leves,infracoes,ilesos,feridos_graves,mortos,veiculos
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.433686,-0.78386,-0.353699,-1.135622,-0.533748,-0.113957,-0.391938,-0.367393,-0.198963,-1.34726
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.497467,0.979875,-0.353699,-1.135622,-0.533748,-0.113957,-1.366516,1.538177,-0.198963,-1.34726
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.334207,-0.78386,-0.353699,-0.179393,-0.533748,-0.113957,0.58264,-0.367393,-0.198963,-1.34726
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.046899,-0.78386,-0.353699,0.776836,-0.533748,-0.113957,1.557218,-0.367393,-0.198963,1.702453
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.01095,0.979875,-0.353699,-0.179393,-0.533748,-0.113957,0.58264,-0.367393,-0.198963,0.177596


In [12]:
final_df.tail()

Unnamed: 0,sol,falta de atenção,colisão com objeto fixo,defeito na via,falta de atenção à condução,retorno regulamentado,atropelamento de pessoa,nevoeiro/neblina,objeto estático sobre o leito carroçável,derramamento de carga,...,km,br,ignorados,pessoas,feridos_leves,infracoes,ilesos,feridos_graves,mortos,veiculos
35799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.977323,0.979875,-0.353699,-0.179393,2.062409,-0.113957,-1.366516,-0.367393,-0.198963,0.177596
35800,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.055383,-0.78386,-0.353699,1.733065,2.062409,-0.113957,0.58264,-0.367393,-0.198963,1.702453
35801,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.090411,-0.78386,-0.353699,-0.179393,0.764331,-0.113957,-0.391938,-0.367393,-0.198963,0.177596
35802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000662,-0.78386,-0.353699,-0.179393,0.764331,-0.113957,-0.391938,-0.367393,-0.198963,0.177596
35803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.036094,0.979875,-0.353699,-1.135622,0.764331,-0.113957,-1.366516,-0.367393,-0.198963,-1.34726


In [13]:
final_df.info(max_cols=102)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35804 entries, 0 to 35803
Data columns (total 98 columns):
sol                                                                               35804 non-null float64
falta de atenção                                                                  35804 non-null float64
colisão com objeto fixo                                                           35804 non-null float64
defeito na via                                                                    35804 non-null float64
falta de atenção à condução                                                       35804 non-null float64
retorno regulamentado                                                             35804 non-null float64
atropelamento de pessoa                                                           35804 non-null float64
nevoeiro/neblina                                                                  35804 non-null float64
objeto estático sobre o leito carroçável           

In [14]:
final_df.to_csv('../data/processed/accidents_dataset.csv', sep=';')