# Initial Configs


In [None]:
# (Re)load the autoreload extension; unlike %load_ext this is safe to run repeatedly
%reload_ext autoreload
# Autoreload mode 2: re-import all modules before executing each cell
%autoreload 2
# Render matplotlib figures inline in the cell output
%matplotlib inline

# Imports

In [None]:
# Data handling
import pandas as pd
import numpy as np

import re
import matplotlib.pyplot as plt

# Plotly (online, offline and express flavours)
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import sys
# NOTE(review): newer IPython releases expose display/HTML from IPython.display — confirm before upgrading
from IPython.core.display import display, HTML
# Add the parent directory to the module search path; presumably this is what
# makes the `src` package imported further down resolvable — confirm against the repo layout
sys.path.append('..')
# Initialise plotly's offline mode for notebook rendering
pyoff.init_notebook_mode()

# Filesystem helpers used when scanning the processed-data directory
from os import listdir
from os.path import isfile, join

# NOTE(review): tqdm_notebook appears to be a deprecated alias in recent tqdm releases — confirm, consider tqdm.auto
from tqdm import tqdm_notebook as tqdm

from bokeh.resources import INLINE
import bokeh.io
# NOTE(review): wildcard import; the visible cells only reference bokeh.io and INLINE
from bokeh import *

# In-house project modules
from src.data import make_dataset as md
from src.features import build_features as bf
from src.features import setup as st

import warnings
# Suppress every warning for the rest of the session (no category filter is given)
warnings.filterwarnings('ignore')

# NOTE(review): autoreload is already loaded and set to mode 2 in the first cell,
# so these two lines look redundant
%load_ext autoreload
%autoreload 2

# Package configs

In [None]:
# Do not cap the number of columns shown when a DataFrame is displayed
pd.options.display.max_columns = None

# Inject CSS that stretches the notebook container to the full page width
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

# Alternative pandas plotting backend, currently disabled
# pd.set_option('plotting.backend', 'pandas_bokeh')

# Configure Bokeh to render in the notebook using the INLINE resources
bokeh.io.output_notebook(INLINE)

# Functions

In [None]:
# Quick print of a DataFrame's dimensions
def SZ(df):
    """Print the row and column counts read from ``df.shape``.

    The output is a blank line, a banner line, then one line for the
    row count (``shape[0]``) and one for the column count (``shape[1]``).
    """
    row_count = df.shape[0]
    col_count = df.shape[1]
    print(
        "",
        "--- Dimensão ---",
        f"Linhas:  {row_count}",
        f"Colunas: {col_count}",
        sep="\n",
    )

In [None]:
# Quick print of a DataFrame's dtypes
def DT(df):
    """Print the formatted value of ``df.dtypes`` under a 'DataTypes' banner.

    The output is a blank line, the banner, the dtypes listing, and a final
    line holding four spaces.
    """
    dtype_listing = f"{df.dtypes}"
    print("", "--- DataTypes ---", dtype_listing, "    ", sep="\n")

# Make Dataset

In [None]:
# Configure the dataset maker: 2017-2019 as the training window, 2020 as the test window
dataset = md.MakeDataset(
    train_date_range=('2017-01-01', '2019-12-31'),
    test_date_range=('2020-01-01', '2020-12-31'),
    verbose=True,
)

# Run MakeDataset.make() (per the original note: downloads and preprocesses all datasets)
dataset.make()

In [None]:
%%time
# Calling BuildFeatures class and parsing parameters
builder = bf.BuildFeatures(dataset=dataset, 
                           train_date_range=('2017-01-01','2019-12-31'), 
                           test_date_range=('2020-01-01','2020-12-31'),
                           verbose=True)

# Running the build_features method to prepare all features
builder.build_features()

In [None]:
# Wrap the feature-enriched dataset held by the builder; 2020 is the test window
setup = st.PrepareDataset(
    dataset=builder.dataset,
    test_date_range=('2020-01-01', '2020-12-31'),
)

In [None]:
%%time
cols_dummie = ['dia_semana', 
               'uf', 
               'fase_dia', 
               'sentido_via',
               'condicao_metereologica', 
               'tipo_pista', 
               'tracado_via', 
               'uso_solo',
]

categorical_features = ['dia_semana', 
                        'uf', 
                        'fase_dia', 
                        'sentido_via',
                        'condicao_metereologica', 
                        'tipo_pista', 
                        'tracado_via', 
                        'uso_solo',
                        'em_janela_feriado'
]

numerical_features = ['pessoas', 
                      'risco', 
                      'risco_morte', 
#                       'coordenada_x', 
#                       'coordenada_y', 
#                       'coordenada_z'
]

setup.setup(target_variable='Target',
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            indices=['id'],
            remove_outliers=True,
            remove_outliers_method='pca',
            normalize=True,
            normalize_method='zscore',
            fix_imbalance=False,
            fix_imbalance_method='SMOTENC',
            dummies=cols_dummie,
            )

# Export data

In [None]:
# Print the row and column counts of setup.dataset_train
SZ(setup.dataset_train)

In [None]:
# Print the row and column counts of setup.dataset_validation
SZ(setup.dataset_validation)

In [None]:
# Print the row and column counts of setup.dataset_test
SZ(setup.dataset_test)

In [None]:
# Work out the highest dataset version already saved, so the next export gets a fresh number
dataset_versions_path = '../data/processed'


def find_latest_dataset(file_names):
    """Return ``(file_name, version)`` for the highest-versioned dataset file.

    Only names containing ``'dataset_'`` are considered. The version of a file
    is the first run of digits in its name (``'dataset_train_v12.parquet'`` -> 12).
    Names without any digits are skipped instead of raising.

    Parameters
    ----------
    file_names : iterable of str
        Candidate file names, in any order.

    Returns
    -------
    tuple
        ``(file_name, version)`` of the highest version found; when several
        files share it, the first one encountered is returned.
        ``(None, 0)`` when no file qualifies.
    """
    best_name, best_version = None, 0
    for file_name in file_names:
        if 'dataset_' not in file_name:
            continue
        digit_runs = re.findall(r'\d+', file_name)
        if not digit_runs:
            continue
        version = int(digit_runs[0])
        if best_name is None or version > best_version:
            best_name, best_version = file_name, version
    return best_name, best_version


try:
    dataset_versions_list = [f for f in listdir(dataset_versions_path) if isfile(join(dataset_versions_path, f))]
except FileNotFoundError:
    # No processed directory yet means nothing has been exported so far.
    dataset_versions_list = []

# listdir returns names in arbitrary order, so the maximum version is taken
# explicitly instead of trusting whichever matching file happens to come last.
last_train_dataset, last_version = find_latest_dataset(dataset_versions_list)

print(f'New version will be saved as: v{last_version + 1}')

In [None]:
# Write every split under the same new version number, into the directory that was
# scanned for existing versions (dataset_versions_path), so the scan and the export
# can never point at different locations.
new_version = last_version + 1
splits_to_export = {
    'train': setup.dataset_train,
    'validation': setup.dataset_validation,
    'test': setup.dataset_test,
}
for split_name, split_frame in splits_to_export.items():
    # File names follow dataset_<split>_v<version>.parquet
    output_path = join(dataset_versions_path, f'dataset_{split_name}_v{new_version}.parquet')
    split_frame.to_parquet(output_path, index=False)