## Previsao de Embarque 
- Simples 
- Sem Serie Temporal
- Regressao Linear
- Random Forest
- Com Grid Search
- Bagging, Stacking e Boosting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression

import warnings

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
#from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV
from IPython.display import display
from pprint import pprint
from math import sqrt

#from shutil import unpack_archive
#unpack_archive('./df_input.zip', './')

## Filtro de Onibus e definicoes de features/target

In [2]:
# Reading Data file (geolocalized)
data = pd.read_csv('./df_input.csv', sep=';', delimiter=';')

busline_filter = 41
data_model = data.copy()

#d_ano,  estao disponiveis para inserir once eu tive mais de 1 ano de dados
feature_names = [
    'hour_sin', 'hour_cos', 
    'd_mes', 'd_ano', 'mes', 'semana_do_mes', 
    'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 
    'feriado', 'vespera_feriado']

target = 'validations_per_hour'

In [3]:
day_of_week_translator = {
    0: "Domingo",
    1: "Segunda",
    2: "Terca",
    3: "Quarta",
    4: "Quinta",
    5: "Sexta",
    6: "Sabado"
}

feriados = [
            ['Ano Novo', 1, 1],
            ['Carnaval', 24, 2],
            ['Carnaval', 25, 2],
            ['Carnaval', 26, 2],
            ['Dia de São José', 19, 3],
            ['Data Magna', 25, 3],
            ['Sexta-Feira Santa', 10, 4],
            ['Aniversário de Fortaleza', 13, 4],
            ['Tiradentes', 21, 4],
            ['Dia do Trabalho', 1, 5],
            ['Corpus Christi', 28, 5],
            ['N. Senhora da Assunção', 27, 5],
            ['Independência do Brasil', 7, 9],
            ['N. Senhora de Aparecida', 12, 10],
            ['Dia de Finados', 2, 11],
            ['Proclamação da Republica', 15, 11],
            ['Natal', 25, 12],
]

vesperas = [
            ['Ano Novo', 31, 12],
            ['Carnaval', 23, 2],
            ['Dia de São José', 18, 3],
            ['Data Magna', 24, 3],
            ['Sexta-Feira Santa', 9, 4],
            ['Aniversário de Fortaleza', 12, 4],
            ['Tiradentes', 20, 4],
            ['Dia do Trabalho', 30, 4],
            ['N. Senhora da Assunção', 26, 5],
            ['Independência do Brasil', 6, 9],
            ['N. Senhora de Aparecida', 11, 10],
            ['Dia de Finados', 1, 11],
            ['Proclamação da Republica', 14, 11],
            ['Natal', 24, 12],
]

In [4]:
one_hot_encoder = OneHotEncoder(sparse=False)

data_model[['domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado']] = one_hot_encoder.fit_transform(data_model['d_semana'].values.reshape(-1,1))
data_model['feriado'] = [1 if any((x[0] == d and x[1] == m) for (_, d, m) in feriados) else 0 for x in list(zip(data_model.d_mes, data_model.mes))]
data_model['vespera_feriado'] = [1 if any((x[0] == d and x[1] == m) for (_, d, m) in vesperas) else 0 for x in list(zip(data_model.d_mes, data_model.mes))]

top100_linhas = data_model.linha.value_counts().index[:100]
top100_linhas_data_model = data_model[data_model.linha.isin(top100_linhas)].loc[data['mes'] != 1]

line_data_model = data_model.loc[data['linha'] == busline_filter].loc[data['mes'] != 1].copy()

In [5]:
top100_linhas_data_model.to_csv('./top100.csv', index=False)
top100_linhas_data_model

Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado,feriado,vespera_feriado
0,1,2020-03-01 05:00:00,39,6,9.790841e-01,0.203456,5,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
1,1,2020-03-01 06:00:00,233,6,9.976688e-01,-0.068242,6,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
2,1,2020-03-01 07:00:00,258,6,9.422609e-01,-0.334880,7,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
3,1,2020-03-01 08:00:00,238,6,8.169699e-01,-0.576680,8,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
4,1,2020-03-01 09:00:00,248,6,6.310879e-01,-0.775711,9,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420557,920,2020-12-31 19:00:00,94,3,-8.878852e-01,0.460065,19,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1
1420558,920,2020-12-31 20:00:00,49,3,-7.308360e-01,0.682553,20,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1
1420559,920,2020-12-31 21:00:00,20,3,-5.195840e-01,0.854419,21,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1
1420560,920,2020-12-31 22:00:00,24,3,-2.697968e-01,0.962917,22,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1


In [6]:
#display(X)

In [7]:
#display(y)

---------------------------------------------------------------------------- Fim Setup ----------------------------------------------------------------------------  
# Criacao dos Modelos

In [8]:
model_list = []

In [9]:
def get_performance(model):
    y_test_predict = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_test_predict)
    rmse = (np.sqrt(mse))
    r2 = r2_score(Y_test, y_test_predict)
    mae = mean_absolute_error(Y_test, y_test_predict)
    #mape = mean_absolute_percentage_error(Y_test, y_test_predict)
    performance_scoring = [
        r2,
        rmse,
        mae,
        #mape
    ]
    #performance_scoring = pd.DataFrame(performance_scoring,columns=['Metrica', 'Score'])
    #performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
    return performance_scoring

### Grid Search

In [10]:
LinearRegressionModel = LinearRegression()
RandomForestModel = RandomForestRegressor()

#### Linear Regression Model Parameters

In [11]:
# print("Linear Regression  Params ----> ")
# pprint(LinearRegressionModel.get_params())

copy_X = [True, False]
fit_intercept = [True, False]
n_jobs = [2, 4, 6, 8]
normalize = [True, False]

# Create the random grid
random_grid_linearregression = {'copy_X': copy_X, 'fit_intercept': fit_intercept, 'n_jobs': n_jobs, 'normalize': normalize}

#### Random Forest Model Parameters

In [12]:
# print("Random Forest Model Params ----> ")
# pprint(RandomForestModel.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)]

n_jobs = [2, 4, 6, 8]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid_randomforest = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'n_jobs': n_jobs}

In [13]:
X = line_data_model.filter(feature_names, axis=1)
y = line_data_model.validations_per_hour

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
LinearRegressionModel = LinearRegression()
RandomForestModel = RandomForestRegressor()
gridsearch_result_randomforest = RandomizedSearchCV(estimator = RandomForestModel, param_distributions = random_grid_randomforest, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
gridsearch_result_linearregression = RandomizedSearchCV(estimator = LinearRegressionModel, param_distributions = random_grid_linearregression, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

gridsearch_result_randomforest.fit(X_train, Y_train)
gridsearch_result_linearregression.fit(X_train, Y_train)

print("Random Forest Best Parameters -----> ")
pprint(gridsearch_result_randomforest.best_params_)

print("Linear Regression Best Parameters -----> ")
pprint(gridsearch_result_linearregression.best_params_)

gridsearched_random_randomforest = gridsearch_result_randomforest.best_estimator_
gridsearch_random_linearregression = gridsearch_result_linearregression.best_estimator_

default_linearregression_model = LinearRegressionModel.fit(X_train, Y_train)
default_randomforest_model = RandomForestModel.fit(X_train, Y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Random Forest Best Parameters -----> 
{'bootstrap': True,
 'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 237,
 'n_jobs': 2}
Linear Regression Best Parameters -----> 
{'copy_X': True, 'fit_intercept': False, 'n_jobs': 2, 'normalize': True}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.9s finished


In [14]:
sample = X.sample(n=1)
sample

Unnamed: 0,hour_sin,hour_cos,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado,feriado,vespera_feriado
317922,-0.631088,-0.775711,6,127,5,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


In [15]:
print("###  1 Modelo Por Linha (dado completo de treino)")

performance_default_linearregression_model = get_performance(default_linearregression_model)
performance_default_randomforest_model = get_performance(default_randomforest_model)
performance_gridsearched_linearregression_model = get_performance(gridsearch_random_linearregression)
performance_gridsearched_randomforest_model = get_performance(gridsearched_random_randomforest)

print("\n----------------------------------------------------\n")
print('Regressao Linear [DEFAULT]: \n', performance_default_linearregression_model)
print("\n----------------------------------------------------\n")
print('Random Forest [DEFAULT]: \n', performance_default_randomforest_model)
print("\n----------------------------------------------------\n")
print('Regressao Linear [GRID SEARCH]: \n', performance_gridsearched_linearregression_model)
print("\n----------------------------------------------------\n")
print('Random Forest [GRID SEARCH]: \n', performance_gridsearched_randomforest_model)
print("\n----------------------------------------------------\n")

print("\nUtilizando sample aleatorio de dado para teste de previsao: ")

predict_res = default_linearregression_model.predict(sample)

print("Regressao Linear -> resultado do predict de test: ", predict_res)

predict_res2 = default_randomforest_model.predict(sample)
print("Random Forest -> resultado do predict de test: ", predict_res2)

random_res = gridsearch_random_linearregression.predict(sample)
print("Regressao Linear Best Grid Searched -> resultado do predict de test: ", predict_res2)

random_res = gridsearched_random_randomforest.predict(sample)
print("Random Forest Best Grid Searched -> resultado do predict de test: ", predict_res2)

display(data_model.loc[sample.index[0]:sample.index[0]])

print('LINHA UTILIZADA PARA OS TESTES: ', busline_filter)

performance_total = [performance_default_linearregression_model,
    performance_default_randomforest_model,
    performance_gridsearched_linearregression_model,
    performance_gridsearched_randomforest_model]

performance_total = pd.DataFrame.from_records(performance_total, columns=['R2', 'RMSE', 'MAE'], 
    index=['LinearReg Default', 'RandomForest Default', 'LinearReg GridSearched', 'RandomForest GridSearched'])
performance_total

###  1 Modelo Por Linha (dado completo de treino)

----------------------------------------------------

Regressao Linear [DEFAULT]: 
 [0.38108258495687986, 212.46178497824638, 158.53020063087848]

----------------------------------------------------

Random Forest [DEFAULT]: 
 [0.9769859087291921, 40.96956850002502, 29.60537519760941]

----------------------------------------------------

Regressao Linear [GRID SEARCH]: 
 [0.3810825849568801, 212.46178497824636, 158.53020063087783]

----------------------------------------------------

Random Forest [GRID SEARCH]: 
 [0.9771209888469143, 40.84915708265401, 29.76700103065193]

----------------------------------------------------


Utilizando sample aleatorio de dado para teste de previsao: 
Regressao Linear -> resultado do predict de test:  [493.01329453]
Random Forest -> resultado do predict de test:  [256.64]
Regressao Linear Best Grid Searched -> resultado do predict de test:  [256.64]
Random Forest Best Grid Searched -> resultado do

Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado,feriado,vespera_feriado
317922,41,2020-05-06 14:00:00,268,2,-0.631088,-0.775711,14,6,127,5,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


LINHA UTILIZADA PARA OS TESTES:  41


Unnamed: 0,R2,RMSE,MAE
LinearReg Default,0.381083,212.461785,158.530201
RandomForest Default,0.976986,40.969569,29.605375
LinearReg GridSearched,0.381083,212.461785,158.530201
RandomForest GridSearched,0.977121,40.849157,29.767001


## BAGGING

In [16]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
warnings.filterwarnings("ignore") # To ignore warnings
n_jobs = -1 # This parameter conrols the parallel processing. -1 means using all processors.
random_state = 42 # This parameter controls the randomness of the data. Using some int value to get same results everytime this code is run
models_scores = [] # To store model scores               

def bagging_model(estimator):
    """
    I/P
    estimator: The base estimator from which the ensemble is grown.
    O/P
    br_y_pred: Predictions on test data for the base estimator.
    
    """
    regr = BaggingRegressor(base_estimator=estimator,
                            n_estimators=10,
                            max_samples=1.0,
                            bootstrap=True, # Samples are drawn with replacement
                            n_jobs= n_jobs,
                            random_state=random_state).fit(X_train, Y_train)

    br_y_pred = regr.predict(X_test)

    performance = get_performance(regr)
    
    print(f'Performance for base estimator {regr.base_estimator_} = {performance}\n')

    return regr, performance

LinearRegressionModelPipeline = make_pipeline(LinearRegression())
RandomForestModelPipeline = make_pipeline(RandomForestRegressor())

bagging_lr_model, performance_bagging_lr_model = bagging_model(LinearRegressionModelPipeline)
bagging_rf_model, performance_bagging_rf_model = bagging_model(RandomForestModelPipeline)

df2 = pd.DataFrame([performance_bagging_lr_model, performance_bagging_rf_model], columns=['R2','RMSE','MAE'], 
    index=['LinearRegression Bagging', 'RandomForest Bagging'])
performance_total = pd.concat([df2, performance_total])


Performance for base estimator Pipeline(memory=None,
         steps=[('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False) = [0.38046972703661563, 212.56694979259984, 158.72459287274364]

Performance for base estimator Pipeline(memory=None,
         steps=[('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                 

# STACKING

In [17]:

estimators = [ ('random_forest', RandomForestModelPipeline), ('linear_reg', LinearRegressionModelPipeline)]

stack = StackingRegressor(estimators=estimators, final_estimator= RandomForestModelPipeline, cv= 5, n_jobs= n_jobs, passthrough = True)

stack.fit(X_train, Y_train)

performance = get_performance(stack)

df2 = pd.DataFrame([performance], columns=['R2','RMSE','MAE'], 
    index=['Stacking Regressors'])

performance_total = pd.concat([df2, performance_total])


# BOOSTING

In [18]:
gradient_boosting_regressor= GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state = random_state)

gradient_boosting_regressor.fit(X_train, Y_train)

performance = get_performance(gradient_boosting_regressor)

df2 = pd.DataFrame([performance], columns=['R2','RMSE','MAE'], 
    index=['GradientBoostingRegressor'])

performance_total = pd.concat([df2, performance_total])


In [19]:
print('PERFORMANCE DOS MODELOS PARA A LINHA: ', busline_filter)
performance_total.sort_values('R2', ascending=False)

PERFORMANCE DOS MODELOS PARA A LINHA:  41


Unnamed: 0,R2,RMSE,MAE
RandomForest GridSearched,0.977121,40.849157,29.767001
RandomForest Default,0.976986,40.969569,29.605375
Stacking Regressors,0.976485,41.413393,29.594434
RandomForest Bagging,0.976057,41.78809,29.900033
GradientBoostingRegressor,0.973293,44.1341,31.719544
LinearReg GridSearched,0.381083,212.461785,158.530201
LinearReg Default,0.381083,212.461785,158.530201
LinearRegression Bagging,0.38047,212.56695,158.724593


### Todos os modelos treinados dentro de *model_list*

In [20]:
model_list = [
    #default_linearregression_model,
    default_randomforest_model,
    #gridsearch_result_linearregression,
    #gridsearch_result_randomforest,
    #bagging_lr_model,
    bagging_rf_model,
    stack,
    #gradient_boosting_regressor
]

## Treinar com 1 mes, prever N semanas

In [33]:
# Jan, Fev, Mar, Abril, Maio
# treina com mar, preve 2 semanas de abril

line_mes_data_model = line_data_model[line_data_model.mes == 3]

line_mes_data_model_predict = line_data_model[line_data_model.mes == 4]

df_prev = line_mes_data_model_predict[line_mes_data_model_predict.semana_do_mes <=  2].filter(feature_names, axis = 1)
real_values = line_mes_data_model_predict[line_mes_data_model_predict.semana_do_mes <=  2].filter('validations_per_hour', axis = 1)

X = line_mes_data_model.filter(feature_names, axis=1)
y = line_mes_data_model.validations_per_hour
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

predictions = list()
performances = list()
df_prev
for i, model in enumerate(model_list):
        model.fit(X_train, Y_train)
        performances.append(get_performance(model))

        for index, row in df_prev.iterrows():
            predictions.append(model.predict([row])[0])
        
df_performance = pd.DataFrame(performances)
df_performance.to_csv('./performance_1mes(mar)_2semana(abr)_predict.csv', index=False)

In [34]:
predictions

[40.558,
 40.86833333333333,
 345.28,
 594.56,
 540.9,
 342.07,
 303.18,
 300.25,
 327.14,
 359.32,
 329.91,
 344.22,
 389.63,
 521.29,
 667.93,
 569.14,
 371.5,
 221.03,
 189.63,
 167.75,
 40.558,
 45.635,
 43.6545,
 364.45,
 657.14,
 603.15,
 379.94,
 334.94,
 333.18,
 352.74,
 381.43,
 356.02,
 361.66,
 417.2,
 565.28,
 710.11,
 609.62,
 377.07,
 223.45,
 185.85,
 172.73,
 45.635,
 54.78416666666668,
 47.09283333333333,
 426.0,
 700.47,
 645.71,
 418.82,
 364.27,
 356.38,
 375.34,
 413.16,
 374.11,
 387.03,
 448.34,
 607.3,
 754.99,
 636.76,
 399.89,
 231.2,
 186.83,
 171.89,
 54.78416666666668,
 45.2055,
 38.02583333333334,
 285.47,
 501.48,
 491.29,
 341.21,
 311.33,
 302.6,
 336.81,
 373.46,
 324.42,
 340.28,
 372.46,
 418.81,
 442.16,
 332.72,
 325.76,
 177.96,
 152.43,
 146.08,
 45.2055,
 54.06033333333334,
 182.34,
 234.42,
 251.47,
 187.96,
 177.42,
 156.34,
 175.15,
 205.46,
 178.2,
 213.92,
 228.66,
 253.9,
 282.91,
 257.63,
 281.52,
 171.27,
 156.01,
 123.91,
 54.060333333

In [37]:
plt.figure(figsize=(30, 10))
plt.plot(range(len(real_values)), real_values, color='blue')
plt.plot(range(len(predictions)), predictions, color='red')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

ZeroDivisionError: integer division or modulo by zero

### Treinar com (N-Y) meses, Prever mes (N)

In [None]:
#X_train, Y_train
#X_test, Y_test
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
# X = line_data_model.filter(feature_names, axis=1)
# y = line_data_model.validations_per_hour

#quantos meses (comecando do final do dado) vou prever

# model_list = [
#     default_linearregression_model,
#     default_randomforest_model,
#     gridsearch_result_linearregression,
#     gridsearch_result_randomforest,
#     bagging_lr_model,
#     bagging_rf_model,
#     stack,
#     gradient_boosting_regressor
# ]

for hzp in range(4, 8):
    horizonte_de_previsao = hzp
    meses = line_data_model.mes.sort_values().unique()
    horizonte_de_treinamento = meses[:meses.size - horizonte_de_previsao]
    meses_de_previsao = meses[meses.size - horizonte_de_previsao:]

    predictions = [[],[],[],[],[],[],[],[]]
    performances = [[],[],[],[],[],[],[],[]]

    df_treinamento = line_data_model[line_data_model.mes.isin(horizonte_de_treinamento)].copy()
    df_previsao = line_data_model[line_data_model.mes.isin(meses_de_previsao)].copy()
    df_prev = df_previsao.filter(feature_names, axis = 1)
    df_results = df_previsao.filter('validations_per_hour', axis = 1)

    print(df_results.__len__())

    X = df_treinamento.filter(feature_names, axis=1)
    y = df_treinamento.validations_per_hour
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

    for i, model in enumerate(model_list):
        model.fit(X_train, Y_train)
        performances[i].append(get_performance(model))

        for index, row in df_prev.iterrows():
            predictions[i].append(model.predict([row])[0])
    
    df_results = df_previsao.validations_per_hour

    df_performance = pd.DataFrame(performances)
    df_performance.to_csv('./performance_'+str(hzp)+'_meses.csv', index=False)

    plt_size = hzp * 360

    plt.figure(figsize=(30, 10))
    plt.plot(range(len(df_results) - plt_size), df_results[plt_size:], color='blue')
    plt.plot(range(len(predictions[2]) - plt_size ), predictions[2][plt_size:], color='red')
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.show()

#print(predictions[0])


In [None]:
plt.figure(figsize=(30, 10))
plt.plot(range(len(df_results) - plt_size), df_results[plt_size:], color='blue')
plt.plot(range(len(predictions[2]) - plt_size ), predictions[0][plt_size:], color='red')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

In [None]:
performances

In [None]:
df = pd.DataFrame(predictions)
df.to_csv('predictions0.csv', index=False)

In [None]:
# ACCESS IN ORDER PREDICTION VALUES
df.iloc[0]

In [None]:
# #top 100 exampled linhas
# #data_model.linha.value_counts().index[:100]

# print(data_model.linha.value_counts().index[:100])

d_31 = [1, 3, 5, 7, 8, 10, 12]
d_30 = [4, 6, 9, 11]

from datetime import datetime
import datetime
import calendar

def week_of_month(tgtdate):
    tgtdate = tgtdate.to_pydatetime()
    startdate = 0

    days_this_month = calendar.mdays[tgtdate.month]
    for i in range(1, days_this_month):
        d = datetime.datetime(tgtdate.year, tgtdate.month, i)
        if d.day - d.weekday() > 0:
            startdate = d
            break
    # now we canuse the modulo 7 appraoch
    return (tgtdate - startdate).days //7 + 1

# #ate outubro para prever novembro
# for linha in top100_linhas:
#     #linha filter
#     currentLinhaData = data_model[data_model.linha == linha]
#     print(currentLinhaData)
#     for mes in range(3, 12):
#         currentLinhaData = currentLinhaData[currentLinhaData.mes == mes]

#         if mes in d_31:
#             d_d = 31
#         elif mes in d_30:
#             d_d = 30
#         else:
#             d_d = 29
        
#         for dia in range(1, d_d):
#             currentLinhaData = currentLinhaData[currentLinhaData.d_mes == dia]
#             for hora in range(24):
#                 currentLinhaData = currentLinhaData[currentLinhaData.hora == hora]
#                 if currentLinhaData.empty:
#                     # feature_names = [
#                     # 'hour_sin', 'hour_cos', 
#                     # 'd_mes', 'mes', 'semana_do_mes', 
#                     # 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 
#                     # 'feriado', 'vespera_feriado']
#                     # target = 'validations_per_hour'

#                     # a_row = pd.Series([1, 2])
#                     # df = pd.DataFrame([[3, 4], [5, 6]])
#                     # row_df = pd.DataFrame([a_row])
#                     # df = pd.concat([row_df, df], ignore_index=True)

#                     h_sin = np.sin(2 * np.pi * hora/23.0)
#                     h_cos = np.cos(2 * np.pi * hora/23.0)
#                     semana_do_mes = (dia-1) // 7 + 1
#                     #dia do ano
#                     ins = pd.Series([linha, data_hora, 0, h_sin, h_cos, dia, mes, semana_do_mes *DIASDASEMANAENCODED, *feriado, *vesperaferiado])
#                     ins_df = pd.DataFrame([ins])

#                     currentLinhaData = pd.concat([ins_df, currentLinhaData], ignore_index=True)
#                     print(f"INSERINDO MISSING HORA {hora} no dia {dia} do mes {mes}")

# t = data_model[data_model.linha.isin(data_model.linha.value_counts().index[:100])]
# t

## Ranking Linhas de Onibus

In [None]:

# print(data_model.linha.value_counts(ascending=True)[data_model['linha'].value_counts() < 100])

# linha_lista = data_model.linha.unique()
# linha_lista = linha_lista[:10]
res_map = list()
for linha in top100_linhas:
    for model in model_list:
        currentLinhaData = data_model.loc[data['linha'] == linha].copy()
        if len(currentLinhaData) < 10:
            continue
        # X = currentLinhaData.filter(['hour_sin', 'hour_cos', 
        # 'd_mes', 'd_ano', 'mes', 'semana_do_mes', 
        # 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 
        # 'feriado', 'vespera_feriado'], axis=1)
        # y = currentLinhaData.validations_per_hour

        #FAZER TODOS OS MODELOS PARA AS LINHAS E ADICIONAR NA TABELA PRINCIPAL
        #RANDOM FOREST COM GRID SEARCH
        #LINEAR REGRESSION COM GRID SEARCH
        #BAGGING STACKING AND BOOSTING
        #
        model.fit(X_train, Y_train)
        performance = get_performance(model)
        performance.insert(0, linha)
        res_map.append(performance)



In [None]:
res_map[res_map.Linha == 213]
res_map.sort_values('MAE').reset_index(drop=True)

In [None]:
# res_map = pd.DataFrame(res_map,columns=['Linha', 
#     # '[LR][DEF]R2', '[LR][DF]RMSE', '[LR][DF]MAE',
#     '[RF][GS]R2', '[RF][GS]RMSE', '[RF][GS]MAE',
#     # '[LR][BAG]R2', '[LR][BAG]RMSE', '[LR][BAG]MAE',
#     '[RF][BAG]R2', '[RF][BAG]RMSE', '[RF][BAG]MAE',
#     '[STK]R2', '[STK]RMSE', '[STK]MAE',
#     # '[BOS]R2', '[BOS]RMSE', '[BOS]MAE' 
#     ])
res_map = pd.DataFrame(res_map,columns=['Linha','R2', 'RMSE', 'MAE'])

#performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
res_map = res_map.sort_values('R2').reset_index(drop=True)
res_map.to_csv('./top100linhas_rank.csv', index=False)
res_map

## Avaliacao dos dados atualmente
404 linhas  
358 com pelo menos 10 exemplos



In [None]:
# res_map.to_csv('./classic-linha-comp.csv', index = False)