## Previsao de Embarque 
- Simples 
- Sem Serie Temporal
- Regressao Linear
- Random Forest

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression

import warnings

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
#from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV
from IPython.display import display
from pprint import pprint
from math import sqrt

#from shutil import unpack_archive
#unpack_archive('./df_input.zip', './')

## Filtro de Onibus e definicoes de features/target

In [3]:
# Reading Data file (geolocalized)
data = pd.read_csv('./df_input.csv', sep=';', delimiter=';')

busline_filter = 41
data_model = data.copy()

feature_names = [
    'hour_sin', 'hour_cos', 
    'd_mes', 'd_ano', 'mes', 'semana_do_mes', 
    'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 
    'feriado', 'vespera_feriado']

target = 'validations_per_hour'

In [4]:
day_of_week_translator = {
    0: "Domingo",
    1: "Segunda",
    2: "Terca",
    3: "Quarta",
    4: "Quinta",
    5: "Sexta",
    6: "Sabado"
}

feriados = [
            ['Ano Novo', 1, 1],
            ['Carnaval', 24, 2],
            ['Carnaval', 25, 2],
            ['Carnaval', 26, 2],
            ['Dia de São José', 19, 3],
            ['Data Magna', 25, 3],
            ['Sexta-Feira Santa', 10, 4],
            ['Aniversário de Fortaleza', 13, 4],
            ['Tiradentes', 21, 4],
            ['Dia do Trabalho', 1, 5],
            ['Corpus Christi', 28, 5],
            ['N. Senhora da Assunção', 27, 5],
            ['Independência do Brasil', 7, 9],
            ['N. Senhora de Aparecida', 12, 10],
            ['Dia de Finados', 2, 11],
            ['Proclamação da Republica', 15, 11],
            ['Natal', 25, 12],
]

vesperas = [
            ['Ano Novo', 31, 12],
            ['Carnaval', 23, 2],
            ['Dia de São José', 18, 3],
            ['Data Magna', 24, 3],
            ['Sexta-Feira Santa', 9, 4],
            ['Aniversário de Fortaleza', 12, 4],
            ['Tiradentes', 20, 4],
            ['Dia do Trabalho', 30, 4],
            ['N. Senhora da Assunção', 26, 5],
            ['Independência do Brasil', 6, 9],
            ['N. Senhora de Aparecida', 11, 10],
            ['Dia de Finados', 1, 11],
            ['Proclamação da Republica', 14, 11],
            ['Natal', 24, 12],
]

In [None]:
one_hot_encoder = OneHotEncoder(sparse=False)

data_model[['domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado']] = one_hot_encoder.fit_transform(data_model['d_semana'].values.reshape(-1,1))
data_model['feriado'] = [1 if any((x[0] == d and x[1] == m) for (_, d, m) in feriados) else 0 for x in list(zip(data_model.d_mes, data_model.mes))]
data_model['vespera_feriado'] = [1 if any((x[0] == d and x[1] == m) for (_, d, m) in vesperas) else 0 for x in list(zip(data_model.d_mes, data_model.mes))]

line_data_model = data_model.loc[data['linha'] == busline_filter].copy()

X = line_data_model.filter(feature_names, axis=1)
y = line_data_model.validations_per_hour

In [None]:
display(line_data_model)
display(X)
display(y)

---------------------------------------------------------------------------- Fim Setup ----------------------------------------------------------------------------  
# Criacao dos Modelos

In [7]:
def singlebusline_model(model, X, y):
    singlebusline_model = model
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
    singlebusline_model.fit(X_train, Y_train)
    performance_scoring = get_performance(model, X_test, Y_test)
    return singlebusline_model, performance_scoring

def get_performance(model, X_test, Y_test):
    y_test_predict = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_test_predict)
    rmse = (np.sqrt(mse))
    r2 = r2_score(Y_test, y_test_predict)
    mae = mean_absolute_error(Y_test, y_test_predict)
    #mape = mean_absolute_percentage_error(Y_test, y_test_predict)
    performance_scoring = [
        r2,
        rmse,
        mae,
        #mape
    ]
    #performance_scoring = pd.DataFrame(performance_scoring,columns=['Metrica', 'Score'])
    #performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
    return performance_scoring

### Grid Search

In [12]:
LinearRegressionModel = LinearRegression()
RandomForestModel = RandomForestRegressor()

#### Linear Regression Model Parameters

In [13]:
print("Linear Regression  Params ----> ")
pprint(LinearRegressionModel.get_params())

copy_X = [True, False]
fit_intercept = [True, False]
n_jobs = [2, 4, 6, 8]
normalize = [True, False]


# Create the random grid
random_grid_linearregression = {'copy_X': copy_X, 'fit_intercept': fit_intercept, 'n_jobs': n_jobs, 'normalize': normalize}

print("Random Grid - LINEAR REGRESSION")
pprint(random_grid_linearregression)

Linear Regression  Params ----> 
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}
Random Grid - LINEAR REGRESSION
{'copy_X': [True, False],
 'fit_intercept': [True, False],
 'n_jobs': [2, 4, 6, 8],
 'normalize': [True, False]}


#### Random Forest Model Parameters

In [14]:
print("Random Forest Model Params ----> ")
pprint(RandomForestModel.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)]

n_jobs = [2, 4, 6, 8]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid_randomforest = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'n_jobs': n_jobs}

print("Random Grid - RANDOM FOREST")
pprint(random_grid_randomforest)

Random Forest Model Params ----> 
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
Random Grid - RANDOM FOREST
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 128, 237, 346, 455, 564, 673, 782, 891, 1000],
 'n_jobs': [2, 4, 6, 8]}


In [19]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
#LinearRegressionModel = LinearRegression()
#RandomForestModel = RandomForestRegressor()
gridsearch_result_randomforest = RandomizedSearchCV(estimator = RandomForestModel, param_distributions = random_grid_randomforest, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
gridsearch_result_linearregression = RandomizedSearchCV(estimator = LinearRegressionModel, param_distributions = random_grid_linearregression, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)

gridsearch_result_randomforest.fit(X_train, Y_train)
gridsearch_result_linearregression.fit(X_train, Y_train)

print("Random Forest Best Parameters -----> ")
pprint(gridsearch_result_randomforest.best_params_)

print("Linear Regression Best Parameters -----> ")
pprint(gridsearch_result_linearregression.best_params_)

best_random_randomforest = gridsearch_result_randomforest.best_estimator_
best_random_linearregression = gridsearch_result_linearregression.best_estimator_


Fitting 3 folds for each of 32 candidates, totalling 96 fits
Linear Regression Best Parameters -----> 
{'copy_X': True, 'fit_intercept': False, 'n_jobs': 2, 'normalize': True}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.9s finished


In [20]:
sample = X.sample(n=1)
sample

Unnamed: 0,hour_sin,hour_cos,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado,feriado,vespera_feriado
1018497,0.887885,0.460065,7,281,10,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


In [26]:
print("###  1 Modelo Por Linha (dado completo de treino)")

model_per_line_lr, model_per_line_lr_performance = singlebusline_model(LinearRegressionModel, X, y)
model_per_line_rf, model_per_line_rf_performance = singlebusline_model(RandomForestModel, X, y)
print("\n----------------------------------------------------\n")
print('Regressao Linear [DEFAULT]: \n', model_per_line_lr_performance)
print("\n----------------------------------------------------\n")
print('Random Forest [DEFAULT]: \n', model_per_line_rf_performance)
print("\n----------------------------------------------------\n")
print('Regressao Linear [GRID SEARCH]: \n', get_performance(best_random_randomforest, X_test,Y_test))
print("\n----------------------------------------------------\n")
print('Random Forest [GRID SEARCH]: \n', get_performance(best_random_linearregression, X_test,Y_test))
print("\n----------------------------------------------------\n")

print("\nUtilizando sample aleatorio de dado para teste de previsao: ")

predict_res = model_per_line_lr.predict(sample)

print("Regressao Linear -> resultado do predict de test: ", predict_res)

predict_res2 = model_per_line_rf.predict(sample)
print("Random Forest -> resultado do predict de test: ", predict_res2)

random_res = best_random_randomforest.predict(sample)
print("Random Forest Best Grid Searched -> resultado do predict de test: ", predict_res2)

random_res = best_random_linearregression.predict(sample)
print("Linear Regression Best Grid Searched -> resultado do predict de test: ", predict_res2)

display(data_model.loc[sample.index[0]:sample.index[0]])

###  1 Modelo Por Linha (dado completo de treino)
Regressao Linear [DEFAULT]: 
 [0.3724476627164204, 215.73460401278615, 161.58880229914652]

----------------------------------------------------

Random Forest [DEFAULT]: 
 [0.9772059277834443, 41.115520837086784, 29.505570859841917]
Regressao Linear [GRID SEARCH]: 
 [0.9770620467299195, 41.24508168047433, 29.59751188904919]

----------------------------------------------------

Random Forest [GRID SEARCH]: 
 [0.37244766271642116, 215.734604012786, 161.58880229914632]

Utilizando sample aleatorio de dado para teste de previsao: 
Regressao Linear -> resultado do predict de test:  [325.31825494]
Random Forest -> resultado do predict de test:  [48.83]
Random Forest Best Grid Searched -> resultado do predict de test:  [48.83]
Linear Regression Best Grid Searched -> resultado do predict de test:  [48.83]


Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado,feriado,vespera_feriado
1018497,41,2020-10-07 04:00:00,47,2,0.887885,0.460065,4,7,281,10,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0


## BAGGING

In [10]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
warnings.filterwarnings("ignore") # To ignore warnings
n_jobs = -1 # This parameter conrols the parallel processing. -1 means using all processors.
random_state = 42 # This parameter controls the randomness of the data. Using some int value to get same results everytime this code is run
models_scores = [] # To store model scores

def rmse(model):
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    
    return mean_squared_error(Y_test, y_pred, squared= False) # squared= False > returns Root Mean Square Error                  

def bagging_predictions(estimator):
    """
    I/P
    estimator: The base estimator from which the ensemble is grown.
    O/P
    br_y_pred: Predictions on test data for the base estimator.
    
    """
    regr = BaggingRegressor(base_estimator=estimator,
                            n_estimators=10,
                            max_samples=1.0,
                            bootstrap=True, # Samples are drawn with replacement
                            n_jobs= n_jobs,
                            random_state=random_state).fit(X_train, Y_train)

    br_y_pred = regr.predict(X_test)

    rmse_val = mean_squared_error(Y_test, br_y_pred, squared= False) # squared= False > returns Root Mean Square Error   

    print(f'RMSE for base estimator {regr.base_estimator_} = {rmse_val}\n')
    return br_y_pred

linear_regression = make_pipeline(LinearRegression())
rf_regressor = make_pipeline(RandomForestRegressor())
#rf_regressor = make_pipeline(RandomForestRegressor())

predictions = np.column_stack((bagging_predictions(linear_regression),
                              bagging_predictions(rf_regressor),))

print(f"Bagged predictions shape: {predictions.shape}")
       
y_pred = np.mean(predictions, axis=1)
print("Aggregated predictions (y_pred) shape", y_pred.shape)

rmse_val = mean_squared_error(Y_test, y_pred, squared= False) # squared= False > returns Root Mean Square Error   
models_scores.append(['Bagging', rmse_val])

print(f'\nBagging RMSE= {rmse_val}')

RMSE for base estimator Pipeline(memory=None,
         steps=[('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False) = 215.75112692985846

RMSE for base estimator Pipeline(memory=None,
         steps=[('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                    

# STACKING

In [15]:
estimators = [ ('random_forest', rf_regressor), ('linear_reg', linear_regression)]

stack = StackingRegressor(estimators=estimators, final_estimator= rf_regressor, cv= 5, n_jobs= n_jobs, passthrough = True)

stack.fit(X_train, Y_train)

pred = stack.predict(X_test)

rmse_val = mean_squared_error(Y_test, pred, squared= False) # squared= False > returns Root Mean Square Error    
models_scores.append(['Stacking', rmse_val])
print(f'rmse= {rmse_val}')

rmse= 40.124796479651025


# BOOSTING

In [16]:
gradient_boosting_regressor= GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state = random_state)

score = rmse(gradient_boosting_regressor)
models_scores.append(['GradientBoostingRegressor', score])
print(f'GradientBoostingRegressor Score= {score}')

GradientBoostingRegressor Score= 45.585891583534995


## Ranking Linhas de Onibus

In [11]:
#data_model
#data_model.loc[data['linha'] == busline_filter].copy()
print(data_model.linha.value_counts(ascending=True)[data_model['linha'].value_counts() < 100])
linha_lista = data_model.linha.unique()

#linha_lista = linha_lista[:5]
res_map = list()
for linha in linha_lista:
    #print(linha) it is working
    currentLinhaData = data_model.loc[data['linha'] == linha].copy()
    #print(len(currentLinhaData))
    if len(currentLinhaData) < 10:
        continue
    X = currentLinhaData.filter(['hour_sin', 'hour_cos', 
    'd_mes', 'd_ano', 'mes', 'semana_do_mes', 
    'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado', 
    'feriado', 'vespera_feriado'], axis=1)
    y = currentLinhaData.validations_per_hour

    #FAZER TODOS OS MODELOS PARA AS LINHAS E ADICIONAR NA TABELA PRINCIPAL
    #RANDOM FOREST COM GRID SEARCH
    #LINEAR REGRESSION COM GRID SEARCH
    #BAGGING STACKING AND BOOSTING
    #
    model, performance = singlebusline_model(RandomForestModel, X, y)
    performance.insert(0, linha)
    res_map.append(performance)

res_map = pd.DataFrame(res_map,columns=['Linha', '[RF][GS]R2', '[RF][GS]RMSE', '[RF][GS]MAE',
    '[LR][GS]R2', '[LR][GS]RMSE', '[LR][GS]MAE',
    '[BAG][GS]R2', '[BAG][GS]RMSE', '[BAG][GS]MAE',
    '[STK][GS]R2', '[STK][GS]RMSE', '[STK][GS]MAE',
    '[BOS][GS]R2', '[BOS][GS]RMSE', '[BOS][GS]MAE' ])

#performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
res_map = res_map.sort_values('R2').reset_index(drop=True)
res_map

941     1
233     1
811     1
202     1
935     1
       ..
140    69
40     76
999    79
91     81
33     84
Name: linha, Length: 63, dtype: int64


Unnamed: 0,Linha,R2,RMSE,MAE
0,999,-36.631640,70.745044,41.141250
1,145,-7.249889,6.259950,5.577500
2,47,-3.948501,2.558021,1.709091
3,413,-3.481755,26.462695,21.495000
4,138,-3.076205,12.875002,12.593333
...,...,...,...,...
353,38,0.976998,30.804185,20.014910
354,757,0.977344,30.453935,20.463345
355,345,0.978740,14.837540,9.744331
356,660,0.979762,27.644751,17.924536


## Avaliacao dos dados atualmente
404 linhas  
358 com pelo menos 10 exemplos



In [12]:
res_map.to_csv('./classic-linha-comp.csv', index = False)