## Previsao de Embarque - Sem Serie Temporal

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from IPython.display import display
#from shutil import unpack_archive
#unpack_archive('./df_input.zip', './')

day_of_week_translator = {
    0: "Domingo",
    1: "Segunda",
    2: "Terca",
    3: "Quarta",
    4: "Quinta",
    5: "Sexta",
    6: "Sabado"
}

# Reading Data file (geolocalized)
data = pd.read_csv('./df_input.csv', sep=';', delimiter=';')

busline_filter = 41
data_model = data.loc[data['linha'] == busline_filter]

one_hot_encoder = OneHotEncoder(sparse=False)
encoded_model_data = data_model.copy()
encoded_model_data[['domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado']] = one_hot_encoder.fit_transform(data_model['d_semana'].values.reshape(-1,1))

display(encoded_model_data)

Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado
2418,41,2020-03-01 04:00:00,25,6,8.878852e-01,0.460065,4,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2419,41,2020-03-01 05:00:00,168,6,9.790841e-01,0.203456,5,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2420,41,2020-03-01 06:00:00,310,6,9.976688e-01,-0.068242,6,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2421,41,2020-03-01 07:00:00,339,6,9.422609e-01,-0.334880,7,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2422,41,2020-03-01 08:00:00,286,6,8.169699e-01,-0.576680,8,1,61,3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418148,41,2020-12-31 20:00:00,138,3,-7.308360e-01,0.682553,20,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418149,41,2020-12-31 21:00:00,83,3,-5.195840e-01,0.854419,21,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418150,41,2020-12-31 22:00:00,54,3,-2.697968e-01,0.962917,22,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418151,41,2020-12-31 23:00:00,30,3,-2.449294e-16,1.000000,23,31,366,12,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [2]:
X = encoded_model_data.filter(['semana_do_mes', 'd_ano', 'hour_sin', 'hour_cos', 'd_mes', 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado'], axis=1)
y = encoded_model_data.validations_per_hour
display(X)
display(y)

Unnamed: 0,semana_do_mes,d_ano,hour_sin,hour_cos,d_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado
2418,0,61,8.878852e-01,0.460065,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2419,0,61,9.790841e-01,0.203456,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2420,0,61,9.976688e-01,-0.068242,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2421,0,61,9.422609e-01,-0.334880,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2422,0,61,8.169699e-01,-0.576680,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1418148,4,366,-7.308360e-01,0.682553,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418149,4,366,-5.195840e-01,0.854419,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418150,4,366,-2.697968e-01,0.962917,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418151,4,366,-2.449294e-16,1.000000,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0


2418        25
2419       168
2420       310
2421       339
2422       286
          ... 
1418148    138
1418149     83
1418150     54
1418151     30
1418152      2
Name: validations_per_hour, Length: 6172, dtype: int64

In [3]:
LinearRegressionModel = LinearRegression()
RandomForestModel = RandomForestRegressor(n_jobs=6)

def get_performance(model, X_test, Y_test):
    y_test_predict = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_test_predict)
    rmse = (np.sqrt(mse))
    r2 = r2_score(Y_test, y_test_predict)
    mean = mean_absolute_error(Y_test, y_test_predict)
    mape = mean_absolute_percentage_error(Y_test, y_test_predict)
    performance_scoring = [
        ("R2", r2),
        ("RMSE", rmse),
        ("MAE", mean),
        ("MAPE", mape)
    ]
    performance_scoring = pd.DataFrame(performance_scoring,columns=['Metrica', 'Score'])
    performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
    return performance_scoring

def single_busline_model(model, X, y):
    single_busline_model = model
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
    single_busline_model.fit(X_train, Y_train)
    performance_scoring = get_performance(model, X_test, Y_test)
    return single_busline_model, performance_scoring

In [4]:
print("###  1 Modelo Por Linha (dado completo de treino)")

model_per_line_lr, model_per_line_lr_performance = single_busline_model(LinearRegressionModel, X, y)
model_per_line_rf, model_per_line_rf_performance = single_busline_model(RandomForestModel, X, y)

print('Regressao Linear: \n', model_per_line_lr_performance)
print("\n----------------------------------------------------\n")
print('Random Forest: \n', model_per_line_rf_performance)

print("\nUtilizando sample aleatorio de dado para teste de previsao: ")

sample = X.sample(n=1)
predict_res = model_per_line_lr.predict(sample)
display(data_model.loc[sample.index[0]:sample.index[0]])
print("Regressao Linear -> resultado do predict de test: ", predict_res)

predict_res2 = model_per_line_rf.predict(sample)
print("Random Forest -> resultado do predict de test: ", predict_res2)

###  1 Modelo Por Linha (dado completo de treino)
Regressao Linear: 
   Metrica       Score
0      R2    0.352240
1    RMSE  219.180487
2     MAE  163.128107
3    MAPE    2.029277

----------------------------------------------------

Random Forest: 
   Metrica      Score
0      R2   0.961271
1    RMSE  53.593430
2     MAE  34.437623
3    MAPE   0.323495

Utilizando sample aleatorio de dado para teste de previsao: 


Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes
117209,41,2020-03-22 16:00:00,98,6,-0.942261,-0.33488,16,22,82,3,3


Regressao Linear -> resultado do predict de test:  [151.78167941]
Random Forest -> resultado do predict de test:  [95.18]
