## Previsao de Embarque - Sem Serie Temporal

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from IPython.display import display
#from shutil import unpack_archive
#unpack_archive('./df_input.zip', './')

day_of_week_translator = {
    0: "Domingo",
    1: "Segunda",
    2: "Terca",
    3: "Quarta",
    4: "Quinta",
    5: "Sexta",
    6: "Sabado"
}

# Reading Data file (geolocalized)
data = pd.read_csv('./df_input.csv', sep=';', delimiter=';')

print('Quantidade de Linhas de Onibus', len(data.linha.unique()))

busline_filter = 41
model_data = data.loc[data['linha'] == busline_filter]
model_data

Quantidade de Linhas de Onibus 404


Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes
2418,41,2020-03-01 04:00:00,25,6,8.878852e-01,0.460065,4,1,61,3,0
2419,41,2020-03-01 05:00:00,168,6,9.790841e-01,0.203456,5,1,61,3,0
2420,41,2020-03-01 06:00:00,310,6,9.976688e-01,-0.068242,6,1,61,3,0
2421,41,2020-03-01 07:00:00,339,6,9.422609e-01,-0.334880,7,1,61,3,0
2422,41,2020-03-01 08:00:00,286,6,8.169699e-01,-0.576680,8,1,61,3,0
...,...,...,...,...,...,...,...,...,...,...,...
1418148,41,2020-12-31 20:00:00,138,3,-7.308360e-01,0.682553,20,31,366,12,4
1418149,41,2020-12-31 21:00:00,83,3,-5.195840e-01,0.854419,21,31,366,12,4
1418150,41,2020-12-31 22:00:00,54,3,-2.697968e-01,0.962917,22,31,366,12,4
1418151,41,2020-12-31 23:00:00,30,3,-2.449294e-16,1.000000,23,31,366,12,4


In [2]:
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_model_data = model_data.copy()
encoded_model_data[['domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado']] = one_hot_encoder.fit_transform(model_data['d_semana'].values.reshape(-1,1))

X = encoded_model_data.filter(['semana_do_mes','linha', 'd_ano', 'hora', 'hour_sin', 'hour_cos', 'd_mes', 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado'], axis=1)
y = encoded_model_data.validations_per_hour
display(X)
display(y)

Unnamed: 0,semana_do_mes,linha,d_ano,hora,hour_sin,hour_cos,d_mes,domingo,segunda,terca,quarta,quinta,sexta,sabado
2418,0,41,61,4,8.878852e-01,0.460065,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2419,0,41,61,5,9.790841e-01,0.203456,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2420,0,41,61,6,9.976688e-01,-0.068242,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2421,0,41,61,7,9.422609e-01,-0.334880,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2422,0,41,61,8,8.169699e-01,-0.576680,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418148,4,41,366,20,-7.308360e-01,0.682553,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418149,4,41,366,21,-5.195840e-01,0.854419,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418150,4,41,366,22,-2.697968e-01,0.962917,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1418151,4,41,366,23,-2.449294e-16,1.000000,31,0.0,0.0,0.0,1.0,0.0,0.0,0.0


2418        25
2419       168
2420       310
2421       339
2422       286
          ... 
1418148    138
1418149     83
1418150     54
1418151     30
1418152      2
Name: validations_per_hour, Length: 6172, dtype: int64

In [3]:
LinearRegressionModel = LinearRegression()
RandomForestModel = RandomForestRegressor(n_jobs=6)

def get_performance(model, X_test, Y_test):
    y_test_predict = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_test_predict)
    rmse = (np.sqrt(mse))
    r2 = r2_score(Y_test, y_test_predict)
    mean = mean_absolute_error(Y_test, y_test_predict)
    mape = mean_absolute_percentage_error(Y_test, y_test_predict)
    performance_scoring = [
        ("MSE", mse),
        ("RMSE", rmse),
        ("R2", r2),
        ("MAE", mean),
        ("MAPE", mape)
    ]
    performance_scoring = pd.DataFrame(performance_scoring,columns=['Metrica', 'Score'])
    performance_scoring['Score'] = performance_scoring['Score'].astype('float64')
    return performance_scoring

def single_busline_model(model, X, y):
    single_busline_model = model
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.1, random_state=5)
    single_busline_model.fit(X_train, Y_train)
    performance_scoring = get_performance(model, X_test, Y_test)
    return single_busline_model, performance_scoring

def train_28th_predict_29th(model):
    day29_model = model

    day29_model_data = encoded_model_data.copy()
    day29_model_data = day29_model_data[(day29_model_data.d_mes < 30) & (day29_model_data.mes == 11)]

    X = day29_model_data.filter(['linha', 'd_ano', 'hora', 'hour_sin', 'hour_cos', 'd_mes', 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado'], axis=1)
    y = day29_model_data.validations_per_hour
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
    day29_model.fit(X_train, Y_train)
    performance_scoring = get_performance(day29_model, X_test, Y_test)

    c_line_data = encoded_model_data[(encoded_model_data.d_mes >= 30) & (encoded_model_data.mes == 11)]

    prev = np.zeros(len(c_line_data.hora.unique()))
    for index, hora  in enumerate(c_line_data.hora.unique()):
        h_sin= np.sin(2 * np.pi * hora/23.0)
        h_cos = np.cos(2 * np.pi * hora/23.0)
        prev[index] = day29_model.predict([[busline_filter, 1, h_sin, h_cos]])

    
    actual = c_line_data.validations_per_hour

    fig = plt.figure(figsize=(12, 5))
    plt.xticks(c_line_data.hora.unique())
    plt.plot(c_line_data.hora.unique(), prev, label="predicted")
    plt.xlabel("Hora do dia")
    plt.ylabel("Qtd de Validacoes")
    plt.title("Treino ate 28 de Novembro 2020 -> Previsao 29 de Novembro 2020 ")
    plt.plot(c_line_data.hora.unique(), c_line_data.validations_per_hour, label="actual")
    plt.legend(loc="upper right")
    plt.show()
    display(pd.DataFrame({'predicted': prev, 'actual': actual}))

    return day29_model, performance_scoring

def train_3week(model):
    week3_model = model

    week3_model_data = encoded_model_data.copy()
    
    week3_model_data = week3_model_data[(week3_model_data.d_mes < 20) & (week3_model_data.mes == 11)]

    X = week3_model_data.filter(['linha', 'd_ano', 'hora', 'hour_sin', 'hour_cos', 'd_mes', 'domingo','segunda', 'terca', 'quarta', 'quinta', 'sexta', 'sabado'], axis=1)
    y = week3_model_data.validations_per_hour
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
    week3_model.fit(X_train, Y_train)

    c_line_data = encoded_model_data[(encoded_model_data.d_mes >= 24) & (encoded_model_data.mes == 11)]

    shape = (len(c_line_data.d_semana.unique()), len(c_line_data.hora.unique()))

    prev = np.zeros(shape)
    for idx, dia in enumerate(c_line_data.d_semana.unique()):
        for index, hora  in enumerate(c_line_data.hora.unique()):
            h_sin= np.sin(2 * np.pi * hora/23.0)
            h_cos = np.cos(2 * np.pi * hora/23.0)
            prev[idx, index] = week3_model.predict([[busline_filter, idx, h_sin, h_cos]])
    
    for idx, dia in enumerate(c_line_data.d_semana.unique()):
        actual = c_line_data[(chosen_line_data.d_semana == idx)].validations_per_hour
        if len(actual) != len(prev[idx]):
            continue

        fig = plt.figure(figsize=(12, 5))
        plt.xticks(c_line_data[(chosen_line_data.d_semana == idx)].hora.unique())
        plt.plot(c_line_data[(chosen_line_data.d_semana == idx)].hora.unique(), prev[idx], label="predicted")
        plt.plot(c_line_data[(chosen_line_data.d_semana == idx)].hora.unique(), c_line_data[(chosen_line_data.d_semana == idx)].validations_per_hour, label="actual")
        plt.xlabel("Hora do dia")
        plt.ylabel("Qtd de Validacoes")
        plt.title('Dia da semana: ', day_of_week_translator[idx])
        plt.legend(loc="upper right")
        plt.show()
        print(pd.DataFrame({'predicted': prev[idx], 'actual': actual}))

    performance_scoring = get_performance(week3_model, X_test, Y_test)

    return week3_model, performance_scoring

In [4]:
print("###  1 Modelo Por Linha (mes completo de treino)")

model_per_line_lr, model_per_line_lr_performance = single_busline_model(LinearRegressionModel, X, y)
model_per_line_rf, model_per_line_rf_performance = single_busline_model(RandomForestModel, X, y)

print('Regressao Linear: \n', model_per_line_lr_performance)
print("\n----------------------------------------------------\n")
print('Random Forest: \n', model_per_line_rf_performance)

print("\nUtilizando sample aleatorio de dado para teste de previsao: ")

sample = X.sample(n=1)
predict_res = model_per_line_lr.predict(sample)
display(model_data.loc[sample.index[0]:sample.index[0]])
print("Regressao Linear -> resultado do predict de test: ", predict_res)

predict_res2 = model_per_line_rf.predict(sample)
print("Random Forest -> resultado do predict de test: ", predict_res2)

###  1 Modelo Por Linha (mes completo de treino)
Regressao Linear: 
   Metrica         Score
0     MSE  48832.575171
1    RMSE    220.980938
2      R2      0.369945
3     MAE    165.548978
4    MAPE      1.967338

----------------------------------------------------

Random Forest: 
   Metrica        Score
0     MSE  2303.903407
1    RMSE    47.998994
2      R2     0.970274
3     MAE    29.804838
4    MAPE     0.137326

Utilizando sample aleatorio de dado para teste de previsao: 


Unnamed: 0,linha,data_hora,validations_per_hour,d_semana,hour_sin,hour_cos,hora,d_mes,d_ano,mes,semana_do_mes
542606,41,2020-06-25 14:00:00,360,3,-0.631088,-0.775711,14,25,177,6,4


Regressao Linear -> resultado do predict de test:  [473.15850069]
Random Forest -> resultado do predict de test:  [363.75]


In [5]:
day28_lr_model, day28_lr_performance = train_28th_predict_29th(LinearRegressionModel)
print("Regressao Linear: \n", day28_lr_performance)

day28_rf_model, day28_rf_performance =  train_28th_predict_29th(RandomForestModel)
print("Random Forest: \n", day28_rf_performance)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 13 is different from 4)

In [None]:
print("###  Treina 3 semanas -> predict semana 4")

week3_lr_model, week3_lr_performance = train_3week(LinearRegressionModel)
print("Performance: ", week3_lr_performance)
week3_rf_model, week3_rf_performance = train_3week(RandomForestModel)
print("Performance: ", week3_rf_performance)