# Using the Median and Mean of $n$ buses in a period of 30 minutes

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import heapq 
import pickle
%matplotlib inline
    
import plotnine as p9
import matplotlib.pyplot as plt
import dateutil
import datetime 
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR

from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge


from sklearn import tree
import math

import warnings
warnings.filterwarnings('ignore')
    

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


import sys
sys.path.append("../")

with open('../data/linhas.pkl', 'rb') as inputfile:  
    linhas = pickle.load(inputfile)

with open('../data/stops_distances_fixed10.pkl', 'rb') as input_file:
    distancias_pontos = pickle.load(input_file)
    
with open('../data/linesedges.pkl', 'rb') as input_file:
    linesedge = pickle.load(input_file)
    
with open('../data/edgeslines.pkl', 'rb') as input_file:
    edgeslines = pickle.load(input_file)


In [2]:
# for line in range(0,len(linhas)):
for line in range(0,1):    
    linktt = pd.DataFrame(columns=['_id', 'aproxlinkstart', 'hour', 'link', 'ltt', 
                               'month', 'travel_id', 'trip_id', 'weekday'])
    
    for iterated_link in range(0, len(linesedge[linhas[line]])):
    
#     for iterated_link in (range(0, 5)):
        with open('../../../../../GIT/smartbus/data/linktt/link-' + str(linesedge[linhas[line]][iterated_link]) + '.pkl', 'rb') as inputfile:  
            linktt_temp = pickle.load(inputfile)
            linktt = pd.concat([linktt_temp, linktt],sort=True)  
            
    start_date = datetime.date(2017, 9, 1)       
    end_date = datetime.date(2017, 9, 28)  

    linktt['aproxlinkstart'] = pd.to_datetime(linktt['aproxlinkstart'], format = '%Y-%m-%d %H:%M:%S')
    linktt['day'] = pd.to_datetime(linktt['aproxlinkstart'], format = '%Y-%m-%d')

    linktt = linktt.loc[(linktt['weekday'] != 1) | (linktt['weekday'] != 7)]
    
#     for iterated_link in range(0, len(linesedge[linhas[line]])):
        

In [6]:
link = 5

len(linesedge[linhas[line]])


54

In [20]:
link = 3

# for link in range(2, len(linesedge[linhas[line]])-3):

linktt_k_neighbor = linktt.loc[((linktt['link'] == str(linesedge[linhas[line]][link-2]))   |
                      (linktt['link'] == str(linesedge[linhas[line]][link - 1])) |
                      (linktt['link'] == str(linesedge[linhas[line]][link]))   |
                      (linktt['link'] == str(linesedge[linhas[line]][link+1])) |
                      (linktt['link'] == str(linesedge[linhas[line]][link+2])) 
                      )]

q1, q3 = np.percentile(linktt_k_neighbor['ltt'],[25,75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr) 
# lower_bound = 10
upper_bound = q3 + (1.5 * iqr)  

frequency = '30min'            
rolling_win = 1

linktt_k_neighbor.index = linktt_k_neighbor['aproxlinkstart']

linktt_k_neighbor = linktt_k_neighbor.loc[(linktt_k_neighbor['ltt'] >= lower_bound) & 
                                          (linktt_k_neighbor['ltt'] <= upper_bound)] 

linktt_k_neighbor['day'] = pd.to_datetime(np.array(linktt_k_neighbor['aproxlinkstart'])).day

df_temp_grouped  = linktt_k_neighbor.groupby([pd.Grouper(freq=str(frequency)), 'link'], as_index=True ).mean()['ltt'].unstack()\
                .rolling(rolling_win).mean().transform(lambda x: x.fillna(method='ffill')).dropna().reset_index()

df_temp_grouped['aproxlinkstart'] = pd.to_datetime(df_temp_grouped['aproxlinkstart'], format = '%Y-%m-%d %H:%M:%S')

df_temp_grouped['aproxlinkstart'].astype('datetime64[M]').astype(int) % 12 + 1

df_temp_grouped['month'] = pd.to_datetime(np.array(df_temp_grouped['aproxlinkstart'])).month
df_temp_grouped['day'] = pd.to_datetime(np.array(df_temp_grouped['aproxlinkstart'])).day
df_temp_grouped['hour'] = pd.to_datetime(np.array(df_temp_grouped['aproxlinkstart'])).hour
df_temp_grouped['minute'] = pd.to_datetime(np.array(df_temp_grouped['aproxlinkstart'])).minute
df_temp_grouped['weekday'] = df_temp_grouped['aproxlinkstart'].dt.strftime('%w')


ML_data_train = df_temp_grouped.loc[(df_temp_grouped['aproxlinkstart'] < end_date)]
ML_data_test = df_temp_grouped.loc[(df_temp_grouped['aproxlinkstart'] >= end_date)]

train_x = ML_data_train.drop(['aproxlinkstart', str(linesedge[linhas[line]][link])], axis=1)
train_y = ML_data_train[str(linesedge[linhas[line]][link])]

test_x = ML_data_test.drop(['aproxlinkstart', str(linesedge[linhas[line]][link])], axis=1)
test_y = ML_data_test[str(linesedge[linhas[line]][link])]

model = LinearRegression()
model_tree = tree.DecisionTreeRegressor()
model_lasso = Lasso(alpha=0.1)  


model.fit(train_x, train_y)    

model_tree.fit(train_x, train_y)    

model_lasso.fit(train_x, train_y)    


MAPE_day = []

for select_day in linktt_k_neighbor.loc[linktt_k_neighbor['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%d').unique():


    df_temp = linktt_k_neighbor.loc[
                                    (linktt_k_neighbor['day'] == int(select_day)) &
                                    (linktt_k_neighbor['aproxlinkstart'] >= end_date) & 
                                    (linktt_k_neighbor['link'] == str(linesedge[linhas[line]][link]))
                                   ]

    for hour_day in range(480, 1260, 30):   
        int_part, float_part = divmod((hour_day/60), 1)

        if float_part != 0:
            hour_begin = str(int(int_part)) + ':' + '30'
            hour_end = str(int(int_part + 1 )) + ':' + '00'
            hour_test = str(int(int_part + 1 )) + ':' + '30'
            hour_future = str(int(int_part + 2 )) + ':' + '00'

        else:
            hour_begin = str(int(int_part)) + ':' + '00'
            hour_end = str(int(int_part)) + ':' + '30'
            hour_test = str(int(int_part + 1 )) + ':' + '00'
            hour_future = str(int(int_part + 1 )) + ':' + '30'


        if len(test_x.loc[(test_x['day'] == int(select_day)) & 
                                       (test_x['hour'] == int(int_part)) & 
                                       (test_x['minute'] == int(float_part*60))]) > 0:

            median_test_lm = model.predict(test_x.loc[(test_x['day'] == int(select_day)) & 
                                                   (test_x['hour'] == int(int_part)) & 
                                                       (test_x['minute'] == int(float_part*60))])

            median_test_lasso = model_lasso.predict(test_x.loc[(test_x['day'] == int(select_day)) & 
                                                   (test_x['hour'] == int(int_part)) & 
                                                       (test_x['minute'] == int(float_part*60))])    

            median_test_tree = model_tree.predict(test_x.loc[(test_x['day'] == int(select_day)) & 
                                                   (test_x['hour'] == int(int_part)) & 
                                                       (test_x['minute'] == int(float_part*60))])    





            df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
            df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
            df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]

            ### Temporal Link travel time used in the median (us) 
            temp_travels_time = df_temp_time['ltt']
            time_tests        = df_temp_test['ltt']
            time_future       = df_temp_future['ltt']

            median_test = np.mean(temp_travels_time)                

            if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual'])

                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test_lm]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual_lm'])

                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test_lasso]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual_lasso'])

                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test_tree]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual_tree'])



                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After'])

                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test_lm]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After_lm'])

                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test_lasso]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After_lasso'])

                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test_tree]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After_tree'])

                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future'])

                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test_lm]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future_lm'])

                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test_lasso]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future_lasso'])

                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test_tree]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future_tree'])


data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                  'travels_time', 'errors'])

# data = data.loc[data['MAPE'] != 0] 
       
for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))
test_x

Actual : 23.133641499028954
Actual_lm : 49.000325000223775
Actual_lasso : 48.886303233680884
Actual_tree : 49.506905022086826
After : 109.19871145157391
After_lm : 64.71112334072787
After_lasso : 64.49574595483963
After_tree : 54.78719941322732
Future : 110.91144915788956
Future_lm : 60.45944922974249
Future_lasso : 60.26513744867034
Future_tree : 77.61535630238826


link,"(3877, 3878)","(3878, 3879)","(3880, 3881)","(3881, 3882)",month,day,hour,minute,weekday
4716,72.785186,38.348189,58.012639,51.759326,9,28,6,0,4
4717,75.804635,61.453934,41.760339,64.956382,9,28,6,30,4
4718,41.327141,43.781308,33.350207,58.118819,9,28,7,0,4
4719,59.863459,50.49241,42.922248,34.708962,9,28,7,30,4
4720,75.404054,85.699131,91.617401,35.475569,9,28,8,0,4
4721,56.655954,26.786062,53.978631,44.715889,9,28,8,30,4
4722,79.985665,71.076548,48.153974,52.060069,9,28,9,0,4
4723,89.784409,44.924976,42.30679,55.956169,9,28,9,30,4
4724,76.18556,30.502366,30.344092,38.1516,9,28,10,0,4
4725,55.817382,30.502366,66.568062,56.321714,9,28,10,30,4


In [91]:

MAPE_day = []

for select_day in linktt.loc[linktt['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%Y-%m-%d').unique():

    for line in range(0,5):
    #     for link in range(1, len(linesedge[linhas[line]])-2):
        for link in range(2, 7):
            df_temp = linktt.loc[(linktt['aproxlinkstart'] >= select_day) & 
                                 (linktt['link'] == str(linesedge[linhas[line]][link]))
                                ]

            travels_day = np.unique(np.array(df_temp['travel_id']))

            for hour_day in range(480, 1260, 30):   
                int_part, float_part = divmod((hour_day/60), 1)

                if float_part != 0:
                    hour_begin = str(int(int_part)) + ':' + '30'
                    hour_end = str(int(int_part + 1 )) + ':' + '00'
                    hour_test = str(int(int_part + 1 )) + ':' + '30'
                    hour_future = str(int(int_part + 2 )) + ':' + '00'
                else:
                    hour_begin = str(int(int_part)) + ':' + '00'
                    hour_end = str(int(int_part)) + ':' + '30'
                    hour_test = str(int(int_part + 1 )) + ':' + '00'
                    hour_future = str(int(int_part + 1 )) + ':' + '30'

                df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
                df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
                df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]

                ### Temporal Link travel time used in the median (us) 
                temp_travels_time = df_temp_time['ltt']
                time_tests        = df_temp_test['ltt']
                time_future       = df_temp_future['ltt']


                median_test = np.median(temp_travels_time)                

                if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                    MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                     median_test, 
                                     mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                     str(np.array(temp_travels_time)), 
                                     'Actual'])

                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                     median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                     str(np.array(time_tests)), 
                                     'After'])

                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                         mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                         str(np.array(time_future)), 'Future'])

                
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                      'travels_time', 'errors'])
# data = data.loc[data['MAPE'] != 0] 

pd.DataFrame(data).to_html('html/data-30M_median_'  + select_day + '.html')  
        
# data = data.loc[data['MAPE'] != 0]

# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]

fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = "") +
        p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_30M_median.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_30M_median.pdf')  


for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))


Actual : 18.656629742145
After : 31.520942568320386
Future : 23130.55083753382


# MAPE with some conditions about the states statistics


In [1]:

MAPE_day = []
for line in range(0,1):    
    for select_day in linktt.loc[linktt['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%Y-%m-%d').unique():

        for link in range(1, len(linesedge[linhas[line]])-2):
    #     for link in range(2, 7):
            df_temp = linktt.loc[(linktt['aproxlinkstart'] < select_day) & 
                                 (linktt['link'] == str(linesedge[linhas[line]][link]))
                                ]

            df_temp_historic = linktt.loc[(linktt['aproxlinkstart'] < select_day) & 
                                 (linktt['link'] == str(linesedge[linhas[line]][link]))
                                         ]

            travels_day = np.unique(np.array(df_temp['travel_id']))

            for hour_day in range(480, 1260, 30):   
                int_part, float_part = divmod((hour_day/60), 1)

                if float_part != 0:
                    hour_begin = str(int(int_part)) + ':' + '30'
                    hour_end = str(int(int_part + 1 )) + ':' + '00'
                    hour_test = str(int(int_part + 1 )) + ':' + '30'
                    hour_future = str(int(int_part + 2 )) + ':' + '00'
                else:
                    hour_begin = str(int(int_part)) + ':' + '00'
                    hour_end = str(int(int_part)) + ':' + '30'
                    hour_test = str(int(int_part + 1 )) + ':' + '00'
                    hour_future = str(int(int_part + 1 )) + ':' + '30'

                temp_travels_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]['ltt']
                temp_travels_historic = df_temp_historic.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]['ltt']

                time_tests = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]['ltt']
                time_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]['ltt']

                median_test = np.median(temp_travels_time)    

                median_historic = np.median(temp_travels_historic)    
                mean_historic = np.mean(temp_travels_historic)    
                std_historic = np.std(temp_travels_historic)   

                q1, q3 = np.percentile(temp_travels_historic,[25,75])
                iqr = q3 - q1
                lower_bound = q1 - (1.5 * iqr) 
                upper_bound = q3 + (1.5 * iqr)    

                if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):


                    MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                     median_test, 
                                     mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                     str(np.array(temp_travels_time)), 
                                     'Actual'])


                    temp_travels_time_filter = temp_travels_time[(temp_travels_time > lower_bound) & (temp_travels_time < upper_bound)]

                    if len(temp_travels_time_filter) > 0:
                        MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time_filter), 
                                     median_test, 
                                     mean_absolute_percentage_error([median_historic]*len(temp_travels_time_filter), temp_travels_time_filter), 
                                     str(np.array(temp_travels_time_filter)), 
                                     'Actual_historic'])               


                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                     median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                     str(np.array(time_tests)), 
                                     'After'])

                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                         mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                         str(np.array(time_future)), 'Future'])

                
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                      'travels_time', 'errors'])
# data = data.loc[data['MAPE'] != 0] 

pd.DataFrame(data).to_html('html/data-30M_median_'  + select_day + '.html')  
        
# data = data.loc[data['MAPE'] != 0]

# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]

fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = "") +
        p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_30M_median.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_30M_median.pdf')  

for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))


NameError: name 'linktt' is not defined

In [80]:
temp_travels_historic

Series([], Name: ltt, dtype: float64)

In [99]:

MAPE_day = []
    
for select_day in linktt.loc[linktt['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%Y-%m-%d').unique():
    
    for link in range(1, len(linesedge[linhas[line]])-2):
#     for link in range(2, 7):
        df_temp = linktt.loc[(linktt['aproxlinkstart'] >= select_day) & 
                             (linktt['link'] == str(linesedge[linhas[line]][link]))
                            ]
    
        travels_day = np.unique(np.array(df_temp['travel_id']))

        for hour_day in range(480, 1260, 30):   
            int_part, float_part = divmod((hour_day/60), 1)

            if float_part != 0:
                hour_begin = str(int(int_part)) + ':' + '30'
                hour_end = str(int(int_part + 1 )) + ':' + '00'
                hour_test = str(int(int_part + 1 )) + ':' + '30'
                hour_future = str(int(int_part + 2 )) + ':' + '00'
            else:
                hour_begin = str(int(int_part)) + ':' + '00'
                hour_end = str(int(int_part)) + ':' + '30'
                hour_test = str(int(int_part + 1 )) + ':' + '00'
                hour_future = str(int(int_part + 1 )) + ':' + '30'

            df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
            df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
            df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]
            
            ### Temporal Link travel time used in the median (us) 
            temp_travels_time = df_temp_time['ltt']
            time_tests        = df_temp_test['ltt']
            time_future       = df_temp_future['ltt']
            

            median_test = np.mean(temp_travels_time)                
           
            if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual'])
                    
                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After'])
                    
                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future'])

                
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                      'travels_time', 'errors'])
# data = data.loc[data['MAPE'] != 0] 

pd.DataFrame(data).to_html('html/data-30M_mean_'  + select_day + '.html')  
        
# data = data.loc[data['MAPE'] != 0]

# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]

fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = "") +
#         p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_30M_mean.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_30M_mean.pdf')  


for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))


Actual : 31.874030015372625
After : 291.6129596409533
Future : 189.0730073025666


# Using the Median and Mean of $n$ buses in a period of 60 minutes

In [101]:

MAPE_day = []
    
for select_day in linktt.loc[linktt['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%Y-%m-%d').unique():
    
    for link in range(1, len(linesedge[linhas[line]])-2):
#     for link in range(2, 7):
        df_temp = linktt.loc[(linktt['aproxlinkstart'] >= select_day) & 
                             (linktt['link'] == str(linesedge[linhas[line]][link]))
                            ]
    
        travels_day = np.unique(np.array(df_temp['travel_id']))

        for hour_day in range(640, 1200, 60):   
            int_part, float_part = divmod((hour_day/60), 1)

            hour_begin = str(int(int_part)) + ':' + '00'
            hour_end = str(int(int_part + 1 )) + ':' + '00'
            hour_test = str(int(int_part +  2)) + ':' + '00'
            hour_future = str(int(int_part +  3)) + ':' + '00'
                
            df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
            df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
            df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]
            
            ### Temporal Link travel time used in the median (us) 
            temp_travels_time = df_temp_time['ltt']
            time_tests        = df_temp_test['ltt']
            time_future       = df_temp_future['ltt']
            

            median_test = np.median(temp_travels_time)                
           
            if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual'])
                    
                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After'])
                    
                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future'])

                
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                      'travels_time', 'errors'])
# data = data.loc[data['MAPE'] != 0] 

pd.DataFrame(data).to_html('html/data-60M_median_'  + select_day + '.html')  
        
# data = data.loc[data['MAPE'] != 0]

# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]

fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = True) +
#         p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_60M_median.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_60M_median.pdf')  


for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))


Actual : 74.17979711597515
After : 240.39314054730067
Future : 228.67733402364792


In [102]:

MAPE_day = []
    
for select_day in linktt.loc[linktt['aproxlinkstart'] >= end_date].aproxlinkstart.dt.strftime('%Y-%m-%d').unique():
    
    for link in range(1, len(linesedge[linhas[line]])-2):
#     for link in range(2, 7):
        df_temp = linktt.loc[(linktt['aproxlinkstart'] >= select_day) & 
                             (linktt['link'] == str(linesedge[linhas[line]][link]))
                            ]
    
        travels_day = np.unique(np.array(df_temp['travel_id']))

        for hour_day in range(640, 1200, 60):   
            int_part, float_part = divmod((hour_day/60), 1)

            hour_begin = str(int(int_part)) + ':' + '00'
            hour_end = str(int(int_part + 1 )) + ':' + '00'
            hour_test = str(int(int_part +  2)) + ':' + '00'
            hour_future = str(int(int_part +  3)) + ':' + '00'
                
            df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
            df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
            df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]
            
            ### Temporal Link travel time used in the median (us) 
            temp_travels_time = df_temp_time['ltt']
            time_tests        = df_temp_test['ltt']
            time_future       = df_temp_future['ltt']
            

            median_test = np.mean(temp_travels_time)                
           
            if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), 
                                 median_test, 
                                 mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                                 str(np.array(temp_travels_time)), 
                                 'Actual'])
                    
                MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), 
                                 median_test,
                                 mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                 str(np.array(time_tests)), 
                                 'After'])
                    
                MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future'])

                
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 
                                      'travels_time', 'errors'])
# data = data.loc[data['MAPE'] != 0] 

pd.DataFrame(data).to_html('html/data-60M_mean_'  + select_day + '.html')  
        
# data = data.loc[data['MAPE'] != 0]

# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]

fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = True) +
#         p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_60M_mean.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_60M_mean.pdf')  


for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))


Actual : 45.71495103280889
After : 203.09006365767027
Future : 211.89660631933498
