# Using the Median of $n$ buses in a period of 30 minutes

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import heapq 
import pickle
%matplotlib inline
    
import plotnine as p9
import matplotlib.pyplot as plt
import dateutil
import datetime 
from time import time
import re
from scipy import interpolate
import timeit
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR
import seaborn as sns

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge
from sklearn import tree
import math

import sys
sys.path.append("../")


from historical.readData.old.estimateData3 import read

from historical.readData.old.estimateData3 import read
from historical.readData.old.estimateData3 import search_travels
from historical.readData.old.estimateData3 import estimate
from historical.readData.old.estimateData3 import stops_distance

from historical.readData.old.travels3 import haversine2

def calcula_dist_shape(selec_linhas):
    distance_all_shapes = {}
    # print 'Calculando distancias dos shapes'
    '''calcula todas as distancias dos shapes'''
    
    pth_files_GTFS = "../data/gtfs/"
        
    trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
    shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
    
    for l, trip_id in enumerate(selec_linhas):

        trip = trips[trips.trip_id == trip_id]
        trip_shape = shapes[shapes['shape_id'].isin(trip['shape_id'])]
        shapelat = trip_shape.shape_pt_lat.tolist()
        shapelon = trip_shape.shape_pt_lon.tolist()

        # distancias dos shapes
        lon1 = shapelon[0]
        lat1 = shapelat[0]
        totalcal = [0.]
        dist = [0.]
        for lat2, lon2 in zip(shapelat[1:], shapelon[1:]):
            d = haversine2(lat1, lon1, lat2, lon2)
            d = d * 1000
            dist.append(d)
            totalcal.append(totalcal[-1] + d)
            lat1 = lat2
            lon1 = lon2
        distance_all_shapes[trip_id] = [shapelat, shapelon, totalcal]
    return distance_all_shapes

def stops_distance(linha):

    distances = calcula_dist_shape([linha])
    totalcal = distances[linha][2]
    
    pth_files_GTFS = "../data/gtfs/"
    

    trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
    shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
    stops = pd.read_csv(pth_files_GTFS + 'stops.txt', sep=',')
    stopid = pd.read_csv(pth_files_GTFS + 'stop_times.txt', sep=',')   

    ida = trips[trips.trip_id == linha]
    shapeida = shapes[shapes['shape_id'].isin(ida['shape_id'])]
    idalat = shapeida.shape_pt_lat.tolist()
    idalon = shapeida.shape_pt_lon.tolist()
    total = shapeida.shape_dist_traveled.tolist()

    temp1 = stopid[stopid.trip_id == linha]
    stopsida = stops[stops['stop_id'].isin(temp1['stop_id'])]
    stopsida = stopsida.set_index('stop_id')
    stopsida = stopsida.reindex(index=temp1['stop_id'])

    stopslat = stopsida.stop_lat.tolist()
    stopslon = stopsida.stop_lon.tolist()

    dpontos = [None] * len(stopslat)
    index = 0
    lat = idalat
    lon = idalon
    total = 0
    p = ['depois'] * len(stopslat)
    for latb, lonb, i in zip(stopslat, stopslon, range(len(stopslat))):
        lat = lat[index:]
        lon = lon[index:]
        nn = haversine2(latb, lonb, np.array(lat), np.array(lon)) * 1000
        index = nn.argmin()
        total = total + index
        if index == 0:
            dpontos[i] = nn[index]
#            descontardist = nn[index]
        else:
            if totalcal[total] >= totalcal[total - 1] + nn[index - 1]:
                p[i] = 'antes'
            dpontos[i] = totalcal[total - 1] + nn[index - 1]

    mid_points = []
#    mid_points.append(200)
    p1 = dpontos[0]
    for p2 in dpontos[1:]:
        mid_points.append(p1 + (p2-p1)/2)
        p1 = p2
    dpontos = [i/1000. for i in dpontos]
    mid_points = [i/1000. for i in mid_points]

    return dpontos, mid_points

def read(filedata,filerep):
    df0 = pd.read_pickle(filedata, compression=None)
    with open(filerep, 'rb') as handle:
        reps = pickle.load(handle, encoding='latin1')
    return df0, reps

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)
        
def daterangeWD(start_date, end_date):
  return rrule(DAILY, dtstart=start_date, until=end_date, byweekday=(MO,TU,WE,TH,FR))

def hr_func(ts):
    return ts.hour

def minute_func(ts):
    return ts.minute

def second_func(ts):
    return ts.second


def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

import warnings
warnings.filterwarnings('ignore')

%matplotlib agg 

pth_files_GTFS = "../data/gtfs/"

trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
stops = pd.read_csv(pth_files_GTFS + 'stops.txt', sep=',')
stopid = pd.read_csv(pth_files_GTFS + 'stop_times.txt', sep=',')   

periods = ['morning', 'm_peak', 'i_peak', 'a_peak', 'night']


In [2]:
selected_line = ['8700-10-1']

selected_line = [
                '8700-10-1',                
                '7545-10-1',
                '7545-10-0',
                '6450-10-1',
                '6450-10-0',
                '3301-10-1',
#                 '3301-10-0',
                '2290-10-1',
                '2290-10-0',
#                 '574J-10-1',
#                 '574J-10-0',
#                 '477P-10-1',
                '477P-10-0',
#                 '351F-10-1',
#                 '351F-10-0'
                ]
selected_line = ['6450-10-0']

MAPE_day = []

for line in selected_line:
#     line = selected_line[0]
    p, mp = stops_distance(line)
    pth_files_lines = "../historical/readData/"

    ### Operação demorada
    df, reps = read(pth_files_lines + "trips_" + line + ".dsk", pth_files_lines + "interps_" + line + ".rep")

In [5]:
df

Unnamed: 0,car,numtravel,day,weekday,holiday,start,travel_time,distance,time,period
0,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.05535,0.000000,i_peak
1,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.11812,1.650000,i_peak
2,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.17883,3.316667,i_peak
3,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.50429,4.066667,i_peak
4,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.58227,4.150000,i_peak
5,72365\r,0,2017-01-01,6,1,11:12:19,52.35,0.96159,4.666667,i_peak
6,72365\r,0,2017-01-01,6,1,11:12:19,52.35,1.01292,4.733333,i_peak
7,72365\r,0,2017-01-01,6,1,11:12:19,52.35,1.48960,5.400000,i_peak
8,72365\r,0,2017-01-01,6,1,11:12:19,52.35,1.62704,6.066667,i_peak
9,72365\r,0,2017-01-01,6,1,11:12:19,52.35,2.03243,7.033333,i_peak


In [4]:
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 'travels_time', 'errors'])
pd.DataFrame(data).to_html('html/data-30M_median_' + select_day + '.html')  
        
data = data.loc[data['MAPE'] != 0] 
# data = data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]
fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = "") +
        p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (18, 24))  +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_30M_median.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_30M_median.pdf')

for i in data['errors'].unique():
    print(i + ': ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))

Actual: 8.745420275313107
After: 13.983902250267427
median_LR_after: 12.069013044826184
median_Tree_after: 17.433925172530618
median_Lasso_after: 12.54823803523942
Future: 14.555437910966928
median_LR_future: 10.217191442885868
median_Tree_future: 15.34808290083754
median_Lasso_future: 12.294728662441758


# Using the Median of $n$ buses in a period of 60 minutes

In [11]:
selected_line = ['8700-10-1']

'8700-10-0',
selected_line = ['8700-10-1',                
                '7545-10-1',
                '7545-10-0',
                '6450-10-1',
                '6450-10-0',
                '3301-10-1',
#                 '3301-10-0',
                '2290-10-1',
                '2290-10-0',
#                 '574J-10-1',
#                 '574J-10-0',
#                 '477P-10-1',
                '477P-10-0',
#                 '351F-10-1',
#                 '351F-10-0'
                ]

selected_line = ['6450-10-0']

pth_files_GTFS = "../data/gtfs/"

trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
stops = pd.read_csv(pth_files_GTFS + 'stops.txt', sep=',')
stopid = pd.read_csv(pth_files_GTFS + 'stop_times.txt', sep=',')   

periods = ['morning', 'm_peak', 'i_peak', 'a_peak', 'night']

MAPE_day = []
for line in selected_line:
#     line = selected_line[0]
    p, mp = stops_distance(line)
    pth_files_lines = "../historical/readData/"

    ### Operação demorada
    df, reps = read(pth_files_lines + "trips_" + line + ".dsk", pth_files_lines + "interps_" + line + ".rep")

    df['day'] = pd.to_datetime(df['day'], format = '%Y-%m-%d')
    df['start'] = pd.to_datetime(df['start'], format = '%H:%M:%S')
    
    start_date = datetime.date(2017, 9, 1)       
    end_date = datetime.date(2017, 9, 28)        

    df = df.loc[((df['day'] >= start_date) & (df['holiday'] != 1)) & ((df['weekday'] != 6) & (df['weekday'] != 5) & (df['weekday'] != 4) & (df['weekday'] != 0))]

#     end_date = datetime.date(2017, 9, 28)        
#     df = df.loc[((df['day'] >= end_date) & (df['holiday'] != 1)) & ((df['weekday'] != 6) & (df['weekday'] != 5) & (df['weekday'] != 4) & (df['weekday'] != 0))]

    df['exact_time'] = np.array(df['day'], dtype='datetime64[ns]') + \
                        pd.to_timedelta(pd.to_timedelta(pd.DatetimeIndex(df['start']).hour*60 + \
                        pd.DatetimeIndex(df['start']).minute + \
                        pd.DatetimeIndex(df['start']).second/60 + \
                        df['time'], unit='m'))

    df.index = df['exact_time']
    df['day_hour'] = df['exact_time'].apply(hr_func)

    df['link'] = 0
    for i in range(0, len(mp)):
        if (i == 0):
              df.loc[df.loc[(df['distance'] > 0) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    
        if (i == len(mp) -1):
            df.loc[df.loc[(df['distance'] > mp[i]) & (df['distance'] < max(p)),]['link'].index,'link'] = i    
        if (i != 0) & (i != len(mp)-1):
            df.loc[df.loc[(df['distance'] >= mp[i]) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    

    df = df.drop_duplicates(subset=['day', 'numtravel', 'link', 'day_hour'])   

    link_df = []
    travels = sorted(list(set(df.numtravel.unique())))
    for tr in travels:
        tck = reps[tr][0]
        tck_mods = [(tck[0],tck[1]-m,tck[2]) for m in mp]       
        tempo = [a[0] if a.size>0 else np.nan for a in [interpolate.sproot(tck_mod) for tck_mod in tck_mods]]
        row = [tuple([tempo[i+1]-pos for i, pos in enumerate(tempo[:-1])])]    
        link_df.append(row)      

    # data_training['time_link'] = 0 
    link_df_np = np.array(link_df)
    link_df_np.shape = (np.shape(link_df_np)[0] * np.shape(link_df_np)[2])

    # df['time_link'] = 0

    right = pd.DataFrame({'numtravel': np.repeat(df.numtravel.unique(), len(mp)-1), \
                  'link': np.tile(range(0, len(mp)-1), len(df.numtravel.unique())), 
                  'time_link': link_df_np}) 

    df_merged = pd.merge(right, df, how='right', on=['numtravel', 'link'])    
    
    df_merged.index = df_merged['exact_time']

    ML_data = df_merged.loc[df_merged['link'] < (np.max(df_merged['link']) - 1)]


    ML_data = ML_data.groupby([pd.Grouper(freq='30min'), 'link'], as_index=True ).median()['time_link'].unstack()\
        .rolling(1).mean().transform(lambda x: x.fillna(method='ffill')).dropna()

    ML_data = ML_data.iloc[ML_data.index.indexer_between_time('08:00', '23:00')].reset_index()

    ML_data_train = ML_data.loc[(ML_data['exact_time'] < end_date)]
    ML_data_test = ML_data.loc[(ML_data['exact_time'] >= end_date)]

    train_x = ML_data_train.iloc[:(np.shape(ML_data_train)[0] -1),:]
    train_y = ML_data_train.iloc[1:,:]


    test_x = ML_data_test.iloc[:(np.shape(ML_data_test)[0] -1),:]
    test_y = ML_data_test.iloc[1:,:]

    model = LinearRegression()
    model_tree = tree.DecisionTreeRegressor()
    model_lasso = Lasso(alpha=0.1)  
    
    model.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_after = model.predict(test_x.drop(['exact_time'], axis=1))    
    
    model_tree.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_after_tree = model_tree.predict(test_x.drop(['exact_time'], axis=1))    
    
    
    model_lasso.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_after_lasso = model_lasso.predict(test_x.drop(['exact_time'], axis=1))    
    
    pred_y_after = pd.DataFrame(pred_y_after)
    pred_y_after.index = test_x['exact_time']
    
    pred_y_after_tree = pd.DataFrame(pred_y_after_tree)
    pred_y_after_tree.index = test_x['exact_time']    
    
    pred_y_after_lasso = pd.DataFrame(pred_y_after_lasso)
    pred_y_after_lasso.index = test_x['exact_time']    
    
    ### 2 times after data  
    
    train_x = ML_data_train.iloc[:(np.shape(ML_data_train)[0] - 2),:]
    train_y = ML_data_train.iloc[2:,:]


    test_x = ML_data_test.iloc[:(np.shape(ML_data_test)[0] - 2),:]
    test_y = ML_data_test.iloc[2:,:]
    
    
    model.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_future = model.predict(test_x.drop(['exact_time'], axis=1))
    
    model_tree.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_future_tree = model_tree.predict(test_x.drop(['exact_time'], axis=1))   
    
    model_lasso.fit(train_x.drop(['exact_time'], axis=1), train_y.drop(['exact_time'], axis=1))    
    pred_y_future_lasso = model_lasso.predict(test_x.drop(['exact_time'], axis=1))
     
    pred_y_future = pd.DataFrame(pred_y_future)
    pred_y_future.index = test_x['exact_time']   
    
    pred_y_future_tree = pd.DataFrame(pred_y_future_tree)
    pred_y_future_tree.index = test_x['exact_time']

    pred_y_future_lasso = pd.DataFrame(pred_y_future_lasso)
    pred_y_future_lasso.index = test_x['exact_time']
  
    ### per travel verification

    travel_times = pd.DataFrame(np.reshape(link_df, (np.shape(link_df)[0], np.shape(link_df)[2])))
    travel_times.index = travels

    MAPE_array = []
    for select_day in df.day.dt.strftime('%Y-%m-%d').unique():
        
        for link in range(1, len(mp)-3):
            df_temp = df.loc[(df['day'] == select_day) & (df['link'] == link)]
            # travels_day = np.unique(np.array(df_temp['numtravel']))

            for hour_day in range(640, 1200, 60):   
                int_part, float_part = divmod((hour_day/60), 1)

                hour_begin = str(int(int_part)) + ':' + '00'
                hour_end = str(int(int_part + 1 )) + ':' + '00'
                hour_test = str(int(int_part +  2)) + ':' + '00'
                hour_future = str(int(int_part +  3)) + ':' + '00'
                
                df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
                df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
                df_temp_future = df_temp.iloc[df_temp.index.indexer_between_time(hour_test, hour_future)]               
                
                temp_travels_time = travel_times.loc[np.unique(np.array(df_temp_time['numtravel']))][link]
                time_tests        = travel_times.loc[np.unique(np.array(df_temp_test['numtravel']))][link]
                time_future       = travel_times.loc[np.unique(np.array(df_temp_future['numtravel']))][link]
                
                median_test = np.median(temp_travels_time)
                
                median_LR_after = np.array(pred_y_after.iloc[pred_y_after.index.indexer_between_time(hour_test, hour_test)][link])
                median_LR_future = np.array(pred_y_future.iloc[pred_y_future.index.indexer_between_time(hour_future, hour_future)][link])
                
                median_Tree_after = np.array(pred_y_after_tree.iloc[pred_y_after_tree.index.indexer_between_time(hour_test, hour_test)][link])
                median_Tree_future = np.array(pred_y_future_tree.iloc[pred_y_future_tree.index.indexer_between_time(hour_future, hour_future)][link])
                
                median_Lasso_after = np.array(pred_y_after_lasso.iloc[pred_y_after_lasso.index.indexer_between_time(hour_test, hour_test)][link])
                median_Lasso_future = np.array(pred_y_future_lasso.iloc[pred_y_future_lasso.index.indexer_between_time(hour_future, hour_future)][link])
                
                if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                    MAPE_day.append([line, link, select_day, hour_begin + ' - ' + hour_end, len(temp_travels_time), median_test,
                         mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                         str(np.array(temp_travels_time)), 'Actual'])
         
                
                if (len(time_tests) > 0) & (math.isnan(median_test) == False) & (np.isnan(time_tests).any() == False):
                    MAPE_array.append(mean_absolute_percentage_error([median_test]*len(time_tests), time_tests))
                    
                    
                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(time_tests), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                     str(np.array(time_tests)), 'After'])
                    
                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(hour_test), median_LR_after,
                             mean_absolute_percentage_error([median_LR_after]*len(time_tests), time_tests), 
                             str(np.array(time_tests)), 'median_LR_after'])
                    
                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(hour_test), median_Tree_after,
                             mean_absolute_percentage_error([median_Tree_after]*len(time_tests), time_tests), 
                             str(np.array(time_tests)), 'median_Tree_after'])
                    
                    MAPE_day.append([line, link, select_day, hour_end + ' - ' + hour_test, len(hour_test), median_Lasso_after,
                             mean_absolute_percentage_error([median_Lasso_after]*len(time_tests), time_tests), 
                             str(np.array(time_tests)), 'median_Lasso_after'])
                    
                if (len(time_future) > 0) & (math.isnan(median_test) == False) & (np.isnan(time_future).any() == False):
                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'Future'])
                    
                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_LR_after,
                                     mean_absolute_percentage_error([median_LR_after]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'median_LR_future'])
                    
                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_Tree_future,
                                     mean_absolute_percentage_error([median_Tree_future]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'median_Tree_future'])
                    
                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_Tree_future,
                                     mean_absolute_percentage_error([median_Tree_future]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'median_Tree_future'])
                    
                    MAPE_day.append([line, link, select_day, hour_test + ' - ' + hour_future, len(time_future), median_Lasso_future,
                                     mean_absolute_percentage_error([median_Lasso_future]*len(time_future), time_future), 
                                     str(np.array(time_future)), 'median_Lasso_future'])
    
                    
    
    print('Ready line: ' + line )


Ready line: 6450-10-0


In [13]:
data = pd.DataFrame(MAPE_day,columns=['line', 'link', 'day', 'time', 'N-travel', 'median', 'MAPE', 'travels_time', 'errors'])
pd.DataFrame(data).to_html('html/data-60M_median_'  + select_day + '.html')  
        
data = data.loc[data['MAPE'] != 0] 
# data.loc[(data['errors'] != 'median_LR_after') & (data['errors'] != 'median_Tree_after') &
#         (data['errors'] != 'median_LR_future') & (data['errors'] != 'median_Tree_future')]
fig = (
        p9.ggplot(data, p9.aes(x = 'errors', y ='MAPE', group='errors', colors='errors')) +
        p9.geom_boxplot( outlier_shape = "") +
        p9.scale_y_continuous(limits =  (0, 50)) +        
        p9.stat_boxplot(geom ='errorbar')  +        
        p9.theme_bw() +
        p9.theme(figure_size = (8, 12)) +
        p9.facet_wrap('~line', ncol = 3) +
        p9.labs(title = 'MAPE of the lines in the day ' + select_day ) +
        p9.theme(figure_size = (9, 12)) +
        p9.theme(axis_text_x = p9.element_text(angle=90)) 
    )
p9.ggsave(fig, 'png/Boxplot' + '_' + select_day + '_60M_median.png')  
p9.ggsave(fig, 'pdf/Boxplot' + '_' + select_day + '_60M_median.pdf')  

for i in data['errors'].unique():
    print(i + ' : ' + str(data.loc[data['errors'] == i]['MAPE'].mean()))

Actual : 10.60633630861167
After : 15.413266513636533
median_LR_after : 14.588977528393583
median_Tree_after : 21.578564789031116
median_Lasso_after : 14.467570324018032
Future : 18.319616488348224
median_LR_future : 14.335110099378415
median_Tree_future : 17.79166852908641
median_Lasso_future : 14.83445000947186


# Using the Mean of $n$ buses in a period of 30 minutes

In [348]:
selected_line = ['8700-10-1']

selected_line = ['8700-10-1',                
                '7545-10-1',
                '7545-10-0',
                '6450-10-1',
                '6450-10-0',
                '3301-10-1',
#                 '3301-10-0',
                '2290-10-1',
                '2290-10-0',
#                 '574J-10-1',
#                 '574J-10-0',
#                 '477P-10-1',
                '477P-10-0',
#                 '351F-10-1',
#                 '351F-10-0'
                ]

pth_files_GTFS = "../historical/readData/dados/gtfs/"

trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
stops = pd.read_csv(pth_files_GTFS + 'stops.txt', sep=',')
stopid = pd.read_csv(pth_files_GTFS + 'stop_times.txt', sep=',')   

periods = ['morning', 'm_peak', 'i_peak', 'a_peak', 'night']

for line in selected_line:
#     line = selected_line[0]
    p, mp = stops_distance(line)
    pth_files_lines = "../historical/readData/"

    ### Operação demorada
    df, reps = read(pth_files_lines + "trips_" + line + ".dsk", pth_files_lines + "interps_" + line + ".rep")

    df['day'] = pd.to_datetime(df['day'], format = '%Y-%m-%d')
    df['start'] = pd.to_datetime(df['start'], format = '%H:%M:%S')

    end_date = datetime.date(2017, 9, 1)        

    df = df.loc[((df['day'] >= end_date) & (df['holiday'] != 1)) & ((df['weekday'] != 6) & (df['weekday'] != 5) & (df['weekday'] != 4) & (df['weekday'] != 0))]

    df['exact_time'] = np.array(df['day'], dtype='datetime64[ns]') + \
                        pd.to_timedelta(pd.to_timedelta(pd.DatetimeIndex(df['start']).hour*60 + \
                        pd.DatetimeIndex(df['start']).minute + \
                        pd.DatetimeIndex(df['start']).second/60 + \
                        df['time'], unit='m'))

    df.index = df['exact_time']
    df.iloc[df.index.indexer_between_time('08:00', '20:00')]

    df['day_hour'] = df['exact_time'].apply(hr_func)

    df['link'] = 0
    for i in range(0, len(mp)):
        if (i == 0):
              df.loc[df.loc[(df['distance'] > 0) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    
        if (i == len(mp) -1):
            df.loc[df.loc[(df['distance'] > mp[i]) & (df['distance'] < max(p)),]['link'].index,'link'] = i    
        if (i != 0) & (i != len(mp)-1):
            df.loc[df.loc[(df['distance'] >= mp[i]) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    

    df = df.drop_duplicates(subset=['day', 'numtravel', 'link', 'day_hour'])   

    link_df = []
    travels = sorted(list(set(df.numtravel.unique())))
    for tr in travels:
        tck = reps[tr][0]
        tck_mods = [(tck[0],tck[1]-m,tck[2]) for m in mp]       
        tempo = [a[0] if a.size>0 else np.nan for a in [interpolate.sproot(tck_mod) for tck_mod in tck_mods]]
        row = [tuple([tempo[i+1]-pos for i, pos in enumerate(tempo[:-1])])]    
        link_df.append(row)

    travel_times = pd.DataFrame(np.reshape(link_df, (np.shape(link_df)[0], np.shape(link_df)[2])))
    travel_times.index = travels

    MAPE_array = []

    for select_day in df.day.dt.strftime('%Y-%m-%d').unique():
        MAPE_day = []
        for link in range(1, len(mp)-2):
            df_temp = df.loc[(df['day'] == select_day) & (df['link'] == link)]
            # travels_day = np.unique(np.array(df_temp['numtravel']))

            for hour_day in range(640, 1320, 30):   
                int_part, float_part = divmod((hour_day/60), 1)

                if float_part != 0:
                    hour_begin = str(int(int_part)) + ':' + '30'
                    hour_end = str(int(int_part + 1 )) + ':' + '00'
                    hour_test = str(int(int_part + 1 )) + ':' + '30'
                else:
                    hour_begin = str(int(int_part)) + ':' + '00'
                    hour_end = str(int(int_part)) + ':' + '30'
                    hour_test = str(int(int_part + 1 )) + ':' + '00'

                df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
                df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
                
                temp_travels_time = travel_times.loc[np.unique(np.array(df_temp_time['numtravel']))][link]
                
                median_test = np.mean(temp_travels_time)
                time_tests = travel_times.loc[np.unique(np.array(df_temp_test['numtravel']))][link]
                
                if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                    MAPE_day.append([link, float(format(hour_day/60, '.1f')), len(temp_travels_time), median_test,
                         mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                         str(np.array(temp_travels_time)), 'actual'])
#                     print(str(link) + "-" + str(hour_day) + ":" + str(mean_absolute_percentage_error([median_test]*len(time_tests), time_tests)) + "\n")
                
         
                
                if (len(time_tests) > 0) & (math.isnan(median_test) == False) & (np.isnan(time_tests).any() == False):
                    MAPE_array.append(mean_absolute_percentage_error([median_test]*len(time_tests), time_tests))
                    
                    
                    MAPE_day.append([link, float(format(hour_day/60, '.1f')), len(temp_travels_time), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                     str(np.array(time_tests)), 'after'])



            data = pd.DataFrame(MAPE_day,columns=['link', 'time', 'N-travel', 'median', 'MAPE', 'travels_time', 'errors'])
        pd.DataFrame(data).to_html('html/data-30M_mean_' + line + '_' + select_day + '.html')  
        
        if not data.empty:        
            fig = (
                p9.ggplot(data) +
                p9.geom_line(
                    p9.aes(x = 'time', y ='MAPE', group='errors', color='errors')
                ) +
                p9.theme_bw() +
                p9.theme(figure_size = (8, 12)) +
                p9.facet_wrap('~link', ncol = 6) +
                p9.labs(
                    title = 'MAPE of the line '  + line + ' in the day ' + select_day 
                ) +
                p9.theme(figure_size = (18, 24)) 
            )
            p9.ggsave(fig, 'png/' + line + '_' + select_day + '_30M_mean.png')

            fig = (
                p9.ggplot(data) +
                p9.geom_line(
                    p9.aes(x = 'time', y ='MAPE', group='errors', color='errors')
                ) +
                p9.theme_bw() +
                p9.theme(figure_size = (8, 12)) +
                p9.facet_wrap('~link', ncol = 6, scales = "free_y") +
                p9.labs(
                    title = 'MAPE of the line '  + line + ' in the day ' + select_day 
                ) +
                p9.theme(figure_size = (18, 24)) 
            )
            p9.ggsave(fig, 'png/' + line + '_' + select_day + '_30M_mean_free-Y.png') 
            
    MAPE_array = np.array(MAPE_array)
    print(line + ' MAPE(all) = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')

    median_fig, ax = plt.subplots()
    ax.set_title(line + ' - mean = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    ax.boxplot(MAPE_array, notch=True)
    median_fig.savefig(line + '_mean_last_30MIN_all.png' )


    MAPE_array = MAPE_array[MAPE_array < 50]
    print(line + ' MAPE(Thresh>5) = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    

    median_fig, ax = plt.subplots()
    ax.set_title(line + ' - mean = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    ax.boxplot(MAPE_array, notch=True)
    median_fig.savefig(line + '_mean_last_30MIN_MAPE-threshold.png' )




In [None]:
selected_line = ['8700-10-1']

'8700-10-0',
selected_line = ['8700-10-1',                
                '7545-10-1',
                '7545-10-0',
                '6450-10-1',
                '6450-10-0',
                '3301-10-1',
#                 '3301-10-0',
                '2290-10-1',
                '2290-10-0',
#                 '574J-10-1',
#                 '574J-10-0',
#                 '477P-10-1',
                '477P-10-0',
#                 '351F-10-1',
#                 '351F-10-0'
                ]


pth_files_GTFS = "../historical/readData/dados/gtfs/"

trips = pd.read_csv(pth_files_GTFS + 'trips.txt', sep=',')
shapes = pd.read_csv(pth_files_GTFS + 'shapes.txt', sep=',')
stops = pd.read_csv(pth_files_GTFS + 'stops.txt', sep=',')
stopid = pd.read_csv(pth_files_GTFS + 'stop_times.txt', sep=',')   

periods = ['morning', 'm_peak', 'i_peak', 'a_peak', 'night']

for line in selected_line:
#     line = selected_line[0]
    p, mp = stops_distance(line)
    pth_files_lines = "../historical/readData/"

    ### Operação demorada
    df, reps = read(pth_files_lines + "trips_" + line + ".dsk", pth_files_lines + "interps_" + line + ".rep")

    df['day'] = pd.to_datetime(df['day'], format = '%Y-%m-%d')
    df['start'] = pd.to_datetime(df['start'], format = '%H:%M:%S')

    end_date = datetime.date(2017, 9, 1)        

    df = df.loc[((df['day'] >= end_date) & (df['holiday'] != 1)) & ((df['weekday'] != 6) & (df['weekday'] != 5) & (df['weekday'] != 4) & (df['weekday'] != 0))]

    df['exact_time'] = np.array(df['day'], dtype='datetime64[ns]') + \
                        pd.to_timedelta(pd.to_timedelta(pd.DatetimeIndex(df['start']).hour*60 + \
                        pd.DatetimeIndex(df['start']).minute + \
                        pd.DatetimeIndex(df['start']).second/60 + \
                        df['time'], unit='m'))

    df.index = df['exact_time']
    df.iloc[df.index.indexer_between_time('08:00', '20:00')]

    df['day_hour'] = df['exact_time'].apply(hr_func)

    df['link'] = 0
    for i in range(0, len(mp)):
        if (i == 0):
              df.loc[df.loc[(df['distance'] > 0) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    
        if (i == len(mp) -1):
            df.loc[df.loc[(df['distance'] > mp[i]) & (df['distance'] < max(p)),]['link'].index,'link'] = i    
        if (i != 0) & (i != len(mp)-1):
            df.loc[df.loc[(df['distance'] >= mp[i]) & (df['distance'] < mp[i+1]),]['link'].index,'link'] = i    

    df = df.drop_duplicates(subset=['day', 'numtravel', 'link', 'day_hour'])   

    link_df = []
    travels = sorted(list(set(df.numtravel.unique())))
    for tr in travels:
        tck = reps[tr][0]
        tck_mods = [(tck[0],tck[1]-m,tck[2]) for m in mp]       
        tempo = [a[0] if a.size>0 else np.nan for a in [interpolate.sproot(tck_mod) for tck_mod in tck_mods]]
        row = [tuple([tempo[i+1]-pos for i, pos in enumerate(tempo[:-1])])]    
        link_df.append(row)

    travel_times = pd.DataFrame(np.reshape(link_df, (np.shape(link_df)[0], np.shape(link_df)[2])))
    travel_times.index = travels

    MAPE_array = []

    for select_day in df.day.dt.strftime('%Y-%m-%d').unique():
        MAPE_day = []
        for link in range(1, len(mp)-2):
            df_temp = df.loc[(df['day'] == select_day) & (df['link'] == link)]
            # travels_day = np.unique(np.array(df_temp['numtravel']))

            for hour_day in range(640, 1320, 60):   
                int_part, float_part = divmod((hour_day/60), 1)

                hour_begin = str(int(int_part)) + ':' + '00'
                hour_end = str(int(int_part + 1 )) + ':' + '00'
                hour_test = str(int(int_part +  2)) + ':' + '00'
                
                df_temp_time = df_temp.iloc[df_temp.index.indexer_between_time( hour_begin, hour_end)]
                df_temp_test = df_temp.iloc[df_temp.index.indexer_between_time( hour_end, hour_test)]
                
                temp_travels_time = travel_times.loc[np.unique(np.array(df_temp_time['numtravel']))][link]
                
                median_test = np.mean(temp_travels_time)
                time_tests = travel_times.loc[np.unique(np.array(df_temp_test['numtravel']))][link]
                
                if ((len(temp_travels_time) > 0) & (math.isnan(median_test) == False)):
                    MAPE_day.append([link, float(format(hour_day/60, '.1f')), len(temp_travels_time), median_test,
                         mean_absolute_percentage_error([median_test]*len(temp_travels_time), temp_travels_time), 
                         str(np.array(temp_travels_time)), 'actual'])
#                     print(str(link) + "-" + str(hour_day) + ":" + str(mean_absolute_percentage_error([median_test]*len(time_tests), time_tests)) + "\n")
                
         
                
                if (len(time_tests) > 0) & (math.isnan(median_test) == False) & (np.isnan(time_tests).any() == False):
                    MAPE_array.append(mean_absolute_percentage_error([median_test]*len(time_tests), time_tests))
                    
                    
                    MAPE_day.append([link, float(format(hour_day/60, '.1f')), len(temp_travels_time), median_test,
                                     mean_absolute_percentage_error([median_test]*len(time_tests), time_tests), 
                                     str(np.array(time_tests)), 'after'])
            
            
            data = pd.DataFrame(MAPE_day,columns=['link', 'time', 'N-travel', 'median', 'MAPE', 'travels_time', 'errors'])
        pd.DataFrame(data).to_html('html/data-60M_mean_' + line + '_' + select_day + '.html')  
        
        if not data.empty:        
            fig = (
                p9.ggplot(data) +
                p9.geom_line(
                    p9.aes(x = 'time', y ='MAPE', group='errors', color='errors')
                ) +
                p9.theme_bw() +
                p9.theme(figure_size = (8, 12)) +
                p9.facet_wrap('~link', ncol = 6) +
                p9.labs(
                    title = 'MAPE of the line '  + line + ' in the day ' + select_day 
                ) +
                p9.theme(figure_size = (18, 24)) 
            )
            p9.ggsave(fig, 'png/' + line + '_' + select_day + '_60M_mean.png')

            fig = (
                p9.ggplot(data) +
                p9.geom_line(
                    p9.aes(x = 'time', y ='MAPE', group='errors', color='errors')
                ) +
                p9.theme_bw() +
                p9.theme(figure_size = (8, 12)) +
                p9.facet_wrap('~link', ncol = 6, scales = "free_y") +
                p9.labs(
                    title = 'MAPE of the line '  + line + ' in the day ' + select_day 
                ) +
                p9.theme(figure_size = (18, 24)) 
            )
            p9.ggsave(fig, 'png/' + line + '_' + select_day + '_60M_mean_free-Y.png')          

    MAPE_array = np.array(MAPE_array)
    print(line + ' MAPE(all) = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')

    median_fig, ax = plt.subplots()
    ax.set_title(line + ' - mean = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    ax.boxplot(MAPE_array, notch=True)
    median_fig.savefig(line + '_mean_last_60MIN_all.png' )


    MAPE_array = MAPE_array[MAPE_array < 50]
    print(line + ' MAPE(Thresh>5) = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    

    median_fig, ax = plt.subplots()
    ax.set_title(line + ' - mean = ' + str(format(np.mean(MAPE_array), '.2f')) + '%')
    ax.boxplot(MAPE_array, notch=True)
    median_fig.savefig(line + '_mean_last_60MIN_MAPE-threshold.png' )
