In [170]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_file
from bokeh.models import Band
from bokeh.models.annotations import Title

In [51]:
def plot_lines(data_file, output_file, depth_type, start_date, end_date, num_outliers, use_top50=False):
    data = pd.read_csv(data_file)
    data = data.set_index(pd.date_range(start=start_date, end=end_date).date)
    data = data.transpose()
    data = data.set_index(pd.date_range(start=start_date, periods=24, freq='H').time)

    data_outputs = pd.read_csv(output_file).set_index(pd.date_range(start=start_date, end=end_date).date)

    data_outputs = data_outputs.transpose()

    data_final = pd.concat([data,data_outputs])

    data_final = data_final.sort_values(by=depth_type,axis=1)
    
    data_outliers = data_final.iloc[:,:num_outliers]
    data_median   = data_final.iloc[:,data_final.shape[1]-1]
    if use_top50:
        data_top50    = data_final.iloc[:,(data_final.shape[1]//2):data_final.shape[1]-1]
        data_ordered  = pd.concat([data_top50, data_outliers, data_median], axis=1)
    else:
        data_middle   = data_final.iloc[:,num_outliers:data_final.shape[1]-1]
        data_ordered  = pd.concat([data_middle, data_outliers, data_median], axis=1)
    pcrint(data_ordered.shape)
    
    color_top50 = ["#fbb4b9"] * (data_ordered.shape[1] - (num_outliers+1))
    color_median = ["Blue"]
    color_outliers = ["Red"] * num_outliers
    color_list = color_top50 + color_outliers + color_median
    
    alpha_list = [0.5] * data_final.shape[1]
    alpha_list[0:num_outliers] = [1] * num_outliers
    alpha_list[len(alpha_list)-1] = 1

    numlines = len(data_ordered.columns)

    p = figure(width=1000, height=800, x_axis_type="datetime") 
    p.multi_line(xs=[data_ordered.index.values]*numlines,
                 ys=[data_ordered[name].values for name in data_ordered.iloc[0:24]],
                 line_color=color_list,
                 line_width=5)
    return p

In [52]:
output_file('taxis_v3_top50.html')
plot_taxis_v3 = plot_lines('../data/taxis_v3.csv',"../outputs/taxis_v3_out.txt",'tmd','1/1/2014','12/31/2018', 20, use_top50=True)
show(plot_taxis_v3)

(36, 933)


In [53]:
output_file('taxis_v3_full.html')
plot_taxis_v3 = plot_lines('../data/taxis_v3.csv',"../outputs/taxis_v3_out.txt",'tmd','1/1/2014','12/31/2018', 20)
show(plot_taxis_v3)

(36, 1826)


In [54]:
output_file('taxis_v2_top50.html')
plot_taxis_v2 = plot_lines('../data/taxis_v2.csv',"../outputs/taxis_v2_out.txt",'tmd','1/1/2016','12/31/2018', 20, use_top50=True)
show(plot_taxis_v2)

(36, 568)


In [55]:
output_file('taxis_v2_full.html')
plot_taxis_v2 = plot_lines('../data/taxis_v2.csv',"../outputs/taxis_v2_out.txt",'tmd','1/1/2016','12/31/2018', 20)
show(plot_taxis_v2)

(36, 1096)


In [56]:
output_file('taxis_v1_top50.html')
plot_taxis_v1 = plot_lines('../data/taxis_v1.csv',"../outputs/taxis_v1_out.txt",'tmd','1/1/2018','12/31/2018', 20, use_top50=True)
show(plot_taxis_v1)

(36, 203)


In [57]:
output_file('taxis_v1_full.html')
plot_taxis_v1 = plot_lines('../data/taxis_v1.csv',"../outputs/taxis_v1_out.txt",'od','1/1/2018','12/31/2018', 20)
show(plot_taxis_v1)

(36, 365)


In [58]:
output_file('airquality_v1_top50.html')
plot_airquality_v1 = plot_lines('../data/airquality_v1.csv',"../outputs/airquality_v1_out.txt",'fd','03/10/2004','04/04/2005',20, use_top50=True)
show(plot_airquality_v1)

(36, 216)


In [59]:
output_file('airquality_v1_full.html')
plot_airquality_v1 = plot_lines('../data/airquality_v1.csv',"../outputs/airquality_v1_out.txt",'fd','03/10/2004','04/04/2005',20)
show(plot_airquality_v1)

(36, 391)


In [35]:
data_outputs['td'].quantile([0.25,0.5,0.75])

NameError: name 'data_outputs' is not defined

# Functional Boxplot

In [146]:
def prepare_data(data_file, output_file, start_date, end_date, index_days=None):
    data = pd.read_csv(data_file)
    if index_days:
        print(a)
    else:
        data = data.set_index(pd.date_range(start=start_date, end=end_date).date)
    data = data.transpose()
    data = data.set_index(pd.date_range(start=start_date, periods=24, freq='H').time)

    data_outputs = pd.read_csv(output_file).set_index(pd.date_range(start=start_date, end=end_date).date)
    data_outputs = data_outputs.transpose()

    data_final   = pd.concat([data,data_outputs])
    return data_final

def get_envelopes(data_top50):
    df_max = data_top50.max(axis=1)
    df_min = data_top50.min(axis=1)
    
    iqr = df_max - df_min
    mid = (df_max + df_min)//2
    out_top = mid + (0.75*iqr)
    out_bot = mid - (0.75*iqr)
    
    return pd.concat({'top':df_max,'bot':df_min,'out_top':out_top,'out_bot':out_bot},axis=1)

def get_outliers(raw_outliers, data_envelopes):
    data_outliers = pd.DataFrame()
    for col in raw_outliers.columns:
        if (raw_outliers[col][:24] > data_envelopes['out_top'][:24]).any() or (raw_outliers[col][:24] < data_envelopes['out_bot'][:24]).any():
            data_outliers = pd.concat([data_outliers,raw_outliers[col]],axis=1)
    return data_outliers

def functional_boxplot(data_file, output_file, start_date, end_date, depth_type):

    data_final   = prepare_data(data_file, output_file, start_date, end_date)
    data_final   = data_final.sort_values(by=depth_type,axis=1)
    
    data_median    = data_final.iloc[:,data_final.shape[1]-1]
    data_top50     = data_final.iloc[:,(data_final.shape[1]//2):data_final.shape[1]-1]
    data_envelopes = get_envelopes(data_top50)
    raw_outliers   = data_final.iloc[:,:(data_final.shape[1]//2)]
    data_outliers  = get_outliers(raw_outliers,data_envelopes)
    print(raw_outliers.shape)
    print(data_outliers.shape)
    data_ordered   = pd.concat([data_envelopes,data_outliers,data_median],axis=1)

    color_list = ["#f768a1","black","black","#f768a1"]+(["red"]*data_outliers.shape[1])+["blue"]
    alpha_list = [1.0]*4 + ([0.3]*data_outliers.shape[1]) + [1.0]
    
    numlines = len(data_ordered.columns)

    p = figure(width=1000, height=800, x_axis_type="datetime") 
    
    upper_band = np.array(data_envelopes['top'].iloc[0:24])
    lower_band = np.array(data_envelopes['bot'].iloc[0:24])
    x          = np.array(data_ordered.index[0:24])
    
    xs = np.concatenate([x, x[::-1]])
    ys = np.concatenate([lower_band, upper_band[::-1]])
    
    p.patch(x=xs, y=ys, fill_color="#f768a1", fill_alpha=0.8, line_alpha=0, legend="IQR")
    
    p.multi_line(xs=[data_ordered.index.values]*numlines,
                 ys=[data_ordered[name].values for name in data_ordered.iloc[0:24]],
                 line_color=color_list,
                 line_width=5,
                 alpha=alpha_list)

    return p

## Plot 1 year of taxis data

In [158]:
output_file('fbplot_taxisv1.html')
fbplot_taxisv1 = functional_boxplot('../data/taxis_v1.csv',"../outputs/taxis_v1_out.txt",'01/01/2018','12/31/2018','tmd')
show(fbplot_taxisv1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(36, 182)
(36, 58)


## Plot 1 year of air quality data

In [152]:
output_file('fbplot_airquality_v1.html')
fbplot_taxisv1 = functional_boxplot('../data/airquality_v1.csv',"../outputs/airquality_v1_out.txt",'03/10/2004','04/04/2005','fd')
show(fbplot_taxisv1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(36, 195)
(36, 100)


# Split datasets into weekdays and weekends

In [137]:
def split_datasets(data_file, output_file, start_date, end_date):

    data_aux = prepare_data(data_file, output_file, start_date, end_date)

    aux = []
    for col in data_aux.columns:
        aux.append(col.isoweekday())

    data_aux = data_aux.transpose()
    data_aux["weekday"] = aux
    data_weekends = data_aux.loc[data_aux["weekday"] > 5].drop("weekday",axis=1).transpose()
    data_weekdays = data_aux.loc[data_aux["weekday"] < 6].drop("weekday",axis=1).transpose()
    return data_weekdays, data_weekends

data_weekdays_taxi, data_weekends_taxi = split_datasets('../data/taxis_v1.csv',"../outputs/taxis_v1_out.txt",'01/01/2018','12/31/2018')
data_weekdays_airquality, data_weekends_airquality = split_datasets('../data/airquality_v1.csv',"../outputs/airquality_v1_out.txt",'03/10/2004','04/04/2005')

print(data_weekdays_taxi.shape, data_weekends_taxi.shape)
print(data_weekdays_airquality.shape, data_weekends_airquality.shape)

(36, 261) (36, 104)
(36, 279) (36, 112)


In [160]:
def functional_boxplot_from_df(data, depth_type):
    
    data_final   = data.sort_values(by=depth_type,axis=1)
    
    data_median    = data_final.iloc[:,data_final.shape[1]-1]
    data_top50     = data_final.iloc[:,(data_final.shape[1]//2):data_final.shape[1]-1]
    data_envelopes = get_envelopes(data_top50)
    raw_outliers   = data_final.iloc[:,:(data_final.shape[1]//2)]
    data_outliers  = get_outliers(raw_outliers,data_envelopes)
    print(raw_outliers.shape)
    print(data_outliers.shape)
    data_ordered   = pd.concat([data_envelopes,data_outliers,data_median],axis=1)

    color_list = ["#f768a1","black","black","#f768a1"]+(["red"]*data_outliers.shape[1])+["blue"]
    alpha_list = [1.0]*4 + ([0.3]*data_outliers.shape[1]) + [1.0]
    
    numlines = len(data_ordered.columns)

    p = figure(width=1000, height=800, x_axis_type="datetime") 
    
    upper_band = np.array(data_envelopes['top'].iloc[0:24])
    lower_band = np.array(data_envelopes['bot'].iloc[0:24])
    x          = np.array(data_ordered.index[0:24])
    
    xs = np.concatenate([x, x[::-1]])
    ys = np.concatenate([lower_band, upper_band[::-1]])
    
    p.patch(x=xs, y=ys, fill_color="#f768a1", fill_alpha=0.8, line_alpha=0, legend="IQR")
    
    p.multi_line(xs=[data_ordered.index.values]*numlines,
                 ys=[data_ordered[name].values for name in data_ordered.iloc[0:24]],
                 line_color=color_list,
                 line_width=5,
                 alpha=alpha_list)

    return p

In [173]:
fbplot_taxisv1_weekend = functional_boxplot_from_df(data_weekends_taxi,'od')
fbplot_taxisv1_weekday = functional_boxplot_from_df(data_weekdays_taxi,'od')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(36, 52)
(36, 25)
(36, 130)
(36, 10)


In [174]:
show(fbplot_taxisv1_weekday)
show(fbplot_taxisv1_weekend)

In [175]:
fbplot_aqv1_weekend = functional_boxplot_from_df(data_weekends_airquality,'od')
fbplot_aqv1_weekday = functional_boxplot_from_df(data_weekdays_airquality,'od')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(36, 56)
(36, 13)
(36, 139)
(36, 64)


In [176]:
show(fbplot_aqv1_weekday)
show(fbplot_aqv1_weekend)

# Tests and sandbox

In [None]:
def get_color_from_depth(depth):
    '''
    if depth < 0.270425: return '#fef0d9'
    if depth < 0.353502: return '#fdcc8a'
    '''
    if depth < 0.074495: return '#fef0d9'
    else: return '#fdcc8a' 


data_outputs['color'] = data_outputs['td'].map(lambda x: get_color_from_depth(x))

In [9]:
p = figure(x_axis_type="datetime", plot_width=950, title = "Band plot")
#p.xaxis.major_label_orientation = pi/4
p.xaxis.axis_label = "X"
p.xaxis.axis_label_text_font_style='normal'
p.yaxis.axis_label = "Y"
p.yaxis.axis_label_text_font_style='normal'
p.grid.grid_line_alpha=0.3

# Values to be plotted
upper_band = np.array([30,  32, 34, 35, 35, 33, 32, 31, 30])
lower_band = np.array([25,  27, 30, 31, 32, 30, 30, 29, 28])
x          = np.array([1,   2,  3,  4,  5,  6,  7,  8,  9])

# Bands are drawn as patches. That is, a polygon specified by a series of 2D points
# Because points are specified in clockwise order, the lower band needs to be reverse (Hence the [::-1])
xs = np.concatenate([x, x[::-1]])
ys = np.concatenate([upper_band, lower_band[::-1]])

# Draw the area patch without border
p.patch(x=xs, y=ys, fill_alpha=0.3, line_alpha=0, legend="Band")

# Draw the respective lines
p.line(x, upper_band, line_alpha=0.8)
p.line(x, lower_band, line_alpha=0.8)

show(p)

In [25]:
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
df['MAX'] = np.ones(10)
df['MIN'] = [0.5] * 10
df

Unnamed: 0,A,B,C,D,MAX,MIN
0,0.444939,0.407554,0.460148,0.465239,1.0,0.5
1,0.462691,0.016545,0.850445,0.817744,1.0,0.5
2,0.777962,0.757983,0.934829,0.831104,1.0,0.5
3,0.879891,0.926879,0.721535,0.117642,1.0,0.5
4,0.145906,0.199844,0.437564,0.100702,1.0,0.5
5,0.278735,0.609862,0.085823,0.836997,1.0,0.5
6,0.739635,0.866059,0.691271,0.377185,1.0,0.5
7,0.225146,0.43528,0.7009,0.700946,1.0,0.5
8,0.796487,0.018688,0.700566,0.900749,1.0,0.5
9,0.764869,0.2532,0.548054,0.778883,1.0,0.5


In [36]:
aux = pd.DataFrame()
df['B'] = ([0.8]*10)
df['A'] = 
for col in df.columns:
    if col != 'MAX' and col != 'MIN':
        if (df[col] > df['MAX']).any() or (df[col] < df['MIN']).any(): 
            aux = pd.concat([aux,df[col]],axis=1)
aux

Unnamed: 0,A,C,D
0,0.444939,0.460148,0.465239
1,0.462691,0.850445,0.817744
2,0.777962,0.934829,0.831104
3,0.879891,0.721535,0.117642
4,0.145906,0.437564,0.100702
5,0.278735,0.085823,0.836997
6,0.739635,0.691271,0.377185
7,0.225146,0.7009,0.700946
8,0.796487,0.700566,0.900749
9,0.764869,0.548054,0.778883


In [100]:
days = pd.date_range(start='01/01/2018', end='12/31/2018')
weekdays = 
for day in days

DatetimeIndex(['2018-01-01', '2018-04-02', '2018-07-02', '2018-10-01',
               '2018-12-31'],
              dtype='datetime64[ns]', freq=None)