In [1]:
import pandas as pd
import numpy as np
from src.funct import sum_and_round, mean_and_round, plot_boxplot_with_points


import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt

## Basic Boxplot plot

Step by step

- load data
- filter data that we will use in the chapter
- build the boxplot_data containing information about `sales`, `quantity`, `discount` and `profit` as attributes
- build the basic box plot - without and with numbers to illustrate what the `altair` boxplot provides as information
- using the `sales` to demonstrate how to calculate median and quartiles


In [2]:
# Read the Superstore workbook and keep only the columns used in this chapter.
columns_of_interest = ['Region', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit']
data = pd.read_excel('./data/superstore_data.xlsx')[columns_of_interest]

# One output column per measure, aggregated per Sub-Category.
# `sum_and_round` / `mean_and_round` are project helpers imported from src.funct.
measure_aggregations = {
    'sales': ('Sales', sum_and_round),
    'quantity': ('Quantity', 'sum'),
    'discount': ('Discount', mean_and_round),
    'profit': ('Profit', sum_and_round),
}

boxplot_data = (
    data
    .groupby(['Sub-Category'])
    .agg(**measure_aggregations)
    .reset_index()
    # constant label, used later as the single category on the y axis of the box plot
    .assign(type='Sub-Category')
)

boxplot_data.head()

Unnamed: 0,Sub-Category,sales,quantity,discount,profit,type
0,Accessories,167380.0,2976,0.0785,41937.0,Sub-Category
1,Appliances,107532.0,1729,0.1665,18138.0,Sub-Category
2,Art,27119.0,3000,0.0749,6528.0,Sub-Category
3,Binders,203413.0,5974,0.3723,30222.0,Sub-Category
4,Bookcases,114880.0,868,0.2111,-3473.0,Sub-Category


## Boxplot without points

In [3]:
# Column plotted on the x axis. `measure_encode` is the same field name with an
# explicit quantitative (":Q") type suffix; it is defined here but not used by this chart.
measure = 'sales'
measure_encode = measure + ':Q'

# A single horizontal box: every row shares the same `type` value on the y axis.
box = alt.Chart(boxplot_data).mark_boxplot(
    size=25,
    opacity=0.8,
    color='#76B7B2',
).encode(
    x=alt.X(measure),
    y=alt.Y('type:O', title=''),
)

chart = box.properties(
    title=measure + ' by Sub-category box plot',
    width=620,
    height=80,
)

chart

In [4]:
# Keep only the columns needed to illustrate the median with the `sales` measure.
sales = boxplot_data.loc[:, ['Sub-Category', 'sales', 'type']]

## Boxplot with points

In [5]:
# One dot per sub-category; the tooltip shows its name and its total sales.
points = alt.Chart(sales).mark_point(
    size=50,
    filled=True,
    stroke='black',
    strokeWidth=1,
    color='#9C755F',
).encode(
    x=alt.X('sales:Q'),
    tooltip=[
        alt.Tooltip("Sub-Category:N", title="Category"),
        alt.Tooltip("sales:Q", title='Sales', format=",.0f"),
    ],
)

# Box plot of the same column, drawn underneath the dots.
box = alt.Chart(sales).mark_boxplot(
    size=25,
    opacity=0.8,
    color='#76B7B2',
).encode(
    x=alt.X('sales:Q'),
)

# Layer order matters: the box first, the points on top of it.
chart = (box + points).properties(
    title='Sales by Sub-category box plot',
    width=620,
    height=80,
)

chart

## Median

Let's start from the definition of the median of a dataset

> The ***median*** of a data set is its middle number when sorted in an ascending or descending order. It's the point above and below which $50\%$ of the observed data falls so it represents the midpoint of the data. 

The median is also referred to as the $1/2$ quantile, a cutoff point dividing the dataset into equal halves. For discrete datasets, there may be one or more points that could serve as the median. For example, consider the sum of sales for each sub-category in the Super Store dataset, which contains an odd number of elements.

If we have an odd number of elements, sorting them in ascending or descending order allows us to pick the middle element as the median. This is because exactly $50\%$ of the elements will be less than this middle value, and $50\%$ will be greater. The middle point itself, or median, serves as the dividing line indicating the dataset's midpoint without being part of either half.

In this example, as we have 17 data points, we first sort them in ascending order and then select the 9<sup>th</sup> element as the median of the dataset.

---

Previous version:
For discrete sets of data points, we can have one median or multiple median points. Consider the sum of sales for each Sub-Category of the Super Store dataset. This dataset contains an odd number of elements. If we have an odd number of elements and we sort it in ascending or descending order then its middle element is the median of the set, as exactly $50\%$ of its elements will be less than this number and $50\%$ of the data will be greater. The middle point itself, i.e. the median, in this case is not properly contained in one of the halves, but represents the cut point indicating the middle of the set.



In [6]:
# Positions are 0-based in Python, so with an odd number of rows the integer
# half of the row count is exactly the position of the middle row.
middle_index = sales.shape[0] // 2
median_row = sales.sort_values('sales').iloc[middle_index]
print("Median of the Dataset:\n{}".format(median_row))

Median of the Dataset:
Sub-Category       Bookcases
sales               114880.0
type            Sub-Category
Name: 4, dtype: object


The following plot highlights the middle point (the median) and shows, in different colors, the elements that are smaller and greater than it.

In [7]:
from altair import datum #Needed for subsetting (transforming data)

# Sales value of the middle row of the sorted table, i.e. the median.
median = sales.sort_values('sales').iloc[middle_index]['sales']

# Mark properties shared by the two point layers below.
dot_style = dict(size=50, filled=True, stroke='black', strokeWidth=1)

# Every sub-category as a dot: steelblue when above the median, orange otherwise.
points = alt.Chart(sales).mark_point(**dot_style).encode(
    x=alt.X('sales:Q'),
    color=alt.condition(
        alt.datum.sales > median,
        alt.value("steelblue"),
        alt.value("orange"),
    ),
    tooltip=['Sub-Category:N', 'sales:Q'],
)

# The row(s) whose sales equal the median, drawn in dark grey.
middle_point = alt.Chart(sales.loc[sales['sales'] == median]).mark_point(**dot_style).encode(
    x=alt.X('sales:Q'),
    color=alt.value("#484847"),
    tooltip=['Sub-Category:N', 'sales:Q'],
)

# Thin vertical rule at the median, drawn between the fixed y positions 27 and 53.
median_frame = pd.DataFrame({'x': [median]})
vertical_line_middle_point = alt.Chart(median_frame).mark_rule(
    color='black',
    strokeWidth=0.7,
).encode(
    x='x:Q',
    y=alt.value(27),
    y2=alt.value(53),
)

box = alt.Chart(sales).mark_boxplot(
    size=25,
    opacity=0.8,
    color='#76B7B2',
).encode(
    x=alt.X('sales:Q'),
)

# Dollar-formatted label, kept only for the row whose sales equal the median.
annotation = (
    alt.Chart(sales)
    .mark_text(align='center', baseline='top', fontSize=12, dx=0, dy=-30)
    .encode(
        x='sales',
        text=alt.Text('sales', format='$,.0f'),
    )
    .transform_filter(datum.sales == median)
)

layered = box + points + middle_point + vertical_line_middle_point + annotation
chart = layered.properties(
    title='Sales by Sub-category box plot',
    width=620,
    height=80,
)

chart

What happens when we have an even number of data points? In this case, we can exactly divide the dataset into two subsets with the same number of elements each ($50\%$), and so the median is not a data point of the dataset but, more formally, the interval of all the values between the upper bound of the lower half and the lower bound of the upper half. To visualize this concept, let's remove the median point from our sales dataset and consider the remaining 16 elements, resulting in an even-sized dataset.

In [8]:
# Sort by sales and renumber the rows 0..n-1, so that the label of the middle
# row equals its position.
sorted_sales = sales.sort_values('sales').reset_index(drop=True)

# Drop that middle (median) row and renumber again: an even number of rows remains.
sales_even = sorted_sales.drop(sales.shape[0] // 2).reset_index(drop=True)

len(sales_even)

16

In [9]:
from altair import datum #Needed for subsetting (transforming data)

# The two rows on either side of the cut: the row at position n/2 - 1 and the
# row at position n/2 of `sales_even`.
half_size = int(sales_even.shape[0]/2)
lower_row = sales_even.iloc[half_size - 1]
upper_row = sales_even.iloc[half_size]
first_half = lower_row['sales']
second_half = upper_row['sales']


def _sales_label(row, color, dy):
    """Text layer showing the dollar-formatted `sales` of one row, offset vertically by `dy`."""
    return (
        alt.Chart(row.to_frame().transpose())
        .mark_text(align='center', baseline='top', color=color, fontSize=12, dx=0, dy=dy)
        .encode(
            x='sales',
            text=alt.Text('sales', format='$,.0f'),
        )
    )


def _vertical_rule(x_value):
    """Thin black vertical rule at `x_value`, drawn between the fixed y positions 27 and 53."""
    return (
        alt.Chart(pd.DataFrame({'x': [x_value]}))
        .mark_rule(color='black', strokeWidth=0.7)
        .encode(
            x='x:Q',
            y=alt.value(27),
            y2=alt.value(53),
        )
    )


box = alt.Chart(sales_even).mark_boxplot(
    size=25,
    opacity=0.9,
    color='silver',
).encode(
    x=alt.X('sales:Q'),
)

# Steelblue for values above `first_half`, orange for the rest.
points = alt.Chart(sales_even).mark_point(
    size=50,
    filled=True,
    stroke='black',
    strokeWidth=1,
).encode(
    x=alt.X('sales:Q'),
    color=alt.condition(
        alt.datum.sales > first_half,
        alt.value("steelblue"),
        alt.value("orange"),
    ),
    tooltip=['Sub-Category:N', 'sales:Q'],
)

# Labels: positive dy for the left value, negative dy for the right one.
annotation_left = _sales_label(lower_row, '#aa4905', 18)
annotation_right = _sales_label(upper_row, '#0566AA', -25)

point = first_half
vertical_line_left = _vertical_rule(point)

point = second_half
vertical_line_right = _vertical_rule(point)

layered = box + points + annotation_left + vertical_line_left + annotation_right + vertical_line_right
chart = layered.properties(
    title='Sales by Sub-category box plot',
    width=620,
    height=80,
)

chart

The plot above shows how the sorted dataset is evenly split into two groups with the same number of elements. In this case, there are different ways to calculate the median, with linear interpolation being the most common one. The general 2D linear interpolation equation is given by:

$$
y = y_{1}+(x-x_{1}) \frac{y_{2}-y_{1}}{x_{2}-x_{1}}
$$

Where:
- $x_{1}$ and $y_{1}$ are the first coordinates
- $x_{2}$ and $y_{2}$ are the second coordinates
- $x$ is the point to perform the interpolation
- $y$ is the interpolated value

In our case, the median is the middle point between the upper bound of the first half and the lower bound of the second half after sorting the dataset.

Let's now write down a function to calculate the median by considering the case in which the size of the dataset is an even or an odd number of elements. Summing up:
1. Sort all elements in the dataset from the smallest to largest
2. Calculate the position of the middle value
3. If the size of the dataset is an odd number, return the middle value, otherwise average the two middle values 

In [10]:
def dataset_median(dataset):
    """
    Compute the median of a dataset.

    The observations are sorted in ascending order. With an odd number of
    observations the middle one is returned; with an even number, the
    arithmetic mean of the two middle ones is returned (linear interpolation
    halfway between them).

    Parameters:
    -----------
    dataset : array-like
        Array of observations (pandas Series)
    
    Returns:
    --------
    float
        The computed median value; NaN if the dataset is empty
        (the same result `pandas.Series.median` gives for an empty Series)
    """
    
    size = len(dataset)
    if size == 0:
        # no middle element exists; avoid indexing into an empty Series
        return float('nan')

    # sort once and reuse the result for every lookup
    ordered = dataset.sort_values()
    midpoint = size // 2

    # parity must be tested on the size itself, not on half of it:
    # testing size/2 misclassifies sizes such as 2, 6 or 10 as odd
    if size % 2 != 0:
        # if the length of the dataset is odd, return the middle point
        return ordered.iloc[midpoint]
    # otherwise, return the arithmetic mean of the two middle points
    return (ordered.iloc[midpoint - 1] + ordered.iloc[midpoint]) / 2

Finally, we can compare our function with the `pandas` and `numpy` built-in functions.

In [11]:
# Odd-sized dataset: all sub-categories.
dataset = sales['sales']
diy_median = dataset_median(dataset)

print("size of the dataset = {}".format(len(dataset)))
print("median using the diy function: {}".format(diy_median))
print("median using pandas median   : {}".format(dataset.median()))
print("median using numpy median   : {}".format(np.median(dataset)))
## printing the Sub-Category
# exact-match lookup of the row whose sales equal the computed median
matching_subcategories = sales.loc[sales['sales'] == diy_median, 'Sub-Category']
print("Sub-Category: {}".format(matching_subcategories.values[0]))

# Even-sized dataset: the same data with the median row removed.
dataset = sales_even['sales']
diy_median = dataset_median(dataset)

print("\nsize of the dataset = {}".format(len(dataset)))
print("median using the diy function: {}".format(diy_median))
print("median using pandas median   : {}".format(dataset.median()))
print("median using numpy median   : {}".format(np.median(dataset)))

size of the dataset = 17
median using the diy function: 114880.0
median using pandas median   : 114880.0
median using numpy median   : 114880.0
Sub-Category: Bookcases

size of the dataset = 16
median using the diy function: 128530.0
median using pandas median   : 128530.0
median using numpy median   : 128530.0
