In [4]:
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from altair import datum  # expression helper for chart filters/conditions

from src.funct import sum_and_round, mean_and_round, plot_boxplot_with_points

In [5]:
# Read the Superstore workbook, list the inferred column dtypes, then preview the rows.
superstore_path = './data/superstore_data.xlsx'
data = pd.read_excel(superstore_path)
print(data.dtypes)
data.head()

Row ID                    int64
Order ID                 object
Order Date       datetime64[ns]
Ship Date        datetime64[ns]
Ship Mode                object
Customer ID              object
Customer Name            object
Segment                  object
Country                  object
City                     object
State                    object
Postal Code               int64
Region                   object
Product ID               object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [6]:
# Keep only the columns used by the aggregations and charts below.
analysis_columns = ['Region', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit']
data = data[analysis_columns]

In [7]:
# One row per Sub-Category; the reducers come from the project helpers in src.funct.
subcategory_aggregations = dict(
    sales = ('Sales', sum_and_round),
    quantity = ('Quantity', 'sum'),
    discount = ('Discount', mean_and_round),
    profit = ('Profit', sum_and_round),
)

boxplot_data = (
    data
    .groupby(['Sub-Category'])
    .agg(**subcategory_aggregations)
    .reset_index()
)

boxplot_data.head()

Unnamed: 0,Sub-Category,sales,quantity,discount,profit
0,Accessories,167380.32,2976,0.0785,41936.64
1,Appliances,107532.16,1729,0.1665,18138.01
2,Art,27118.79,3000,0.0749,6527.79
3,Binders,203412.73,5974,0.3723,30221.76
4,Bookcases,114880.0,868,0.2111,-3472.56


In [14]:
# Column to plot and its Altair shorthand with an explicit quantitative type.
measure = 'discount'
measure_encode = measure + ':Q'

# One dot per Sub-Category, laid over the box.
points = (alt
          .Chart(boxplot_data)
          .mark_point(size = 50, filled = True, color = '#B07AA1')
          .encode(
              x = alt.X(measure_encode),
              tooltip = ['Sub-Category:N', measure_encode]
          )
         )

# Horizontal box summarising the same measure.
box = (alt
       .Chart(boxplot_data)
       .mark_boxplot(size = 25, color = 'silver')
       .encode(
           x = alt.X(measure_encode),
       )
      )

chart = (box + points).properties(
    title = measure + ' by Sub-category box plot',
    width = 620,
    height = 80
)

chart

In [15]:
# Narrow the aggregate down to the single measure used in the median walkthrough.
sales = boxplot_data.loc[:, ['Sub-Category', 'sales']]

In [32]:
# Shared encodings for the two layers.
sales_axis = alt.X('sales:Q')
point_tooltip = ['Sub-Category:N', 'sales:Q']

# One outlined dot per Sub-Category.
points = alt.Chart(sales).mark_point(
    size = 50,
    filled = True,
    stroke = 'black',
    strokeWidth = 1,
    color = '#9C755F',
).encode(x = sales_axis, tooltip = point_tooltip)

# Semi-transparent box drawn underneath the dots.
box = alt.Chart(sales).mark_boxplot(
    size = 25,
    opacity = 0.8,
    color = '#76B7B2',
).encode(x = sales_axis)

chart = (box + points).properties(
    title = 'Sales by Sub-category box plot',
    width = 620,
    height = 80
)

chart

## Median

- Definition

> The ***median*** of a data set is its middle number when sorted in an ascending or descending order. It's the point above and below which $50\%$ of the observed data falls so it represents the midpoint of the data. 

The median is also called the $1/2$ quantile: it is the cut point dividing the dataset into its first and second half. For discrete sets of data points, we can have one median or multiple median points. Consider the sum of sales for each Sub-Category of the Super Store dataset. This dataset contains an odd number of elements. If we have an odd number of elements and we sort them in ascending or descending order, then the middle element is the median of the set, as exactly $50\%$ of the remaining elements will be less than this number and $50\%$ will be greater. The middle point itself, i.e. the median, is in this case not contained in either of the halves, but represents the cut point indicating the middle of the set.

In our example, as we have 17 data points, we first sort them in ascending order and then pick the 9<sup>th</sup> element as the median of the dataset.

In [None]:
# Positions are 0-based, so the floor of n/2 lands on the middle row of an odd-sized frame.
middle_index = len(sales) // 2
middle_row = sales.sort_values('sales').iloc[middle_index]
print("Median of the Dataset:\n{}".format(middle_row))

The following plot shows the boxplot with its elements and the middle point:

In [None]:
# Sales value of the middle observation of the sorted (odd-sized) dataset.
median = sales.sort_values('sales').iloc[middle_index]['sales']

# All points: blue above the median, orange at or below it.
points = (alt
          .Chart(sales)
          .mark_point(size = 50,
                      filled = True,
                      stroke = 'black',
                      strokeWidth = 1)
          .encode(
              x = alt.X('sales:Q'),
              color = alt.condition(
                  alt.datum.sales > median,
                  alt.value("steelblue"),
                  alt.value("orange")
              ),
              tooltip = ['Sub-Category:N', 'sales:Q']
          )
         )

# The median observation itself, drawn on top in dark grey.
middle_point = (alt
          .Chart(sales[sales['sales'] == median])
          .mark_point(size = 50,
                      filled = True,
                      stroke = 'black',
                      strokeWidth = 1)
          .encode(
              x = alt.X('sales:Q'),
              color = alt.value("#484847"),
              tooltip = ['Sub-Category:N', 'sales:Q']
          )
         )

# Thin rule at the median; 27..53 px is roughly the extent of the 25 px box
# centred in the 80 px high chart.
vertical_line_middle_point = (alt
                 .Chart(pd.DataFrame({'x': [median]}))
                 .mark_rule(
                     color = 'black',
                     strokeWidth = 0.7
                 )
                 .encode(
                     x = 'x:Q',
                     y = alt.value(27),
                     y2 = alt.value(53)
                 )
                )

box = (alt
       .Chart(sales)
       .mark_boxplot(size = 25,
                     color = 'silver')
       .encode(
           x = alt.X('sales:Q'),
       )
      )

# Dollar label above the median point; the filter keeps only that row.
annotation = (alt
              .Chart(sales)
              .mark_text(
                  align = 'center',
                  baseline = 'top',
                  fontSize = 12,
                  dx = 0,
                  dy = -30
              ).encode(
                  x = 'sales:Q',
                  text = alt.Text('sales:Q', format = '$,.0f')
              ).transform_filter(
                  alt.datum.sales == median
              )
             )

chart = (box + points + middle_point + vertical_line_middle_point + annotation).properties(
    title = 'Sales by Sub-category box plot',
    width = 620,
    height = 80
)

chart

What happens when we have an even number of data points? In this case, we can divide the dataset exactly into two subsets with the same number of elements each ($50\%$), so the median is not a data point of the dataset but, more formally, any value in the interval between the upper bound of the lower half and the lower bound of the upper half. To visualize this concept, let's remove the median point from our sales dataset and consider the remaining 16 elements, resulting in an even-sized dataset.

In [33]:
# Drop the middle row of the sorted frame so that an even number of rows remains.
middle_row = len(sales) // 2
sales_even = (
    sales
    .sort_values('sales')
    .reset_index(drop = True)
    .drop(middle_row)
    .reset_index(drop = True)
)

sales_even.shape[0]

16

In [None]:
# 0-based positions of the two observations on either side of the split
# (`sales_even` is already sorted by sales).
upper_start = len(sales_even) // 2
lower_end = upper_start - 1
first_half = sales_even.iloc[lower_end]['sales']     # largest value of the lower half
second_half = sales_even.iloc[upper_start]['sales']  # smallest value of the upper half

box = (alt
       .Chart(sales_even)
       .mark_boxplot(size = 25,
                     opacity = 0.9,
                     color = 'silver')
       .encode(
           x = alt.X('sales:Q'),
       )
      )

# Lower half in orange, upper half in blue.
points = (alt
          .Chart(sales_even)
          .mark_point(size = 50,
                      filled = True,
                      stroke = 'black',
                      strokeWidth = 1)
          .encode(
              x = alt.X('sales:Q'),
              color = alt.condition(
                  alt.datum.sales > first_half,
                  alt.value("steelblue"),
                  alt.value("orange")
              ),
              tooltip = ['Sub-Category:N', 'sales:Q']
          )
         )

# `.iloc[[i]]` keeps a one-row DataFrame with its numeric dtype, which a
# `.to_frame().transpose()` round trip would turn into object columns.
annotation_left = (alt
              .Chart(sales_even.iloc[[lower_end]])
              .mark_text(
                  align = 'center',
                  baseline = 'top',
                  color = '#aa4905',
                  fontSize = 12,
                  dx = 0,
                  dy = 18
              ).encode(
                  x = 'sales:Q',
                  text = alt.Text('sales:Q', format = '$,.0f')
              )
             )

annotation_right = (alt
              .Chart(sales_even.iloc[[upper_start]])
              .mark_text(
                  align = 'center',
                  baseline = 'top',
                  color = '#0566AA',
                  fontSize = 12,
                  dx = 0,
                  dy = -25
              ).encode(
                  x = 'sales:Q',
                  text = alt.Text('sales:Q', format = '$,.0f')
              )
             )

# Both boundary rules in a single layer; 27..53 px is roughly the extent of the
# 25 px box centred in the 80 px high chart.
split_rules = (alt
               .Chart(pd.DataFrame({'x': [first_half, second_half]}))
               .mark_rule(
                   color = 'black',
                   strokeWidth = 0.7
               )
               .encode(
                   x = 'x:Q',
                   y = alt.value(27),
                   y2 = alt.value(53)
               )
              )

chart = (box + points + annotation_left + annotation_right + split_rules).properties(
    title = 'Sales by Sub-category box plot',
    width = 620,
    height = 80
)

chart

The above plot shows how the sorted dataset is evenly split into two groups with the same number of elements. In this case, there are different ways to calculate the median, linear interpolation being the most common one. The general equation for linear interpolation between two points is:

$$
y = y_{1}+(x-x_{1}) \frac{y_{2}-y_{1}}{x_{2}-x_{1}}
$$

Where:
- $x_{1}$ and $y_{1}$ are the first coordinates
- $x_{2}$ and $y_{2}$ are the second coordinates
- $x$ is the point to perform the interpolation
- $y$ is the interpolated value

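In our case, the median is the midpoint between the upper bound of the first half and the lower bound of the second half after sorting the dataset. Interpolating exactly halfway between the two points reduces the formula above to their arithmetic mean, $(y_{1}+y_{2})/2$.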

Let's now write down a function to calculate the median by considering the case in which the size of the dataset is an even or an odd number of elements. Summing up:
1. Sort all elements in the dataset from the smallest to largest
2. Calculate the position of the middle value
3. If the size of the dataset is an odd number, return the middle value, otherwise average the two middle values 

In [None]:
def dataset_median(dataset):
    """
    Compute the median of a dataset.

    The observations are sorted; an odd-sized dataset yields its middle
    value, an even-sized one the arithmetic mean of its two middle values
    (i.e. the linear interpolation halfway between them).

    Parameters:
    -----------
    dataset : array-like
        Observations (a pandas Series, or anything `pd.Series` accepts)

    Returns:
    --------
    float
        The median value; NaN for an empty dataset, matching the result of
        `pandas.Series.median` on an empty Series
    """
    ordered = pd.Series(dataset).sort_values()
    count = len(ordered)

    if count == 0:
        return float("nan")

    # 0-based position of the middle value (odd size) or of the upper of the
    # two middle values (even size)
    midpoint = count // 2

    # the parity of the dataset SIZE decides the branch, not the parity of size/2
    if count % 2 == 1:
        return ordered.iloc[midpoint]
    return (ordered.iloc[midpoint - 1] + ordered.iloc[midpoint]) / 2

In [None]:
# Cross-check the hand-written median against pandas and numpy.

# -- odd-sized dataset --
dataset = sales['sales']
diy_median = dataset_median(dataset)
pandas_median = dataset.median()
numpy_median = np.median(dataset)

print("size of the dataset = {}".format(len(dataset)))
print("median using the diy function: {}".format(diy_median))
print("median using pandas median   : {}".format(pandas_median))
print("median using numpy median   : {}".format(numpy_median))
# name of the Sub-Category whose sales equal the median
median_rows = sales[sales['sales'] == dataset_median(sales['sales'])]
print("Sub-Category: {}".format(median_rows['Sub-Category'].values[0]))

# -- even-sized dataset --
dataset = sales_even['sales']
diy_median = dataset_median(dataset)
pandas_median = dataset.median()
numpy_median = np.median(dataset)

print("\nsize of the dataset = {}".format(len(dataset)))
print("median using the diy function: {}".format(diy_median))
print("median using pandas median   : {}".format(pandas_median))
print("median using numpy median   : {}".format(numpy_median))

In [None]:
# Boundary observations of the two halves and the interpolated median between them.
half_size = int(sales_even.shape[0]/2)
first_half = sales_even.iloc[half_size - 1]['sales']
second_half = sales_even.iloc[half_size]['sales']

median = dataset_median(sales_even['sales'])

box = alt.Chart(sales_even).mark_boxplot(
    size = 25,
    opacity = 0.9,
    color = 'silver'
).encode(
    x = alt.X('sales:Q'),
)

# Lower half in orange, upper half in blue.
half_colour = alt.condition(
    alt.datum.sales > first_half,
    alt.value("steelblue"),
    alt.value("orange")
)
points = alt.Chart(sales_even).mark_point(
    size = 50,
    filled = True,
    stroke = 'black',
    strokeWidth = 1
).encode(
    x = alt.X('sales:Q'),
    color = half_colour,
    tooltip = ['Sub-Category:N', 'sales:Q']
)

# Dollar label placed above the median position.
annotation = alt.Chart(pd.DataFrame({'median':[median]})).mark_text(
    align = 'center',
    baseline = 'top',
    fontSize = 12,
    dx = 0,
    dy = -25
).encode(
    x = 'median',
    text = alt.Text('median', format = "$,.0f")
)

# Thin rule at the median, limited to a fixed pixel span.
point = median
vertical_line_median = alt.Chart(pd.DataFrame({'x': [point]})).mark_rule(
    color = 'black',
    strokeWidth = 0.7
).encode(
    x = 'x:Q',
    y = alt.value(27),
    y2 = alt.value(53)
)

chart = (box + points + vertical_line_median + annotation).properties(
    title = 'Sales by Sub-category box plot',
    width = 620,
    height = 80
)

chart

## Quartile

In statistics, ***quantiles*** are particular points dividing a sample into equally sized, adjacent subgroups. As an example, the median is a quantile: exactly half of the data is lower than the median and half of the data is above it. The median is also called the 2<sup>nd</sup> quartile.

***Quartiles*** divide the distribution into four equal parts.

In our examples, the `altair` boxplot reports the `Q1` and `Q3` parameters, i.e. the first and third quartiles. The first quartile, `Q1`, is the point below which $25\%$ of the data fall. Similarly, the 3<sup>rd</sup> quartile, `Q3`, is the point below which $75\%$ of the data fall. For the original dataset containing 17 points, the boxplot reports:
- `Q1` = 46674.54
- `Q3` = 203412.73

In the second one, without the median point, we have:
- `Q1` = 41784.85
- `Q3` = 204300.93

How does `altair` calculate these points? Let's start with the first quartile, i.e. the first $25\%$ of the data. Breaking the calculation down into steps, we first need to sort the data and then find the point that separates the first $25\%$ of the elements of the sorted array from the rest.


## Whiskers and IQR

In [None]:
# Preview the first rows of the current `boxplot_data` aggregate.
boxplot_data.head()

In [None]:
# Project helper from src.funct; called here with the frame, the label column and the measure.
plot_boxplot_with_points(boxplot_data, 'Sub-Category', 'discount')

In [None]:
# First and third quartile of the discount column; their difference is the inter-quartile range.
discount_quartiles = np.quantile(boxplot_data['discount'], [0.25, 0.75])
q25 = discount_quartiles[0]
q75 = discount_quartiles[1]
iqr = q75 - q25
iqr

In [None]:
# Show Q1, then the value 1.5 * IQR below it
# (presumably the lower whisker limit discussed in this section — confirm).
print(q25)
q25-1.5*iqr

In [None]:
# Value 1.5 * IQR above Q3 (presumably the upper whisker limit — confirm).
q75+1.5*iqr

In [None]:
# Display the discount column that the quartiles above were computed from.
boxplot_data['discount']

In [None]:
# Interpolated median of the even-sized dataset; used as the colour threshold below.
median_value = dataset_median(sales_even['sales'])

box = (alt
       .Chart(sales_even)
       .mark_boxplot(size = 25,
                     opacity = 0.9,
                     color = 'silver')
       .encode(
           x = alt.X('sales:Q'),
       )
      )

# Blue above the median, orange at or below it.
points = (alt
          .Chart(sales_even)
          .mark_point(size = 50,
                      filled = True,
                      opacity = 1.0,
                      stroke = 'black',
                      strokeWidth = 1)
          .encode(
              x = alt.X('sales:Q'),
              color = alt.condition(
                  alt.datum.sales > median_value,
                  alt.value("steelblue"),
                  alt.value("orange")
              ),
              tooltip = ['Sub-Category:N', 'sales:Q']
          )
         )

chart = (box + points).properties(
    title = 'Sales by Sub-category box plot',
    width = 620,
    height = 80
)

chart

In [None]:
# Display the column-reduced Superstore frame (consider `.head()` to keep the output short).
data

In [None]:
# Same reducers as before, now computed per (Region, Sub-Category) pair.
# NOTE(review): `boxplot_data` is reassigned to a Sub-Category-only aggregate a few cells below.
region_subcategory_aggregations = dict(
    sales = ('Sales', sum_and_round),
    quantity = ('Quantity', 'sum'),
    discount = ('Discount', mean_and_round),
    profit = ('Profit', sum_and_round),
)

boxplot_data = (
    data
    .groupby(['Region', 'Sub-Category'])
    .agg(**region_subcategory_aggregations)
    .reset_index()
)

In [None]:
# Preview the aggregate built in the previous cell.
boxplot_data.head()

In [None]:
# Rebuild the Sub-Category-level aggregate (this replaces whatever `boxplot_data` held before).
subcategory_aggregations = dict(
    sales = ('Sales', sum_and_round),
    quantity = ('Quantity', 'sum'),
    discount = ('Discount', mean_and_round),
    profit = ('Profit', sum_and_round),
)

boxplot_data = (
    data
    .groupby(['Sub-Category'])
    .agg(**subcategory_aggregations)
    .reset_index()
)

boxplot_data.head()

In [None]:
# Project helper from src.funct, applied to the `profit` column of the current aggregate.
plot_boxplot_with_points(boxplot_data, 'Sub-Category', 'profit')

In [None]:
# Display the whole aggregate table.
boxplot_data

In [None]:
# When the notebook runs top to bottom, `boxplot_data` has already been
# re-aggregated by Sub-Category alone and no longer has a `Region` column,
# so the per-region profit is aggregated here from `data`.
profit_by_region = (data
 .groupby(['Region', 'Sub-Category'])
 .agg(
     profit = ('Profit', sum_and_round)
 )
).reset_index()

# One box per Region, each summarising the Sub-Category profits of that Region.
(alt
 .Chart(profit_by_region)
 .mark_boxplot(size = 10)
 .encode(
     x = alt.X('profit:Q'),
     y = 'Region:N',
     color = 'Region:N'
 )
)

In [None]:
# When the notebook runs top to bottom, `boxplot_data` has already been
# re-aggregated by Sub-Category alone and no longer has a `Region` column,
# so the per-region totals are aggregated here from `data`.
region_totals = (data
 .groupby(['Region', 'Sub-Category'])
 .agg(
     sales = ('Sales', sum_and_round),
     profit = ('Profit', sum_and_round)
 )
).reset_index()

# One dot per (Region, Sub-Category) pair, positioned by profit.
points = (alt
          .Chart(region_totals)
          .mark_point(size = 50, filled = True, opacity = 0.8, color = '#953f0a')
          .encode(
              x = alt.X('profit:Q'),
              y = 'Region:N',
              tooltip = ['Sub-Category:N', 'profit:Q', 'sales:Q']
          )
         )

# One box per Region underneath the dots.
box = (alt
       .Chart(region_totals)
       .mark_boxplot(size = 25)
       .encode(
           x = alt.X('profit:Q'),
           y = 'Region:N'
       )
      )

# The title names the plotted measure (profit), not sales.
chart = (box + points).properties(
    title = 'Profit by Sub-category and Region box plot',
    width = 620,
    height = 180
)

chart

## Median

- Definition

> The ***median*** of a data set is its middle number when sorted in an ascending or descending order. It's the point above and below which $50\%$ of the observed data falls so it represents the midpoint of the data. 

The median is also called the $1/2$ quantile: it is the cut point dividing the dataset into its first and second half. For discrete sets of data points, we can have one median or multiple median points. Consider the sum of sales for each Sub-Category of the Super Store dataset.

## Quartile

To describe how to compute quartiles, let's consider two datasets, one with an even and one with an odd number of elements.

In [None]:
# Profit per Sub-Category, ordered from lowest to highest.
# NOTE(review): despite its name, this frame holds `profit`, not sales.
profit_by_subcategory = (
    data
    .groupby('Sub-Category', observed=True)
    .agg(profit = ('Profit', sum_and_round))
)
df_subcategory_sales = profit_by_subcategory.sort_values('profit').reset_index()

In [None]:
# Number of rows (one per Sub-Category) in the profit aggregate.
df_subcategory_sales.shape[0]

In [None]:
# One dot per Sub-Category, positioned by profit.
points = (alt
          .Chart(df_subcategory_sales)
          .mark_point(size = 50, filled = True, opacity = 0.8, color = '#953f0a')
          .encode(
              x = alt.X('profit:Q'),
              tooltip = ['Sub-Category:N', 'profit:Q']
          )
         )

box = (alt
       .Chart(df_subcategory_sales)
       .mark_boxplot(size = 25)
       .encode(
           x = alt.X('profit:Q'),
       )
      )

# The title names the plotted measure (profit), not sales.
chart = (box + points).properties(
    title = 'Profit by Sub-category box plot',
    width = 620,
    height = 180
)

chart

In [None]:
# Fractional 0-based position p * (n - 1) in the sorted data; the same quantity
# is computed as `n` inside `mquintile` further down.
p = 0.25
p*(df_subcategory_sales.shape[0]-1)

In [None]:
# numpy's value for the p-quantile of the profit column, for comparison.
np.quantile(df_subcategory_sales['profit'], p)

In [None]:
# Five uniform draws scaled to [0, 100). A seeded generator keeps the values,
# and every number derived from them below, identical across kernel restarts.
# NOTE(review): from here on `data` is this array, no longer the Superstore frame.
rng = np.random.default_rng(42)
data = 100 * rng.uniform(size = 5)
test_df = pd.DataFrame(
    {
        'x' : data
    }
)

In [None]:
# Shared x encoding for both layers of the toy sample.
x_axis = alt.X('x:Q')

# Raw observations as dots.
points = alt.Chart(test_df).mark_point(
    size = 50,
    filled = True,
    opacity = 0.8,
    color = '#953f0a',
).encode(x = x_axis, tooltip = ['x:Q'])

# Box summarising the same five values.
box = alt.Chart(test_df).mark_boxplot(size = 25).encode(x = x_axis)

chart = (box + points).properties(
    title = 'Boxplot',
    width = 620,
    height = 180
)

chart

In [None]:
# First quartile of `data` according to numpy.
np.quantile(data, 0.25)

In [None]:
# Share of observations strictly below the first quartile.
len(data[data < np.quantile(data, 0.25)])/len(data)

In [None]:
# Scratch check: 4 out of 17 (presumably the odd-sized dataset — confirm).
4/17

In [None]:
# Scratch check: 4 out of 16 (presumably the even-sized dataset — confirm).
4/16

In [None]:
# Display the current contents of `data`.
data

In [None]:
def mquintile(data, p, verbose=True):
    """
    Compute the p-quantile of `data` by linear interpolation between the two
    sorted observations that surround the fractional position p * (n - 1).

    Parameters:
    -----------
    data : array-like
        Observations (numpy array, list or pandas Series)
    p : float
        Fraction between 0 and 1 (inclusive) of the samples lying below the
        requested quantile
    verbose : bool, default True
        Print the intermediate values of the computation

    Returns:
    --------
    number
        The sample at the computed position when that position is a whole
        number, otherwise the interpolated value

    Raises:
    -------
    ValueError
        If `p` is outside [0, 1] or `data` is empty
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1, got {}".format(p))

    samples = np.sort(data)
    if len(samples) == 0:
        raise ValueError("data must contain at least one observation")

    # n is the (possibly fractional) 0-based position of the quantile in the sorted array
    n = p*(len(samples)-1)
    pos = int(n)
    # fractional part of the position: how far the quantile lies between samples[pos]
    # and samples[pos+1]
    f = n-pos

    if f == 0:
        # the position is a whole number (even or odd): the quantile is that sample.
        # This also covers p == 1, where samples[pos+1] would be out of range.
        if verbose:
            print(samples[pos])
        return(samples[pos])

    # otherwise interpolate between the two adjacent samples
    lower_sample = samples[pos]
    upper_sample = samples[pos+1]
    quantile = lower_sample+(f * (upper_sample-lower_sample))
    if verbose:
        print("lower sample = {}, upper sample {}".format(lower_sample, upper_sample))
        print("fraction = {}".format(f))
        print("quantile value = {}".format(quantile))
    return(quantile)

In [None]:
# 0.5-quantile (the median) of `data`, computed with the hand-written function.
mquintile(data, 0.5)

In [None]:
# Display `data` again to compare against the value returned above.
data

In [None]:
# Create sample data (seeded so the figure is reproducible)
np.random.seed(42)
normal_sample = pd.DataFrame({
    'category': ['A'] * 50,
    'value': np.random.normal(100, 15, 50)
})

# A Vega expression is evaluated one row at a time and offers no aggregate such
# as `median(...)`, so the threshold is computed in pandas and passed in as a constant.
value_median = normal_sample['value'].median()

# Create the base boxplot
boxplot = alt.Chart(normal_sample).mark_boxplot(
    size=40,
    opacity=0.3,
    color='lightgray'
).encode(
    x='category:N',
    y='value:Q'
)

# Create points with different colors based on position relative to median
points = alt.Chart(normal_sample).mark_circle(
    size=60,
    opacity=0.6
).encode(
    x='category:N',
    y='value:Q',
    color=alt.condition(
        alt.datum.value >= value_median,
        alt.value('red'),    # color if true
        alt.value('blue')    # color if false
    )
)

# Combine the two charts
chart = (boxplot + points).properties(
    width=200,
    height=300,
    title='Boxplot with Points Colored by Median Position'
)

# Configure the chart theme
chart = chart.configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_title(
    fontSize=16
)

chart

In [None]:
# NOTE(review): this whole cell is disabled draft code — two text layers over
# `sales_even`, filtered to the rows below / above `median`. Nothing here runs.
# annotation1 = (alt
#               .Chart(sales_even)
#               .mark_text(
#                   align='center',
#                   baseline='top',
#                   fontSize = 12,
#                   dx = 0,
#                   dy = -30
#               ).encode(
#                   x='sales',
#                   text='sales'
#               ).transform_filter(
#                   (datum.sales < median)
#               )
#              )

# annotation2 = (alt
#               .Chart(sales_even)
#               .mark_text(
#                   align='center',
#                   baseline='top',
#                   fontSize = 12,
#                   dx = 0,
#                   dy = 30
#               ).encode(
#                   x='sales',
#                   text='sales'
#               ).transform_filter(
#                   (datum.sales > median)
#               )
#              )


In [None]:
# Sample data
data = pd.DataFrame({
    'category': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
    'value': [1, 2, 3, 4, 5, 5, 6, 7, 8, 9]
})

# Define the position for the vertical line (e.g., at y = 6)
vertical_line_position = 6

# Create the boxplot
boxplot = alt.Chart(data).mark_boxplot().encode(
    x='category:N',
    y='value:Q'
)

# Add the vertical line at a specific y position
vertical_line = alt.Chart(pd.DataFrame({'y': [vertical_line_position]})).mark_rule(
    color='red',
    strokeWidth=2
).encode(
    y='y:Q',
)

# Combine boxplot and vertical line
boxplot_with_line = boxplot + vertical_line
boxplot_with_line