In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt

In [2]:
# Load the Superstore workbook; the path is relative to the notebook's working directory.
data = pd.read_excel('./data/superstore_data.xlsx')
# Print every column's dtype, then preview the first rows as the cell output.
print(data.dtypes)
data.head()

Row ID                    int64
Order ID                 object
Order Date       datetime64[ns]
Ship Date        datetime64[ns]
Ship Mode                object
Customer ID              object
Customer Name            object
Segment                  object
Country                  object
City                     object
State                    object
Postal Code               int64
Region                   object
Product ID               object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [3]:
# Keep only the two grouping keys (Region, Sub-Category) and the four numeric
# measures that the aggregations in this notebook read.
data = data[['Region', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit']]

In [4]:
def sum_and_round(x):
    """Return the sum of ``x``, rounded to 0 decimals when ``x`` is floating point.

    ``x`` must expose ``.dtype`` and ``.sum()`` (e.g. a pandas Series or a
    numpy array). Non-floating input is summed and returned unrounded.
    """
    if not np.issubdtype(x.dtype, np.floating):
        return x.sum()
    return round(x.sum(), 0)

In [5]:
def mean_and_round(x):
    """Return the mean of ``x``, rounded to 2 decimals when ``x`` is floating point.

    ``x`` must expose ``.dtype`` and ``.mean()`` (e.g. a pandas Series or a
    numpy array). For non-floating input the mean is returned unrounded.
    """
    if not np.issubdtype(x.dtype, np.floating):
        return x.mean()
    return round(x.mean(), 2)

In [9]:
# Total sales per sub-category; `sum_and_round` rounds the float totals to
# whole numbers. `reset_index` turns the group key back into a column.
df_subcategory_sales = (
    data
    .groupby('Sub-Category', observed=True)
    .agg(sales=('Sales', sum_and_round))
    .reset_index()
)

In [21]:
# One dot per sub-category placed at its total sales; the tooltip lists the
# sub-category name and the sales value.
points = (
    alt.Chart(df_subcategory_sales)
    .mark_point(size=50, filled=True, opacity=0.8, color='#953f0a')
    .encode(
        x=alt.X('sales:Q'),
        tooltip=['Sub-Category:N', 'sales:Q'],
    )
)

# Box plot of the same totals; only x is encoded, so a single box is drawn.
box = (
    alt.Chart(df_subcategory_sales)
    .mark_boxplot(size=25)
    .encode(x=alt.X('sales:Q'))
)

# Layer the dots over the box and set the title and size of the result.
chart = (box + points).properties(
    title='Sales by Sub-category box plot',
    width=620,
    height=180,
)

chart

## Median

- Definition

> The ***median*** of a data set is its middle number when sorted in an ascending or descending order. It's the point above and below which $50\%$ of the observed data falls so it represents the midpoint of the data. 

The median is also called the $1/2$ quantile: it is the cut point that divides the data set into a lower and an upper half. For a discrete set of data points, we can have one median (odd number of elements) or two middle points (even number of elements), in which case the median is conventionally taken as their average. Consider the sum of sales for each Sub-Category of the Super Store dataset. This dataset contains 17 elements. If we order the data, its 9<sup>th</sup> element is the median of the dataset, as 8 elements (half of the remaining data) are smaller than the median and 8 elements are greater.

In [23]:
df_subcategory_sales.sort_values('sales')['sales'].values[:8]

array([  3024.,  12486.,  16476.,  27119.,  46674.,  78479.,  91705.,
       107532.])

In [24]:
df_subcategory_sales.sort_values('sales').iloc[8]

Sub-Category    Bookcases
sales            114880.0
Name: 4, dtype: object

In [26]:
df_subcategory_sales.sort_values('sales')['sales'].values[9:]

array([149528., 167380., 189239., 203413., 206966., 223844., 328449.,
       330007.])

In [None]:
data.columns

In [None]:
# Same column selection as the earlier cell: keep the grouping keys and the
# numeric measures, then preview the first rows.
data = data[['Region', 'Sub-Category', 'Sales', 'Quantity', 'Discount', 'Profit']]
data.head()

In [None]:
def mean_and_round(x):
    """Mean of ``x``; rounded to 2 decimals only when ``x`` has a floating dtype.

    NOTE: this repeats a definition of ``mean_and_round`` that already appears
    earlier in the notebook; the behaviour is the same.
    """
    return round(x.mean(), 2) if np.issubdtype(x.dtype, np.floating) else x.mean()

In [None]:
def sum_and_round(x):
    """Sum of ``x``; rounded to 0 decimals only when ``x`` has a floating dtype.

    NOTE: this repeats a definition of ``sum_and_round`` that already appears
    earlier in the notebook; the behaviour is the same.
    """
    return round(x.sum(), 0) if np.issubdtype(x.dtype, np.floating) else x.sum()

In [None]:
def mean_and_round(x):
    """Average ``x``, keeping 2 decimals for floating-point input.

    Input with a non-floating dtype gets its mean returned as-is.
    NOTE: ``mean_and_round`` is already defined earlier in the notebook with
    the same behaviour; this cell redefines it.
    """
    is_floating = np.issubdtype(x.dtype, np.floating)
    if is_floating:
        return round(x.mean(), 2)
    return x.mean()

In [None]:
# One row per (Region, Sub-Category) pair: rounded sales and profit totals,
# total quantity, and the mean discount rounded to 2 decimals.
boxplot_data = (
    data
    .groupby(['Region', 'Sub-Category'])
    .agg(
        sales=('Sales', sum_and_round),
        quantity=('Quantity', 'sum'),
        discount=('Discount', mean_and_round),
        profit=('Profit', sum_and_round),
    )
    .reset_index()
)

In [None]:
boxplot_data

In [None]:
# Box plot of the per-sub-category profit totals, one box (and colour) per Region.
(
    alt.Chart(boxplot_data)
    .mark_boxplot(size=10)
    .encode(
        x=alt.X('profit:Q'),
        y='Region',
        color='Region',
    )
)

In [18]:
# NOTE: this cell reads `boxplot_data`, which is built in an earlier cell of
# the notebook; that cell must have been run first.

# One dot per (Region, Sub-Category) row, placed at its profit on the row of
# its Region. The tooltip now includes the plotted profit value next to the
# sub-category name and its sales.
points = (
    alt.Chart(boxplot_data)
    .mark_point(size=50, filled=True, opacity=0.8, color='#953f0a')
    .encode(
        x=alt.X('profit:Q'),
        y='Region',
        tooltip=['Sub-Category:N', 'profit:Q', 'sales:Q'],
    )
)

# Box plot of profit, one box per Region.
box = (
    alt.Chart(boxplot_data)
    .mark_boxplot(size=25)
    .encode(
        x=alt.X('profit:Q'),
        y='Region',
    )
)

# Layer the dots over the boxes. The title was 'Sales by Sub-category box plot',
# which did not match the encoded fields (profit on x, Region on y).
chart = (box + points).properties(
    title='Profit by Region box plot',
    width=620,
    height=180,
)

chart

NameError: name 'boxplot_data' is not defined

## Median

- Definition

> The ***median*** of a data set is its middle number when sorted in an ascending or descending order. It's the point above and below which $50\%$ of the observed data falls so it represents the midpoint of the data. 

The median is also called the $1/2$ quantile: it is the cut point that divides the data set into a lower and an upper half. For a discrete set of data points, we can have one median or multiple median points. Consider the sum of sales for each Sub-Category of the Super Store dataset.

## Quartile

To describe how to compute quartiles, let's consider two data sets, one with an even and one with an odd number of elements.

In [None]:
# Total profit per sub-category (float totals rounded to whole numbers by
# `sum_and_round`), sorted ascending by profit.
# NOTE: this rebinds `df_subcategory_sales`, which an earlier cell uses for the
# per-sub-category *sales* totals; from here on it holds a `profit` column.
df_subcategory_sales = (
    data
    .groupby('Sub-Category', observed=True)
    .agg(profit=('Profit', sum_and_round))
    .sort_values('profit')
    .reset_index()
)

In [None]:
df_subcategory_sales.shape[0]

In [19]:
# One dot per sub-category placed at its total profit; the tooltip lists the
# sub-category name and the profit value.
points = (
    alt.Chart(df_subcategory_sales)
    .mark_point(size=50, filled=True, opacity=0.8, color='#953f0a')
    .encode(
        x=alt.X('profit:Q'),
        tooltip=['Sub-Category:N', 'profit:Q'],
    )
)

# Box plot of the same profit totals; only x is encoded, so a single box is drawn.
box = (
    alt.Chart(df_subcategory_sales)
    .mark_boxplot(size=25)
    .encode(x=alt.X('profit:Q'))
)

# Layer the dots over the box. The title was 'Sales by Sub-category box plot',
# but the encoded field is profit, so the title is corrected.
chart = (box + points).properties(
    title='Profit by Sub-category box plot',
    width=620,
    height=180,
)

chart

In [None]:
# Fraction of the data that should lie below the quantile (0.25 = first quartile).
p = 0.25
# (Possibly fractional) zero-based position of that quantile in the sorted
# data: p * (n - 1), the same position formula `mquintile` uses further down.
p*(df_subcategory_sales.shape[0]-1)

In [None]:
np.quantile(df_subcategory_sales['profit'], p)

In [None]:
# Draw 5 uniform samples in [0, 100) from a seeded generator so that the
# values, and every quantile computed from them below, are identical on each
# Restart & Run All.
# NOTE: this rebinds `data`, which the cells above use for the Superstore
# DataFrame; from here on `data` is a 1-D numpy array.
rng = np.random.default_rng(42)
data = 100*rng.uniform(size = 5)
# Single-column frame wrapping the same samples, for plotting.
test_df = pd.DataFrame(
    {
        'x' : data
    }
)

In [None]:
# One dot per sample of `test_df`, placed at its x value; the tooltip shows x.
points = (
    alt.Chart(test_df)
    .mark_point(size=50, filled=True, opacity=0.8, color='#953f0a')
    .encode(
        x=alt.X('x:Q'),
        tooltip=['x:Q'],
    )
)

# Box plot of the same samples; only x is encoded, so a single box is drawn.
box = (
    alt.Chart(test_df)
    .mark_boxplot(size=25)
    .encode(x=alt.X('x:Q'))
)

# Layer the dots over the box and set the title and size of the result.
chart = (box + points).properties(
    title='Boxplot',
    width=620,
    height=180,
)

chart

In [None]:
np.quantile(data, 0.25)

In [None]:
len(data[data < np.quantile(data, 0.25)])/len(data)

In [None]:
4/17

In [None]:
4/16

In [None]:
data

In [None]:
def mquintile(data, p, verbose=True):
    """
    Compute the p-quantile of `data` by linear interpolation between adjacent
    sorted samples.

    data: np array, list, pandas series is an array of observations (must not be empty)
    p: float between 0 and 1, is the fraction of samples you want to consider
    verbose: bool, when True (default) the intermediate values are printed

    Returns the sample found at position p*(len(data)-1) of the sorted data
    when that position is a whole number; otherwise the value interpolated
    linearly between the two samples adjacent to that position.

    Raises ValueError if `data` is empty or `p` is outside [0, 1].
    """

    samples = np.sort(data)
    if len(samples) == 0:
        raise ValueError("data must contain at least one observation")
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1")

    # n is the (possibly fractional) position of the desired quantile in the sorted array
    n = p*(len(samples)-1)
    # split the position into its whole part and the fraction used for interpolation
    pos = int(n)
    f = n-pos
    if f == 0:
        # The position is a whole number: the quantile is exactly that sample.
        # (The previous test `n % 2 == 0` checked for an *even* position, so an
        # odd whole position fell through to the interpolation branch and read
        # samples[pos+1], which is out of bounds when pos is the last index.)
        if verbose:
            print(samples[pos])
        return samples[pos]

    # The position falls between two samples: interpolate between them.
    # f > 0 guarantees pos < len(samples)-1, so pos+1 is a valid index.
    lower_sample = samples[pos]
    upper_sample = samples[pos+1]
    # Finally, calculate the interpolated point representing the quantile
    quantile = lower_sample+(f * (upper_sample-lower_sample))
    if verbose:
        print("lower sample = {}, upper sample {}".format(lower_sample, upper_sample))
        print("fraction = {}".format(f))
        print("quantile value = {}".format(quantile))
    return quantile

In [None]:
mquintile(data, 0.5)

In [None]:
data