# Ex 5.4 Creating and Using Date Columns:  Years, Quarters and Months
-  Data:  **Sample-Superstore_Cleaned.csv**


- [**Part 1: Average Profit Ratio by Year**](#Part-1:--Average-Profit-Ratio-by-Year)   
- [**Part 2: Quarterly Sales by Region**](#Part-2:-Quarterly-Sales-by-Region)  
- [**Part 3: Total Sales and Profit by Quarter**](#Part-3:-Total-Sales-and-Profit-by-Quarter) 
- [**Part 4: Monthly Sales by Segment**](#Part-4:-Monthly-Sales-by-Segment) 
 


In [88]:
from IPython.display import display, HTML
import pandas as pd
import math

import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from scipy import special

import datetime as ts

#### Read Data file

In [89]:
# Load the cleaned Superstore CSV into a pandas DataFrame and preview the first two rows
csv_path = 'Data/Sample-Superstore_Cleaned.csv'
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0,Profit Ratio,Category,City,Country,Customer Name,Discount,Number of Records,Order Date,Order ID,Postal Code,...,Product Name,Profit,Quantity,Region,Sales,Segment,Ship Date,Ship Mode,State,Sub-Category
0,0.16,Furniture,Henderson,United States,Claire Gute,0.0,1,11/8/2018,CA-2018-152156,42420.0,...,Bush Somerset Collection Bookcase,42,2,South,262,Consumer,11/11/2018,Second Class,Kentucky,Bookcases
1,0.3,Furniture,Henderson,United States,Claire Gute,0.0,1,11/8/2018,CA-2018-152156,42420.0,...,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",220,3,South,732,Consumer,11/11/2018,Second Class,Kentucky,Chairs


In [90]:
# Report the size of the loaded frame (rows first, then columns)
row_count, column_count = df.shape
print("Rows in dataframe:  ", row_count)
print("Columns in dataframe:  " , column_count)

Rows in dataframe:   9994
Columns in dataframe:   21


### Data Cleanup (as needed)

In [91]:
# Inspect the dtype pandas assigned to each column on load, to decide which ones need converting
df.dtypes

Profit Ratio         float64
Category              object
City                  object
Country               object
Customer Name         object
Discount             float64
Number of Records      int64
Order Date            object
Order ID              object
Postal Code          float64
Manufacturer          object
Product Name          object
Profit                 int64
Quantity               int64
Region                object
Sales                  int64
Segment               object
Ship Date             object
Ship Mode             object
State                 object
Sub-Category          object
dtype: object

In [92]:
# Parse the two date columns (loaded as object dtype) into datetime values
for date_column in ('Order Date', 'Ship Date'):
    df[date_column] = pd.to_datetime(df[date_column])

# Store Profit and Sales as floats instead of integers
for money_column in ('Profit', 'Sales'):
    df[money_column] = pd.to_numeric(df[money_column]).astype(float)

In [93]:
# Re-check dtypes after the conversions above: the date columns should now be datetime64
# and Profit / Sales should be float64
df.dtypes

Profit Ratio                float64
Category                     object
City                         object
Country                      object
Customer Name                object
Discount                    float64
Number of Records             int64
Order Date           datetime64[ns]
Order ID                     object
Postal Code                 float64
Manufacturer                 object
Product Name                 object
Profit                      float64
Quantity                      int64
Region                       object
Sales                       float64
Segment                      object
Ship Date            datetime64[ns]
Ship Mode                    object
State                        object
Sub-Category                 object
dtype: object

# Part 1:  Average Profit Ratio by Year 
- Create a new column named **order_year** based on the Order Date column 
- Group on order_year and plot Profit Ratio in a Vertical Bar Chart  
  - We want the Average Profit Ratio, not a sum  
  

- **My References**  
  - [**Date Columns**](../0_References/1_Pandas_Reference/DateTime_Columns.ipynb#Create-new-Columns-based-on-a-DateTime-column)

In [94]:
# Derive the calendar year and the year-quarter period from the order date.
# (order_year_quarter is assigned again, identically, at the start of Part 2.)
order_dates = df['Order Date']
df['order_year'] = order_dates.dt.year
df['order_year_quarter'] = order_dates.dt.to_period('Q')

# Show the frame size now that the new columns exist, then preview the data
row_count, column_count = df.shape
print("Rows in dataframe:  ", row_count)
print("Columns in dataframe:  " , column_count)
df.head()

Rows in dataframe:   9994
Columns in dataframe:   23


Unnamed: 0,Profit Ratio,Category,City,Country,Customer Name,Discount,Number of Records,Order Date,Order ID,Postal Code,...,Quantity,Region,Sales,Segment,Ship Date,Ship Mode,State,Sub-Category,order_year,order_year_quarter
0,0.16,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,2,South,262.0,Consumer,2018-11-11,Second Class,Kentucky,Bookcases,2018,2018Q4
1,0.3,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,3,South,732.0,Consumer,2018-11-11,Second Class,Kentucky,Chairs,2018,2018Q4
2,0.47,Office Supplies,Los Angeles,United States,Darrin Van Huff,0.0,1,2018-06-12,CA-2018-138688,90036.0,...,2,West,15.0,Corporate,2018-06-16,Second Class,California,Labels,2018,2018Q2
3,-0.4,Furniture,Fort Lauderdale,United States,Sean O'Donnell,0.45,1,2017-10-11,US-2017-108966,33311.0,...,5,South,958.0,Consumer,2017-10-18,Standard Class,Florida,Tables,2017,2017Q4
4,0.11,Office Supplies,Fort Lauderdale,United States,Sean O'Donnell,0.2,1,2017-10-11,US-2017-108966,33311.0,...,2,South,22.0,Consumer,2017-10-18,Standard Class,Florida,Storage,2017,2017Q4


In [95]:
# List the distinct order years present in the data
df['order_year'].unique()

array([2018, 2017, 2016, 2019], dtype=int64)

### groupby

In [118]:
# Group by the new year column and take the MEAN of each measure
# (Part 1 asks for the average Profit Ratio per year, not a sum).
# The measure columns are selected with a list: the bare tuple form
# ['Sales','Profit','Profit Ratio'] is rejected by current pandas versions.

df_by_year_mean = df.groupby('order_year')[['Sales','Profit','Profit Ratio']].mean()

df_by_year_mean

Unnamed: 0_level_0,Sales,Profit,Profit Ratio
order_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,242.977923,24.846964,0.118394
2017,223.862988,29.308278,0.117745
2018,235.506378,31.615771,0.130008
2019,221.402174,28.209843,0.116187


In [119]:
# Move order_year out of the index and back into an ordinary column for plotting

df_by_year_mean = df_by_year_mean.reset_index()

In [120]:
# Preview the yearly averages now that order_year is a regular column
df_by_year_mean.head(3)

Unnamed: 0,order_year,Sales,Profit,Profit Ratio
0,2016,242.977923,24.846964,0.118394
1,2017,223.862988,29.308278,0.117745
2,2018,235.506378,31.615771,0.130008


In [123]:
# Vertical bar chart: one bar per order year, bar height = average Profit Ratio
fig = px.bar(
    df_by_year_mean,
    x='order_year',
    y='Profit Ratio',
    text='Profit Ratio',
    title='Part 1: Profit Ratio by Year',
    template='presentation',
)

# X axis: one tick per year, no axis title, no vertical grid lines
fig.update_xaxes(title_text='', dtick=1, showgrid=False)

# Y axis: ticks every 2 percentage points starting at 0, shown as whole percents
fig.update_yaxes(tick0=0, dtick=0.02, tickformat=',.0%')

# Bar labels: the Profit Ratio value as a percent with one decimal
fig.update_traces(texttemplate='%{text:.1%}', textposition='auto')

fig.show()

# Part 2: Quarterly Sales by Region   
- Create a new column named **order_year_quarter** based on the Order Date column 
  - An example of what the values in that new column look like is:  *2016Q1*  
- Group on order_year_quarter and Region
- Plot Sales (for each of the Regions) in a Multi-Line Chart  




- **My References**  
  - [**Date Columns**](../0_References/1_Pandas_Reference/DateTime_Columns.ipynb#Create-new-Columns-based-on-a-DateTime-column)

In [100]:
# Add order_year_quarter: the order date expressed as a quarterly period (e.g. 2016Q1)
order_dates = df['Order Date']
df['order_year_quarter'] = order_dates.dt.to_period('Q')

# Show the frame size, then preview the first two rows
row_count, column_count = df.shape
print("Rows in dataframe:  ", row_count)
print("Columns in dataframe:  " , column_count)
df.head(2)

Rows in dataframe:   9994
Columns in dataframe:   23


Unnamed: 0,Profit Ratio,Category,City,Country,Customer Name,Discount,Number of Records,Order Date,Order ID,Postal Code,...,Quantity,Region,Sales,Segment,Ship Date,Ship Mode,State,Sub-Category,order_year,order_year_quarter
0,0.16,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,2,South,262.0,Consumer,2018-11-11,Second Class,Kentucky,Bookcases,2018,2018Q4
1,0.3,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,3,South,732.0,Consumer,2018-11-11,Second Class,Kentucky,Chairs,2018,2018Q4


### groupby

In [101]:
# Group by the new order_year_quarter column and Region, totalling each measure.
# The measure columns are selected with a list: the bare tuple form
# ['Sales','Profit','Profit Ratio'] is rejected by current pandas versions.
# NOTE(review): the 'Profit Ratio' column here is a sum of per-row ratios, not a ratio
# of the totals; only 'Sales' is plotted from this frame further down.

df_by_quarter_sum = df.groupby(['order_year_quarter','Region'])[['Sales','Profit','Profit Ratio']].sum()

df_by_quarter_sum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Profit,Profit Ratio
order_year_quarter,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016Q1,Central,8602.0,138.0,-3.79
2016Q1,East,6580.0,-788.0,5.89
2016Q1,South,44260.0,2667.0,15.91
2016Q1,West,15003.0,1783.0,15.17
2016Q2,Central,17409.0,968.0,-1.17


In [102]:
# Turn the order_year_quarter / Region index levels back into ordinary columns
df_by_quarter_sum = df_by_quarter_sum.reset_index()
df_by_quarter_sum.head(2)

Unnamed: 0,order_year_quarter,Region,Sales,Profit,Profit Ratio
0,2016Q1,Central,8602.0,138.0,-3.79
1,2016Q1,East,6580.0,-788.0,5.89


In [103]:
# Check column dtypes before plotting (order_year_quarter is converted to string in the next cell)
df_by_quarter_sum.dtypes

order_year_quarter     object
Region                 object
Sales                 float64
Profit                float64
Profit Ratio          float64
dtype: object

In [104]:
# Plotly handles the quarter labels better as plain strings for this chart,
# so replace the column with its string form
quarter_labels = df_by_quarter_sum['order_year_quarter'].astype(str)
df_by_quarter_sum['order_year_quarter'] = quarter_labels

In [105]:
# Multi-line chart: one line per Region, quarterly Sales totals on the y axis
fig = px.line(
    df_by_quarter_sum,
    x='order_year_quarter',
    y='Sales',
    color='Region',
    title='Part 2: Quarterly Sales by Region',
    template='presentation',
)

fig.show()


# Part 3: Total Sales and Profit by Quarter

In [106]:
# Group by the order_year_quarter column and total every numeric column.
# numeric_only=True restricts the aggregation to numeric columns; without it, current
# pandas versions also try to sum the datetime and text columns of df and the call fails.

df_P3 = df.groupby('order_year_quarter').sum(numeric_only=True)

In [107]:
# Bring order_year_quarter back from the index into an ordinary column
df_P3 = df_P3.reset_index()
df_P3.head(2)

Unnamed: 0,order_year_quarter,Profit Ratio,Discount,Number of Records,Postal Code,Profit,Quantity,Sales,order_year
0,2016Q1,33.18,44.4,282,15706610.0,3800.0,1028,74445.0,568512
1,2016Q2,61.04,57.02,392,21049066.0,11201.0,1523,86540.0,790272


In [108]:
# Check column dtypes before plotting (order_year_quarter is converted to string in the next cell)
df_P3.dtypes

order_year_quarter     object
Profit Ratio          float64
Discount              float64
Number of Records       int64
Postal Code           float64
Profit                float64
Quantity                int64
Sales                 float64
order_year              int64
dtype: object

In [109]:
# Plotly handles the quarter labels better as plain strings for this chart,
# so replace the column with its string form
quarter_labels = df_P3['order_year_quarter'].astype(str)
df_P3['order_year_quarter'] = quarter_labels

In [110]:
# Preview the quarterly totals after the string conversion
df_P3.head(3)

Unnamed: 0,order_year_quarter,Profit Ratio,Discount,Number of Records,Postal Code,Profit,Quantity,Sales,order_year
0,2016Q1,33.18,44.4,282,15706610.0,3800.0,1028,74445.0,568512
1,2016Q2,61.04,57.02,392,21049066.0,11201.0,1523,86540.0,790272
2,2016Q3,72.2,87.47,564,32196388.0,12793.0,2159,143643.0,1137024


### Melt 

In [111]:
# Reshape wide -> long: one row per (quarter, measure), so Sales and Profit
# can be drawn as two separate lines from a single value column
measure_columns = ['Sales', 'Profit']
df_P3_melt = df_P3.melt(
    id_vars=['order_year_quarter'],
    value_vars=measure_columns,
    var_name='Measure Type',
    value_name='Measure Value',
)
df_P3_melt.head(10)

Unnamed: 0,order_year_quarter,Measure Type,Measure Value
0,2016Q1,Sales,74445.0
1,2016Q2,Sales,86540.0
2,2016Q3,Sales,143643.0
3,2016Q4,Sales,179627.0
4,2017Q1,Sales,68858.0
5,2017Q2,Sales,89135.0
6,2017Q3,Sales,130265.0
7,2017Q4,Sales,182302.0
8,2018Q1,Sales,93243.0
9,2018Q2,Sales,136102.0


### Plot

In [112]:
# Line chart: one line per measure (Sales, Profit), quarterly totals on the y axis

fig = px.line(
    df_P3_melt,
    x='order_year_quarter',
    y='Measure Value',
    color='Measure Type',
    title='Part 3: Total Sales and Profit by Quarter',
    template='presentation',
)

fig.show()

# Part 4: Monthly Sales by Segment    


- **My References**  
  - [**Date Columns**](../0_References/1_Pandas_Reference/DateTime_Columns.ipynb#Create-new-Columns-based-on-a-DateTime-column)

In [113]:
# Add the month number (used later for sorting) and the abbreviated month name (used as the label)
order_dates = df['Order Date']
df['order_month'] = order_dates.dt.month
df['order_month_name'] = order_dates.dt.strftime('%b')

df.head()

Unnamed: 0,Profit Ratio,Category,City,Country,Customer Name,Discount,Number of Records,Order Date,Order ID,Postal Code,...,Sales,Segment,Ship Date,Ship Mode,State,Sub-Category,order_year,order_year_quarter,order_month,order_month_name
0,0.16,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,262.0,Consumer,2018-11-11,Second Class,Kentucky,Bookcases,2018,2018Q4,11,Nov
1,0.3,Furniture,Henderson,United States,Claire Gute,0.0,1,2018-11-08,CA-2018-152156,42420.0,...,732.0,Consumer,2018-11-11,Second Class,Kentucky,Chairs,2018,2018Q4,11,Nov
2,0.47,Office Supplies,Los Angeles,United States,Darrin Van Huff,0.0,1,2018-06-12,CA-2018-138688,90036.0,...,15.0,Corporate,2018-06-16,Second Class,California,Labels,2018,2018Q2,6,Jun
3,-0.4,Furniture,Fort Lauderdale,United States,Sean O'Donnell,0.45,1,2017-10-11,US-2017-108966,33311.0,...,958.0,Consumer,2017-10-18,Standard Class,Florida,Tables,2017,2017Q4,10,Oct
4,0.11,Office Supplies,Fort Lauderdale,United States,Sean O'Donnell,0.2,1,2017-10-11,US-2017-108966,33311.0,...,22.0,Consumer,2017-10-18,Standard Class,Florida,Storage,2017,2017Q4,10,Oct


### Groupby

In [114]:
# Total Sales and Profit per month and Segment.
# order_month is grouped alongside order_month_name so the month number is kept for sorting.
# The measure columns are selected with a list: the bare tuple form ['Sales','Profit']
# is rejected by current pandas versions.
df_by_month_sum = df.groupby(['order_month_name','order_month','Segment'])[['Sales','Profit']].sum()

df_by_month_sum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sales,Profit
order_month_name,order_month,Segment,Unnamed: 3_level_1,Unnamed: 4_level_1
Apr,4,Consumer,54855.0,3857.0
Apr,4,Corporate,49233.0,7273.0
Apr,4,Home Office,33687.0,460.0
Aug,8,Consumer,82328.0,10263.0
Aug,8,Corporate,57529.0,9509.0


In [115]:
# Turn the three grouping levels back into ordinary columns
df_by_month_sum = df_by_month_sum.reset_index()
df_by_month_sum.head()

Unnamed: 0,order_month_name,order_month,Segment,Sales,Profit
0,Apr,4,Consumer,54855.0,3857.0
1,Apr,4,Corporate,49233.0,7273.0
2,Apr,4,Home Office,33687.0,460.0
3,Aug,8,Consumer,82328.0,10263.0
4,Aug,8,Corporate,57529.0,9509.0


In [116]:
# Order rows by calendar month number; sorting on the month name would be alphabetical (Apr, Aug, ...)
df_by_month_sum = df_by_month_sum.sort_values(by='order_month')
df_by_month_sum.head(2)

Unnamed: 0,order_month_name,order_month,Segment,Sales,Profit
14,Jan,1,Home Office,17698.0,4603.0
13,Jan,1,Corporate,26793.0,1351.0


In [117]:
# Grouped bar chart: months on the x axis, one bar per Segment within each month
fig = px.bar(
    df_by_month_sum,
    x='order_month_name',
    y='Sales',
    color='Segment',
    barmode='group',
    title='Part 4: Monthly Sales by Segment',
    template='presentation',
)

fig.show()