In [1]:
import pandas as pd
# Open the Excel workbook once; later cells read further sheets from this same handle.
RAW_DATA_PATH = '/kaggle/input/coffee-bean-sales-raw-dataset/Raw Data.xlsx'
workbook = pd.ExcelFile(RAW_DATA_PATH)

# Product catalogue from the 'products' sheet.
product_df = pd.read_excel(workbook, sheet_name='products')
product_df.head()

Unnamed: 0,Product ID,Coffee Type,Roast Type,Size,Unit Price,Price per 100g,Profit
0,A-L-0.2,Ara,L,0.2,3.885,1.9425,0.34965
1,A-L-0.5,Ara,L,0.5,7.77,1.554,0.6993
2,A-L-1,Ara,L,1.0,12.95,1.295,1.1655
3,A-L-2.5,Ara,L,2.5,29.785,1.1914,2.68065
4,A-M-0.2,Ara,M,0.2,3.375,1.6875,0.30375


In [2]:
# Print column dtypes and non-null counts for the products table.
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Product ID      48 non-null     object 
 1   Coffee Type     48 non-null     object 
 2   Roast Type      48 non-null     object 
 3   Size            48 non-null     float64
 4   Unit Price      48 non-null     float64
 5   Price per 100g  48 non-null     float64
 6   Profit          48 non-null     float64
dtypes: float64(4), object(3)
memory usage: 2.8+ KB


In [3]:
from plotly.express import scatter
# Price per 100g against unit price, one point per product, colored by Size.
# Size is an ordered quantity, so it gets a sequential color scale: the cyclical
# 'hsv' scale starts and ends on red, which makes the smallest and largest sizes
# almost indistinguishable.
scatter(data_frame=product_df, x='Unit Price', y='Price per 100g', color='Size', hover_name='Product ID',
        color_continuous_scale='viridis', hover_data=['Coffee Type', 'Roast Type'],
        title='Price per 100g vs. unit price, colored by size')

In [4]:
# Per-unit profit against unit price, one point per product, colored by Size.
# Size is an ordered quantity, so it gets a sequential color scale: the cyclical
# 'hsv' scale starts and ends on red, which makes the smallest and largest sizes
# almost indistinguishable.
scatter(data_frame=product_df, x='Unit Price', y='Profit', color='Size', hover_name='Product ID',
        color_continuous_scale='viridis', hover_data=['Coffee Type', 'Roast Type'],
        title='Profit vs. unit price, colored by size')

In [5]:
from plotly.express import histogram
# Distribution of per-product profit, split by Size.
profit_by_size_fig = histogram(product_df, x='Profit', color='Size')
profit_by_size_fig

Larger sizes earn more profit per unit sold; is that surprising?

In [6]:
# Load only the order columns this analysis uses, parsing the order date as a datetime.
ORDER_COLUMNS = ['Order ID', 'Order Date', 'Customer ID', 'Product ID', 'Quantity']
order_df = pd.read_excel(io=workbook, sheet_name='orders', usecols=ORDER_COLUMNS, parse_dates=['Order Date'])

# Attach product attributes to every order row and compute the profit of each row.
# validate='many_to_one' makes pandas raise a MergeError if 'Product ID' is ever
# duplicated in product_df; without it, a duplicated key would silently multiply
# order rows and inflate total_profit.
# NOTE: how='inner' drops any order whose 'Product ID' is missing from product_df.
df = (
    order_df
    .merge(right=product_df, on='Product ID', how='inner', validate='many_to_one')
    .assign(total_profit=lambda merged: merged['Quantity'] * merged['Profit'])
)
df.head()

Unnamed: 0,Order ID,Order Date,Customer ID,Product ID,Quantity,Coffee Type,Roast Type,Size,Unit Price,Price per 100g,Profit,total_profit
0,QEV-37451-860,2019-09-05,17670-51384-MA,R-M-1,2,Rob,M,1.0,9.95,0.995,0.597,1.194
1,SZW-48378-399,2022-07-02,34136-36674-OM,R-M-1,5,Rob,M,1.0,9.95,0.995,0.597,2.985
2,OFX-99147-470,2021-11-24,49860-68865-AB,R-M-1,6,Rob,M,1.0,9.95,0.995,0.597,3.582
3,JHW-74554-805,2019-11-26,14103-58987-ZU,R-M-1,6,Rob,M,1.0,9.95,0.995,0.597,3.582
4,JDS-33440-914,2021-10-28,58511-10548-ZU,R-M-1,3,Rob,M,1.0,9.95,0.995,0.597,1.791


In [7]:
# Print column dtypes and non-null counts for the merged orders/products frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Order ID        1000 non-null   object        
 1   Order Date      1000 non-null   datetime64[ns]
 2   Customer ID     1000 non-null   object        
 3   Product ID      1000 non-null   object        
 4   Quantity        1000 non-null   int64         
 5   Coffee Type     1000 non-null   object        
 6   Roast Type      1000 non-null   object        
 7   Size            1000 non-null   float64       
 8   Unit Price      1000 non-null   float64       
 9   Price per 100g  1000 non-null   float64       
 10  Profit          1000 non-null   float64       
 11  total_profit    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(5)
memory usage: 93.9+ KB


In [8]:
# Count the distinct values in each column of the merged orders/products frame.
df.nunique()

Order ID          957
Order Date        689
Customer ID       913
Product ID         48
Quantity            6
Coffee Type         4
Roast Type          3
Size                4
Unit Price         40
Price per 100g     40
Profit             48
total_profit      236
dtype: int64

In [9]:
# Read only the customer attributes used below from the 'customers' sheet.
CUSTOMER_COLUMNS = ['Customer ID', 'Country', 'Loyalty Card']
customer_df = pd.read_excel(workbook, sheet_name='customers', usecols=CUSTOMER_COLUMNS)
customer_df.head()

Unnamed: 0,Customer ID,Country,Loyalty Card
0,17670-51384-MA,United States,Yes
1,73342-18763-UW,Ireland,No
2,21125-22134-PX,United States,Yes
3,71253-00052-RN,United States,Yes
4,23806-46781-OU,Ireland,No


In [10]:
# Sum total_profit per customer, largest first, with 'Customer ID' restored as a column.
profit_by_customer = df[['Customer ID', 'total_profit']].groupby(by=['Customer ID']).sum()
profit_by_customer = profit_by_customer.sort_values(by='total_profit', ascending=False).reset_index()

# Bring in each customer's country and loyalty-card flag.
customer_profit_df = profit_by_customer.merge(right=customer_df, on='Customer ID', how='inner')

# One histogram of per-customer profit for each customer attribute.
for attribute in ('Country', 'Loyalty Card'):
    attribute_fig = histogram(data_frame=customer_profit_df, x='total_profit', color=attribute)
    attribute_fig.show()

In [11]:
# Mean and median of per-customer total profit, shown as a (mean, median) tuple.
profit_per_customer = customer_profit_df['total_profit']
(profit_per_customer.mean(), profit_per_customer.median())

(4.950950054764512, 3.0374999999999996)

Our per-customer total profit distribution skews right (the mean, about 4.95, sits well above the median, about 3.04): we do more business with lower-profit customers (left of the mean), but we make more of our profit from our fewer, more outlying higher-profit customers. And our loyalty cards are more prevalent among lower-profit customers. Hmmm.