## Tabular data lab

In this session we're going to practice what we've learned about `pandas` to analyze a dataset of real life sales of a company in an international ecommerce company.

We'll start by importing the necessary libraries and loading the dataset.

In [1]:
import pandas as pd
import numpy as np

ecomm = pd.read_csv("ecomm.csv")
ecomm.head()

Unnamed: 0,product_id,reporting_date,country_code,units,price,sales,cogs,discounts,refunds,advertising_fees,returns,sde,fees
0,17,2022-01-01,us,48,31.429792,1508.63,-254.48,-121.2,-24.36,-305.42,3.0,385.53,-428.24
1,3,2022-01-01,de,1,46.21,46.21,-16.12,0.0,,-2.05,,15.79,-12.25
2,32,2022-01-01,de,39,21.330513,831.89,-178.62,-3.58,,-79.76,,269.24,-300.69
3,9,2022-01-01,de,6,8.048333,48.29,-8.82,-2.51,,-12.27,,4.89,-19.8
4,5,2022-01-01,nl,1,17.15,17.15,-5.47,-0.52,,-0.44,,4.91,-5.81


Columns description:

- `product_id`: Unique identifier for the product
- `reporting_date`: Date of the sale
- `country_code`: Country of the sale
- `units`: Number of units sold
- `price`: Average price of each unit sold
- `sales`: Total sales in EUR
- `cogs`: Total cost of goods sold in EUR
- `discounts`: Total discounts in EUR
- `refunds`: Total refunds in EUR
- `advertising_fees`: Total advertising fees in EUR
- `returns`: Total returns in EUR
- `sde`: Total Seller's Discretionary Earnings (profit) in EUR
- `fees`: Total fees in EUR

### 1. Convert the `date` column to a datetime object

We can use the `pd.to_datetime` function to convert the `date` column to a datetime object.

In [6]:
ecomm["reporting_date"].dtype

dtype('O')

In [7]:
type(ecomm["reporting_date"].values[0])

str

In [8]:
ecomm["reporting_date"] = pd.to_datetime(ecomm["reporting_date"])

ecomm["reporting_date"].dtype

dtype('<M8[ns]')

In [9]:
type(ecomm["reporting_date"].values[0])

numpy.datetime64

### 2. Out of the new `reporting_date` column, create a the following columns

- `year`
- `quarter`
- `month`
- `day`
- `date`
- `dayofweek`
- `dayofyear`
- `weekofyear`

In [10]:
ecomm["year"] = ecomm["reporting_date"].dt.year
ecomm["quarter"] = ecomm["reporting_date"].dt.quarter
ecomm["month"] = ecomm["reporting_date"].dt.month
ecomm["day"] = ecomm["reporting_date"].dt.day
ecomm["date"] = ecomm["reporting_date"].dt.date
ecomm["dayofweek"] = ecomm["reporting_date"].dt.dayofweek
ecomm["dayofyear"] = ecomm["reporting_date"].dt.dayofyear
ecomm["weekofyear"] = ecomm["reporting_date"].dt.isocalendar().week

ecomm


Unnamed: 0,product_id,reporting_date,country_code,units,price,sales,cogs,discounts,refunds,advertising_fees,...,sde,fees,year,quarter,month,day,date,dayofweek,dayofyear,weekofyear
0,17,2022-01-01,us,48,31.429792,1508.63,-254.48,-121.20,-24.36,-305.42,...,385.53,-428.24,2022,1,1,1,2022-01-01,5,1,52
1,3,2022-01-01,de,1,46.210000,46.21,-16.12,0.00,,-2.05,...,15.79,-12.25,2022,1,1,1,2022-01-01,5,1,52
2,32,2022-01-01,de,39,21.330513,831.89,-178.62,-3.58,,-79.76,...,269.24,-300.69,2022,1,1,1,2022-01-01,5,1,52
3,9,2022-01-01,de,6,8.048333,48.29,-8.82,-2.51,,-12.27,...,4.89,-19.80,2022,1,1,1,2022-01-01,5,1,52
4,5,2022-01-01,nl,1,17.150000,17.15,-5.47,-0.52,,-0.44,...,4.91,-5.81,2022,1,1,1,2022-01-01,5,1,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20594,32,2022-11-15,de,1,23.520000,23.52,-4.58,0.00,,,...,8.58,-10.36,2022,4,11,15,2022-11-15,1,319,46
20595,24,2022-11-15,de,1,33.610000,33.61,-11.07,0.00,,,...,13.35,-9.19,2022,4,11,15,2022-11-15,1,319,46
20596,9,2022-11-15,de,1,8.390000,8.39,-1.56,0.00,,,...,3.07,-3.76,2022,4,11,15,2022-11-15,1,319,46
20597,34,2022-11-15,fr,1,8.330000,8.33,-2.09,0.00,,,...,2.94,-3.30,2022,4,11,15,2022-11-15,1,319,46


### 3. Using `lambda` with `map` to create new columns

We can use the `map` function to apply a function to each element of a column.


In [11]:
# create column with the name of the month

dict_month_name = {1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June", 7: "July", 8: "August", 9: "September", 10: "October", 11: "November", 12: "December"}

ecomm["month_name"] = ecomm["month"].map(lambda m: dict_month_name[m])

ecomm.sample(5)

Unnamed: 0,product_id,reporting_date,country_code,units,price,sales,cogs,discounts,refunds,advertising_fees,...,fees,year,quarter,month,day,date,dayofweek,dayofyear,weekofyear,month_name
13345,32,2022-08-01,de,26,23.640385,614.65,-119.08,-10.4,-21.73,-61.33,...,-217.75,2022,3,8,1,2022-08-01,0,213,31,August
456,4,2022-01-09,de,18,21.195556,381.52,-99.54,-2.03,-23.39,-70.73,...,-89.1,2022,1,1,9,2022-01-09,6,9,1,January
9791,8,2022-06-12,es,1,16.52,16.52,-5.57,0.0,,-12.14,...,-5.58,2022,2,6,12,2022-06-12,6,163,23,June
8566,22,2022-05-26,it,4,24.99,99.96,-18.68,-1.64,,-25.99,...,-38.2,2022,2,5,26,2022-05-26,3,146,21,May
7181,22,2022-05-04,it,1,28.68,28.68,-4.67,0.0,,-13.05,...,-10.13,2022,2,5,4,2022-05-04,2,124,18,May


### 4. Answer the following questions

- Total sales per year and product_id
- Mean sales per quarter and country
- Minimum advertising spend per week, country and product_id
- Best selling product in US and Germany in May 2022
- Which country had the lowest ratio of advertising spend to sales in Q1 2022
- Which product is the one with the highest ratio of advertising spend to sales in Q1 2022
- Which product and country had the best sde to sales ratio in H1 2022
- Which day of the week is the best for sales in the US
- Average discount in the 4 dates with more discount, vs average discount in 2022
- Which are the products that represent 80% of the total sales in 2022
- Which product has the worst/best cogs to price ratio?


In [14]:
ecomm.groupby(["year","product_id"])["sales"].count().reset_index()

Unnamed: 0,year,product_id,sales
0,2022,0,523
1,2022,1,469
2,2022,2,555
3,2022,3,572
4,2022,4,683
5,2022,5,576
6,2022,6,342
7,2022,7,395
8,2022,8,1091
9,2022,9,408


In [17]:
(
    ecomm.groupby(["weekofyear","country_code","product_id"])
["advertising_fees"].min()
)

weekofyear  country_code  product_id
1           de            0             -18.32
                          1             -13.46
                          2             -12.22
                          3             -19.65
                          4             -74.03
                                         ...  
52          us            21           -108.17
                          22            -65.70
                          26           -301.12
                          30             -6.36
                          34            -38.18
Name: advertising_fees, Length: 4834, dtype: float64

In [35]:
filtered_ecomm = ecomm[
    (ecomm["year"]==2022)
    & (ecomm["month_name"]=="May")
    & (ecomm["country_code"].isin(["us","de"]))
]
(filtered_ecomm.groupby("product_id")["units"].sum()).max()

4725

In [36]:
ratio_ecomm = ecomm[(ecomm["year"]==2022)
                    & (ecomm["quarter"]==1)]
df_by_country = ratio_ecomm.groupby(["country_code"])[["advertising_fees","sales"]].sum()
df_by_country["ratio_ad_serie"] = (df_by_country["advertising_fees"].abs()/df_by_country["sales"])
df_by_country

Unnamed: 0_level_0,advertising_fees,sales,ratio_ad_serie
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
de,-98793.46,815486.98,0.121147
es,-587.44,7294.57,0.080531
fr,-2451.32,24623.87,0.099551
it,-2927.41,20842.01,0.140457
nl,-62.37,4630.24,0.01347
pl,0.0,428.57,0.0
se,-4.6,774.11,0.005942
tr,0.0,25.75,0.0
us,-44499.36,413005.91,0.107745


In [66]:
ratio_ecomm = ecomm[(ecomm["year"]==2022)
                    & (ecomm["quarter"]==1)]
df_by_product = ratio_ecomm.groupby(["product_id"])[["advertising_fees","sales"]].sum()
df_by_product["ratio_ad_serie"] = (df_by_country["advertising_fees"].abs()/df_by_country["sales"])
df_by_product

Unnamed: 0_level_0,advertising_fees,sales,ratio_ad_serie
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-1177.99,20555.19,0.057309
1,-1017.74,18537.35,0.054902
2,-817.37,14780.68,0.0553
3,-1655.04,28456.06,0.058161
4,-2913.22,21226.25,0.137246
5,-2419.2,16268.81,0.148702
6,-1010.0,4415.63,0.228733
7,-2733.32,11834.91,0.230954
8,-4379.87,20375.39,0.214959
9,-3834.29,14890.12,0.257506


In [86]:
ratio_ecomm = ecomm[(ecomm["year"]==2022)]
sortedlist = ratio_ecomm.sort_values(by=['discounts'],ascending=True)      
newdf =sortedlist['discounts']
total = newdf.iloc[0:4].mean()
print(total)
df2 = newdf.mean()
print(df2)


-1087.275
-6.829024224476952


In [None]:
ratio_ecomm = ecomm[(ecomm["year"]==2022)
                    & (ecomm["quarter"]==1)]
df_by_product = ratio_ecomm.groupby(["product_id"])[["advertising_fees","sales"]].sum()
df_by_product["ratio_ad_serie"] = (df_by_country["advertising_fees"].abs()/df_by_country["sales"])
df_by_product

In [2]:
units_sum = ecomm.groupby(["product_id"])["units"].sum().reset_index()
units_sum.head()
units_sum_sorted = units_sum.sort_values(by='units',ascending=False)
units_sum_sorted["cumulative"]= (units_sum_sorted["units"].cumsum())
units_sum_sorted
sum_units = units_sum_sorted['units'].sum()
sum_units
units_sum_sorted["percentage"]= ((units_sum_sorted["cumulative"]/sum_units)*100)
units_sum_sorted=units_sum_sorted.reset_index()
units_sum_sorted.head(16)

Unnamed: 0,index,product_id,units,cumulative,percentage
0,18,18,33814,33814,15.227142
1,34,34,18145,51959,23.39821
2,17,17,17731,69690,31.382845
3,26,26,13830,83520,37.610779
4,24,24,13360,96880,43.627062
5,32,32,13089,109969,49.521309
6,22,22,9830,119799,53.947961
7,20,20,9774,129573,58.349395
8,4,4,8333,137906,62.101917
9,25,25,7599,145505,65.523903
