In [17]:
# https://www.youtube.com/watch?v=i3uMhH2xeOM&ab_channel=Buynomics

In [18]:
# https://ngugijoan.medium.com/pricing-on-point-the-art-and-science-of-dynamic-pricing-dd543bf80f01
# https://ngugijoan.medium.com/dynamic-pricing-implementation-through-data-science-price-optimization-strategies-56adab4d3176
# https://levelup.gitconnected.com/calculating-individual-price-elasticity-for-products-9787e3b82875
# https://www.kaggle.com/code/arnabchaki/flight-fare-prediction-0-96-r2-score?fbclid=IwZXh0bgNhZW0CMTAAAR05L4by3xyhImYsDOnF-ufsQQ7VbBefv8Bg3ECHy1JHCR_XmjSZIAKM7yE_aem_AWdIsN4qMSlU9R0FQsAR9y8hT_e_ggs_tIfnGdUdpwA4mLwPAbLPidigOsMMcKNF-4wyLjSg2hcmqzefdb3gX5bT
# https://datascience.oneoffcoder.com/pricing-elasticity-modeling.html#Random-forest

In [19]:
import warnings
warnings.simplefilter("ignore")

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [21]:
from sklearn.tree import DecisionTreeRegressor

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
from sklearn.metrics import mean_squared_error

# 10. Load data

In [24]:
df = pd.read_csv("02. Labeled_retail_data.csv")

In [25]:
df.shape

(407629, 11)

# 11. Exploratory data analysis (EDA)

Count of Transactions by Description

In [26]:
category_counts = df['Category'].value_counts()
category_counts = category_counts.sort_values(ascending=False)

In [27]:
category_counts

Category
Home and Lifestyle      219974
Arts and Leisure         92162
Fashion and Travel       52586
Education and Office     24345
Health and Wellness      17759
Technology and More        803
Name: count, dtype: int64

Count of Transactions by Description

In [28]:
description_counts = df['Description'].value_counts()
description_counts = description_counts.sort_values(ascending=False)

In [29]:
description_counts

Description
WHITE HANGING HEART T-LIGHT HOLDER     3153
JUMBO BAG RED WHITE SPOTTY             1742
REGENCY CAKESTAND 3 TIER               1705
PACK OF 72 RETRO SPOT CAKE CASES       1586
STRAWBERRY CERAMIC TRINKET BOX         1407
                                       ... 
CHUNKY CRACKED GLAZE NECKLACE IVORY       1
GOLD CHRISTMAS STOCKING DECORATION        1
WHITE CHRISTMAS TREE 60CM                 1
S/16 BLACK SHINY/MAT BAUBLES              1
BAKING MOULD EASTER EGG MILK CHOC         1
Name: count, Length: 3979, dtype: int64

Count of Transactions by Invoice

In [30]:
invoice_counts = df['Invoice'].value_counts()
invoice_counts = invoice_counts.sort_values(ascending=False)

In [31]:
invoice_counts

Invoice
500356    270
511522    255
531382    251
507235    250
511051    248
         ... 
520316      1
520823      1
536568      1
534469      1
522060      1
Name: count, Length: 19178, dtype: int64

# 12. Feature engineering

In [32]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price,Category
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004,Home and Lifestyle
1,489559,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 12:55:00,6.95,17056.0,United Kingdom,0.400739,4.164866,Home and Lifestyle
2,489576,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,5,2009-12-01 13:38:00,7.95,15984.0,United Kingdom,0.06237,7.454161,Home and Lifestyle
3,489582,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,2,2009-12-01 13:47:00,7.95,14543.0,United Kingdom,0.448154,4.387179,Home and Lifestyle
4,489656,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 17:28:00,6.95,17428.0,United Kingdom,0.458394,3.764161,Home and Lifestyle


In [33]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [34]:
df['yyyymmdd'] = df['InvoiceDate'].dt.strftime('%Y%m%d')

datetime features

In [35]:
def datetime_feature_extraction(df):
    # Day extraction
    df['dayofweek'] = df['InvoiceDate'].dt.dayofweek
    df['dayofmonth'] = df['InvoiceDate'].dt.day
    df['dayofyear'] = df['InvoiceDate'].dt.dayofyear
    df['is_weekend'] = (df['InvoiceDate'].dt.dayofweek >= 5).astype(int)
    df['weekday_weekend'] = df['InvoiceDate'].dt.dayofweek.apply(lambda x: 0 if x >= 5 else 1)

    # Week extraction
    df['weekofyear'] = df['InvoiceDate'].dt.isocalendar().week
    df['weekofmonth'] = np.ceil(df['dayofmonth'] / 7).astype(int)    
    
    # Month extraction
    df['month'] = df['InvoiceDate'].dt.month
    df['quarter'] = df['InvoiceDate'].dt.quarter
    df['days_in_month'] = df['InvoiceDate'].dt.days_in_month

    # Year extraction
    df['year'] = df['InvoiceDate'].dt.year-2019
    return df

In [36]:
df = datetime_feature_extraction(df)

group by

In [37]:
aggregation_functions = {
    'Quantity': 'sum',
    'Price': 'mean',
    'Cost_price':'mean',
    'dayofweek': 'max',
    'dayofmonth': 'max',
    'dayofyear': 'max',
    'is_weekend': 'max',
    'weekday_weekend': 'max',
    'weekofyear': 'max',
    'weekofmonth': 'max',
    'month': 'max',
    'quarter': 'max',
    'days_in_month': 'max',
    'year': 'max'
}

In [38]:
df = df.groupby(['StockCode','Category','Description','yyyymmdd']).agg(aggregation_functions).reset_index()

Create change

In [39]:
df['change_qty'] = df['Quantity'].pct_change()

In [40]:
df['change_price'] = df['Price'].pct_change()

In [41]:
df.dropna(inplace=True)

In [42]:
df.sample(10)

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,Cost_price,dayofweek,dayofmonth,dayofyear,is_weekend,weekday_weekend,weekofyear,weekofmonth,month,quarter,days_in_month,year,change_qty,change_price
168818,47504K,Home and Lifestyle,ENGLISH ROSE GARDEN SECATEURS,20100713,1,1.95,1.382307,1,13,194,0,1,28,2,7,3,31,-9,0.0,0.0
207933,85035A,Home and Lifestyle,GARDENIA 3 WICK MORRIS BOXED CANDLE,20101007,5,4.25,2.722372,3,7,280,0,1,40,1,10,4,31,-9,-0.285714,0.0
81477,21923,Home and Lifestyle,RED/CREAM STRIPE FRINGE HAMMOCK,20100628,5,7.95,5.254492,0,28,179,0,1,26,4,6,2,30,-9,0.0,0.0
131597,22468,Home and Lifestyle,BABUSHKA LIGHTS STRING OF 10,20100927,3,6.75,4.013504,0,27,270,0,1,39,4,9,3,30,-9,-0.9375,0.134454
4608,17090A,Home and Lifestyle,LAVENDER INCENSE 40 CONES IN TIN,20101024,6,1.25,0.660168,6,24,297,1,0,42,4,10,4,31,-9,0.0,0.0
7081,20675,Home and Lifestyle,BLUE SPOTTY BOWL,20100729,11,1.25,0.926387,3,29,210,0,1,30,5,7,3,31,-9,-0.3125,0.0
150523,22666,Home and Lifestyle,RECIPE BOX PANTRY YELLOW DESIGN,20100915,15,2.95,1.961033,2,15,258,0,1,37,3,9,3,30,-9,-0.583333,0.047337
157496,22811,Home and Lifestyle,SET OF 6 T-LIGHTS CACTI,20101111,6,2.95,2.403044,3,11,315,0,1,45,2,11,4,30,-9,0.0,0.0
213824,85118,Home and Lifestyle,HEART T-LIGHT HOLDER,20100907,48,0.38,0.350074,1,7,250,0,1,36,1,9,3,30,-9,23.0,-0.696
1771,15056P,Fashion and Travel,EDWARDIAN PARASOL PINK,20100228,2,5.95,2.991742,6,28,59,1,0,8,4,2,1,28,-9,-0.666667,0.20202


# 13. Store current price and quantity

In [43]:
df.sort_values(by=['Category','Description', 'yyyymmdd'], ascending=[True,True, False], inplace=True)

In [44]:
df['row_number'] = df.groupby('Description').cumcount() + 1

In [45]:
df.head()

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,Cost_price,dayofweek,dayofmonth,dayofyear,...,weekday_weekend,weekofyear,weekofmonth,month,quarter,days_in_month,year,change_qty,change_price,row_number
164251,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101020,11,2.1,1.747029,2,20,293,...,1,42,3,10,4,31,-9,1.2,0.0,1
164250,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101010,5,2.1,1.878906,6,10,283,...,0,40,2,10,4,31,-9,1.5,0.0,2
164249,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101007,2,2.1,1.229558,3,7,280,...,1,40,1,10,4,31,-9,-0.833333,0.0,3
164248,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101006,12,2.1,1.068459,2,6,279,...,1,40,1,10,4,31,-9,0.0,0.0,4
164247,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101005,12,2.1,1.289836,1,5,278,...,1,40,1,10,4,31,-9,1.0,0.0,5


lastest

In [46]:
df_lastest = df[df['row_number'] == 1].reset_index(drop=True)

In [47]:
df_lastest.shape

(3979, 21)

In [48]:
df_lastest.head()

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,Cost_price,dayofweek,dayofmonth,dayofyear,...,weekday_weekend,weekofyear,weekofmonth,month,quarter,days_in_month,year,change_qty,change_price,row_number
0,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101020,11,2.1,1.747029,2,20,293,...,1,42,3,10,4,31,-9,1.2,0.0,1
1,22436,Arts and Leisure,12 COLOURED PARTY BALLOONS,20101209,20,0.65,0.329037,3,9,343,...,1,49,2,12,4,31,-9,0.666667,0.0,1
2,21440,Arts and Leisure,12 MINI TOADSTOOL PEGS,20100701,13,1.25,0.974084,3,1,182,...,1,26,1,7,3,31,-9,1.6,0.0,1
3,84465,Arts and Leisure,15 PINK FLUFFY CHICKS IN BOX,20100406,1,2.95,1.57527,1,6,96,...,1,14,1,4,2,30,-9,0.0,0.0,1
4,21458,Arts and Leisure,2 PICTURE BOOK EGGS EASTER BUNNY,20100827,12,1.25,0.749582,4,27,239,...,1,34,4,8,3,31,-9,11.0,0.0,1


# 6. Model

split df by category

In [49]:
df['Category'].unique()

array(['Arts and Leisure', 'Education and Office', 'Fashion and Travel',
       'Health and Wellness', 'Home and Lifestyle', 'Technology and More'],
      dtype=object)

In [50]:
df_art_leisure = df[df['Category'] == 'Arts and Leisure']

In [51]:
df_education_office = df[df['Category'] == 'Education and Office']

In [52]:
df_fashion_travel = df[df['Category'] == 'Fashion and Travel']

In [53]:
df_health_wellness = df[df['Category'] == 'Health and Wellness']

In [54]:
df_home_lifestyle = df[df['Category'] == 'Home and Lifestyle']

In [55]:
df_tech = df[df['Category'] == 'Technology and More']

StockCode	Category	Description	yyyymmdd	Quantity	Price	Cost_pricesplit X and y

In [56]:
X_art_leisure = df_art_leisure.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_art_leisure = df_art_leisure['change_qty']

In [57]:
print(X_art_leisure.shape, y_art_leisure.shape)

(51303, 12) (51303,)


In [58]:
X_education_office = df_education_office.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_education_office = df_education_office['change_qty']

In [59]:
print(X_education_office.shape, y_education_office.shape)

(15537, 12) (15537,)


In [60]:
X_fashion_travel = df_fashion_travel.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_fashion_travel = df_fashion_travel['change_qty']

In [61]:
print(X_fashion_travel.shape, y_fashion_travel.shape)

(27787, 12) (27787,)


In [62]:
X_health_wellness = df_health_wellness.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_health_wellness = df_health_wellness['change_qty']

In [63]:
print(X_health_wellness.shape, y_health_wellness.shape)

(9023, 12) (9023,)


In [64]:
X_home_lifestyle = df_home_lifestyle.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_home_lifestyle = df_home_lifestyle['change_qty']

In [65]:
print(X_home_lifestyle.shape, y_home_lifestyle.shape)

(119297, 12) (119297,)


In [66]:
X_tech = df_tech.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_tech = df_tech['change_qty']

In [67]:
print(X_tech.shape, y_tech.shape)

(625, 12) (625,)


model training (art and leisure)

In [68]:
rt = DecisionTreeRegressor(random_state=42)

In [69]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [70]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [71]:
grid_search.fit(X_art_leisure, y_art_leisure)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [72]:
grid_search.best_score_

-2526.668961303392

In [73]:
best_model_art_leisure = grid_search.best_estimator_

model training (education and office)

In [74]:
rt = DecisionTreeRegressor(random_state=42)

In [75]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [76]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [77]:
grid_search.fit(X_education_office, y_education_office)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [78]:
grid_search.best_score_

-5340.853958363729

In [79]:
best_model_education_office = grid_search.best_estimator_

model training (fashion and travel)

In [80]:
rt = DecisionTreeRegressor(random_state=42)

In [81]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [82]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [83]:
grid_search.fit(X_fashion_travel, y_fashion_travel)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [84]:
grid_search.best_score_

-275.82518333411514

In [85]:
best_model_fashion_travel = grid_search.best_estimator_

model training (health and wellness)

In [86]:
rt = DecisionTreeRegressor(random_state=42)

In [87]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [88]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [89]:
grid_search.fit(X_health_wellness, y_health_wellness)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [90]:
grid_search.best_score_

-2542.514179001852

In [91]:
best_model_health_wellness = grid_search.best_estimator_

model training (home and lifestyle)

In [92]:
rt = DecisionTreeRegressor(random_state=42)

In [93]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [94]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [95]:
grid_search.fit(X_home_lifestyle, y_home_lifestyle)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [96]:
grid_search.best_score_

-650.5956692248001

In [97]:
best_model_home_lifestyle = grid_search.best_estimator_

model training (tech)

In [98]:
rt = DecisionTreeRegressor(random_state=42)

In [99]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [100]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [101]:
grid_search.fit(X_tech, y_tech)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [102]:
grid_search.best_score_

-72.30354601035509

In [103]:
best_model_tech = grid_search.best_estimator_

dict all model

In [106]:
model_dict = {
    'Arts and Leisure': best_model_art_leisure,
    'Education and Office': best_model_education_office,
    'Fashion and Travel': best_model_fashion_travel,
    'Health and Wellness': best_model_health_wellness,
    'Home and Lifestyle': best_model_home_lifestyle,
    'Technology and More': best_model_tech,
}

# 7. Optimize

In [107]:
df_final = pd.DataFrame()

In [108]:
for index, row in df_lastest.iterrows():
    
    df_optimize = df_lastest[index:index+1][['dayofweek','dayofmonth','dayofyear','is_weekend',
                                        'weekday_weekend','weekofyear','weekofmonth','month',
                                        'quarter','days_in_month','year']]
    multipliers = [round(x * 0.01, 2) for x in range(-50, 51)]
    df_optimize = df_optimize.loc[np.repeat(df_optimize.index.values, len(multipliers))]
    df_optimize['change_price'] = multipliers

    change_price_list = []
    for i in range(df_optimize.shape[0]):
        if row['Category'] == 'Arts and Leisure':
            demand_change = best_model_art_leisure.predict(df_optimize.iloc[[i]])
        elif row['Category'] == 'Education and Office':
            demand_change = best_model_education_office.predict(df_optimize.iloc[[i]])
        elif row['Category'] == 'Fashion and Travel':
            demand_change = best_model_fashion_travel.predict(df_optimize.iloc[[i]])
        elif row['Category'] == 'Health and Wellness':
            demand_change = best_model_health_wellness.predict(df_optimize.iloc[[i]])
        elif row['Category'] == 'Home and Lifestyle':
            demand_change = best_model_home_lifestyle.predict(df_optimize.iloc[[i]])
        elif row['Category'] == 'Technology and More':
            demand_change = best_model_tech.predict(df_optimize.iloc[[i]])        

        change_price_list.append(demand_change[0])
        
    df_optimize['change_qty'] = change_price_list

    initial_price = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Price'].iloc[0]
    initial_cost_price = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Cost_price'].iloc[0]
    initial_quantity = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Quantity'].iloc[0]

    df_optimize['price_new'] = initial_price*(1+df_optimize['change_price'])
    df_optimize['qty_new'] = initial_quantity*(1+df_optimize['change_qty'])
    df_optimize['additional_profit'] = df_optimize['qty_new']*(df_optimize['price_new']-initial_cost_price)
    df_optimize = df_optimize[df_optimize['additional_profit'] == df_optimize['additional_profit'].max()]
    
    df_final_append = df_lastest[index:index+1][['StockCode','Category','Description','yyyymmdd','Quantity','Price']]
    df_final_append['change_price'] = df_optimize['change_price'].values[0]
    df_final_append['change_qty'] = df_optimize['change_qty'].values[0]
    df_final_append['price_new'] = df_optimize['price_new'].values[0]
    df_final_append['qty_new'] = df_optimize['qty_new'].values[0]
    df_final_append['additional_profit'] = df_optimize['additional_profit'].values[0]

    df_final = pd.concat([df_final, df_final_append], ignore_index=True)

In [109]:
df_final

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,change_price,change_qty,price_new,qty_new,additional_profit
0,35962,Arts and Leisure,12 ASS ZINC CHRISTMAS DECORATIONS,20101020,11,2.10,-0.03,18.821106,2.0370,218.032167,63.222961
1,22436,Arts and Leisure,12 COLOURED PARTY BALLOONS,20101209,20,0.65,-0.03,18.821106,0.6305,396.422122,119.506728
2,21440,Arts and Leisure,12 MINI TOADSTOOL PEGS,20100701,13,1.25,-0.03,18.821106,1.2125,257.674380,61.433668
3,84465,Arts and Leisure,15 PINK FLUFFY CHICKS IN BOX,20100406,1,2.95,-0.03,18.821106,2.8615,19.821106,25.494511
4,21458,Arts and Leisure,2 PICTURE BOOK EGGS EASTER BUNNY,20100827,12,1.25,-0.03,18.821106,1.2125,237.853273,110.106671
...,...,...,...,...,...,...,...,...,...,...,...
3974,84086C,Technology and More,PINK/PURPLE RETRO RADIO,20101125,6,2.95,-0.05,27.766667,2.8025,172.600000,49.366007
3975,22486,Technology and More,PLASMATRONIC LAMP,20101206,2,12.75,-0.05,27.766667,12.1125,57.533333,114.572906
3976,79151B,Technology and More,SILICON CUBE 25W BLUE,20101205,1,0.42,-0.05,27.766667,0.3990,28.766667,4.950716
3977,79157B,Technology and More,UBO-LIGHT TRIOBASE BLUE,20100207,1,0.85,0.50,0.985986,1.2750,1.985986,1.001151
