In [2]:
# https://www.youtube.com/watch?v=i3uMhH2xeOM&ab_channel=Buynomics

In [3]:
# https://ngugijoan.medium.com/pricing-on-point-the-art-and-science-of-dynamic-pricing-dd543bf80f01
# https://ngugijoan.medium.com/dynamic-pricing-implementation-through-data-science-price-optimization-strategies-56adab4d3176
# https://levelup.gitconnected.com/calculating-individual-price-elasticity-for-products-9787e3b82875
# https://www.kaggle.com/code/arnabchaki/flight-fare-prediction-0-96-r2-score?fbclid=IwZXh0bgNhZW0CMTAAAR05L4by3xyhImYsDOnF-ufsQQ7VbBefv8Bg3ECHy1JHCR_XmjSZIAKM7yE_aem_AWdIsN4qMSlU9R0FQsAR9y8hT_e_ggs_tIfnGdUdpwA4mLwPAbLPidigOsMMcKNF-4wyLjSg2hcmqzefdb3gX5bT
# https://datascience.oneoffcoder.com/pricing-elasticity-modeling.html#Random-forest

In [4]:
import warnings
warnings.simplefilter("ignore")

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [6]:
import duckdb

In [7]:
from sklearn.tree import DecisionTreeRegressor

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
from sklearn.metrics import make_scorer, mean_squared_error

# 10. Load data

In [10]:
df = pd.read_csv("online_retail_II_cleaned_with_cost_and_cat.csv")

In [11]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,profit_margin,Cost_price,Category
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,0.193525,5.605004,Home and Garden
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.47585,3.538012,Home and Garden
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,0.368677,4.26143,Home and Garden
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,0.303343,1.46298,Arts and Crafts
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,0.086449,1.141939,Arts and Crafts


In [12]:
df.shape

(397432, 11)

# 11. Feature engineering

In [13]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [14]:
df['yyyymmdd'] = df['InvoiceDate'].dt.strftime('%Y%m%d')

datetime features

In [15]:
def datetime_feature_extraction(df):
    # Day extraction
    df['dayofweek'] = df['InvoiceDate'].dt.dayofweek
    df['dayofmonth'] = df['InvoiceDate'].dt.day
    df['dayofyear'] = df['InvoiceDate'].dt.dayofyear
    df['is_weekend'] = (df['InvoiceDate'].dt.dayofweek >= 5).astype(int)
    df['weekday_weekend'] = df['InvoiceDate'].dt.dayofweek.apply(lambda x: 0 if x >= 5 else 1)

    # Week extraction
    df['weekofyear'] = df['InvoiceDate'].dt.isocalendar().week
    df['weekofmonth'] = np.ceil(df['dayofmonth'] / 7).astype(int)    
    
    # Month extraction
    df['month'] = df['InvoiceDate'].dt.month
    df['quarter'] = df['InvoiceDate'].dt.quarter
    df['days_in_month'] = df['InvoiceDate'].dt.days_in_month

    # Year extraction
    df['year'] = 2019 - df['InvoiceDate'].dt.year
    return df

In [16]:
df = datetime_feature_extraction(df)

group by

In [17]:
aggregation_functions = {
    'Quantity': 'sum',
    'Price': 'mean',
    'Cost_price':'mean',
    'dayofweek': 'max',
    'dayofmonth': 'max',
    'dayofyear': 'max',
    'is_weekend': 'max',
    'weekday_weekend': 'max',
    'weekofyear': 'max',
    'weekofmonth': 'max',
    'month': 'max',
    'quarter': 'max',
    'days_in_month': 'max',
    'year': 'max'
}

In [18]:
df = df.groupby(['StockCode','Category','Description','yyyymmdd']).agg(aggregation_functions).reset_index()

create change

In [19]:
df['change_qty'] = df['Quantity'].pct_change()

In [20]:
df['change_price'] = df['Price'].pct_change()

In [21]:
df.dropna(inplace=True)

In [22]:
df.sample(10)

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,Cost_price,dayofweek,dayofmonth,dayofyear,is_weekend,weekday_weekend,weekofyear,weekofmonth,month,quarter,days_in_month,year,change_qty,change_price
116853,22357,Food and Beverages,KINGS CHOICE BISCUIT TIN,20100413,7,4.25,3.129313,1,13,103,0,1,15,2,4,2,30,9,-0.766667,0.049383
79540,21931,Home and Garden,JUMBO STORAGE BAG SUKI,20100506,34,1.95,1.593603,3,6,126,0,1,18,1,5,2,31,9,-0.701754,0.04
137777,22566,Clothing and Accessories,FELTCRAFT HAIRBAND S/2 PINK/PURPLE,20100912,2,0.85,0.776379,6,12,255,1,0,36,2,9,3,30,9,-0.833333,0.0
113224,22326,Kitchen and Dining,ROUND SNACK BOXES SET4 WOODLAND,20100817,6,2.95,2.100246,1,17,229,0,1,33,3,8,3,31,9,0.0,0.0
116470,22355,Clothing and Accessories,CHARLOTTE BAGSUKI DESIGN,20100910,36,0.85,0.634109,4,10,253,0,1,36,2,9,3,30,9,-0.076923,0.0
21426,21001,Clothing and Accessories,ROSE DU SUD WASHBAG,20100228,1,5.95,3.801171,6,28,59,1,0,8,4,2,1,28,9,0.0,0.0
174611,72807B,Health and Beauty,SET/3 OCEAN SCENT CANDLE JEWEL BOX,20100128,4,4.25,4.161204,3,28,28,0,1,4,4,1,1,31,9,0.333333,0.0
12266,20734,Tools and Home Improvement,SILVER MINI TAPE MEASURE,20100913,250,0.72,0.419397,0,13,256,0,1,37,2,9,3,30,9,249.0,-0.152941
24289,21077,Kitchen and Dining,SET/20 WOODLAND PAPER NAPKINS,20100421,120,0.745,0.614004,2,21,111,0,1,16,3,4,2,30,9,9.0,-0.123529
31565,21181,Home and Garden,PLEASE ONE PERSONMETAL SIGN,20101115,108,2.06875,1.494757,0,15,319,0,1,46,3,11,4,30,9,5.352941,-0.014881


store dataframe for optimize

In [23]:
df.sort_values(by=['Category','Description', 'yyyymmdd'], ascending=[True,True, False], inplace=True)

In [24]:
df['row_number'] = df.groupby('Description').cumcount() + 1

In [25]:
df_lastest = df[df['row_number'] == 1].reset_index(drop=True)

In [26]:
df_lastest.shape

(3888, 21)

In [27]:
df_lastest.head()

Unnamed: 0,StockCode,Category,Description,yyyymmdd,Quantity,Price,Cost_price,dayofweek,dayofmonth,dayofyear,...,weekday_weekend,weekofyear,weekofmonth,month,quarter,days_in_month,year,change_qty,change_price,row_number
0,22282,Arts and Crafts,12 EGG HOUSE PAINTED WOOD,20101129,2,12.75,6.97871,0,29,333,...,1,48,5,11,4,30,9,-0.875,0.164384,1
1,21447,Arts and Crafts,12 IVORY ROSE PEG PLACE SETTINGS,20101206,12,1.25,1.214665,0,6,340,...,1,49,1,12,4,31,9,0.714286,0.0,1
2,21440,Arts and Crafts,12 MINI TOADSTOOL PEGS,20100701,13,1.25,0.974084,3,1,182,...,1,26,1,7,3,31,9,1.6,0.0,1
3,20976,Arts and Crafts,12 PENCILS SMALL TUBE POSY,20100813,8,0.65,0.607085,4,13,225,...,1,32,2,8,3,31,9,7.0,0.0,1
4,20974,Arts and Crafts,12 PENCILS SMALL TUBE SKULL,20101209,48,0.65,0.489724,3,9,343,...,1,49,2,12,4,31,9,1.086957,0.0,1


In [28]:
df.columns

Index(['StockCode', 'Category', 'Description', 'yyyymmdd', 'Quantity', 'Price',
       'Cost_price', 'dayofweek', 'dayofmonth', 'dayofyear', 'is_weekend',
       'weekday_weekend', 'weekofyear', 'weekofmonth', 'month', 'quarter',
       'days_in_month', 'year', 'change_qty', 'change_price', 'row_number'],
      dtype='object')

EDA

In [46]:
duckdb.query("""
select Category, count(*), min(change_qty), max(change_qty)
, min(change_price), max(change_price)
from df
group by Category
""")

┌────────────────────────────┬──────────────┬─────────────────────┬─────────────────┬─────────────────────┬────────────────────┐
│          Category          │ count_star() │   min(change_qty)   │ max(change_qty) │  min(change_price)  │ max(change_price)  │
│          varchar           │    int64     │       double        │     double      │       double        │       double       │
├────────────────────────────┼──────────────┼─────────────────────┼─────────────────┼─────────────────────┼────────────────────┤
│ Office Supplies            │          604 │ -0.9976851851851852 │           287.0 │             -0.8656 │   9.92436974789916 │
│ Tools and Home Improvement │         1864 │              -0.999 │           499.0 │ -0.9333333333333333 │                5.5 │
│ Food and Beverages         │         4101 │ -0.9992857142857143 │           209.5 │ -0.8584905660377359 │ 10.052631578947368 │
│ Kitchen and Dining         │        42844 │  -0.999857305936073 │          3191.0 │            

# 6. Model

split df by category

In [30]:
sorted(df['Category'].unique())

['Arts and Crafts',
 'Automotive',
 'Books and Stationery',
 'Clothing and Accessories',
 'Crafts and Hobbies',
 'Electronics',
 'Food and Beverages',
 'Health and Beauty',
 'Home and Garden',
 'Kitchen and Dining',
 'Office Supplies',
 'Pet',
 'Sports and Outdoors',
 'Tools and Home Improvement',
 'Toys and Games']

In [31]:
df_art_crafts = df[df['Category'] == 'Arts and Crafts']
df_automotive = df[df['Category'] == 'Automotive']
df_book_stationary = df[df['Category'] == 'Books and Stationery']
df_clothing_acc= df[df['Category'] == 'Clothing and Accessories']
df_crafts_hobbies = df[df['Category'] == 'Crafts and Hobbies']
df_electronics = df[df['Category'] == 'Electronics']
df_food_beverages = df[df['Category'] == 'Food and Beverages']
df_health_beauty = df[df['Category'] == 'Health and Beauty']
df_home_garden = df[df['Category'] == 'Home and Garden']
df_kitchen_dining = df[df['Category'] == 'Kitchen and Dining']
df_office_supplies = df[df['Category'] == 'Office Supplies']
df_pet = df[df['Category'] == 'Pet']
df_sports_outdoors = df[df['Category'] == 'Sports and Outdoors']
df_tools_home_improvement = df[df['Category'] == 'Tools and Home Improvement']
df_toys_games = df[df['Category'] == 'Toys and Games']

In [32]:
X_art_crafts = df_art_crafts.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_art_crafts = df_art_crafts['change_qty']

X_automotive = df_automotive.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_automotive = df_automotive['change_qty']

X_book_stationary = df_book_stationary.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_book_stationary = df_book_stationary['change_qty']

X_clothing_acc = df_clothing_acc.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_clothing_acc = df_clothing_acc['change_qty']

X_crafts_hobbies = df_crafts_hobbies.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_crafts_hobbies = df_crafts_hobbies['change_qty']

X_electronics = df_electronics.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_electronics = df_electronics['change_qty']

X_food_beverages = df_food_beverages.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_food_beverages = df_food_beverages['change_qty']

X_health_beauty = df_health_beauty.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_health_beauty = df_health_beauty['change_qty']

X_home_garden = df_home_garden.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_home_garden = df_home_garden['change_qty']

X_kitchen_dining = df_kitchen_dining.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_kitchen_dining = df_kitchen_dining['change_qty']

X_office_supplies = df_office_supplies.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_office_supplies = df_office_supplies['change_qty']

X_pet = df_pet.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_pet = df_pet['change_qty']

X_sports_outdoors = df_sports_outdoors.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_sports_outdoors = df_sports_outdoors['change_qty']

X_tools_home_improvement = df_tools_home_improvement.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_tools_home_improvement = df_tools_home_improvement['change_qty']

X_toys_games = df_toys_games.drop(columns=['StockCode','Category','Description','yyyymmdd','Quantity','Price','Cost_price','row_number','change_qty'])
y_toys_games = df_toys_games['change_qty']

In [34]:
**

SyntaxError: invalid syntax (402238255.py, line 1)

train

In [None]:
rt = DecisionTreeRegressor(random_state=42)

In [None]:
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [3, 5, 10]
}

In [None]:
grid_search = GridSearchCV(estimator=rt, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)

In [None]:
print("Training model for category: art_crafts")
grid_search.fit(X_art_crafts, y_art_crafts)
best_model_art_crafts = grid_search.best_estimator_
print(f"Best parameters for art_crafts: {grid_search.best_params_}")
print(f"Best score for art_crafts: {-grid_search.best_score_}")

print("Training model for category: automotive")
grid_search.fit(X_automotive, y_automotive)
best_model_automotive = grid_search.best_estimator_
print(f"Best parameters for automotive: {grid_search.best_params_}")
print(f"Best score for automotive: {-grid_search.best_score_}")

print("Training model for category: book_stationary")
grid_search.fit(X_book_stationary, y_book_stationary)
best_model_book_stationary = grid_search.best_estimator_
print(f"Best parameters for book_stationary: {grid_search.best_params_}")
print(f"Best score for book_stationary: {-grid_search.best_score_}")

print("Training model for category: clothing_acc")
grid_search.fit(X_clothing_acc, y_clothing_acc)
best_model_clothing_acc = grid_search.best_estimator_
print(f"Best parameters for clothing_acc: {grid_search.best_params_}")
print(f"Best score for clothing_acc: {-grid_search.best_score_}")

print("Training model for category: crafts_hobbies")
grid_search.fit(X_crafts_hobbies, y_crafts_hobbies)
best_model_crafts_hobbies = grid_search.best_estimator_
print(f"Best parameters for crafts_hobbies: {grid_search.best_params_}")
print(f"Best score for crafts_hobbies: {-grid_search.best_score_}")

print("Training model for category: electronics")
grid_search.fit(X_electronics, y_electronics)
best_model_electronics = grid_search.best_estimator_
print(f"Best parameters for electronics: {grid_search.best_params_}")
print(f"Best score for electronics: {-grid_search.best_score_}")

print("Training model for category: food_beverages")
grid_search.fit(X_food_beverages, y_food_beverages)
best_model_food_beverages = grid_search.best_estimator_
print(f"Best parameters for food_beverages: {grid_search.best_params_}")
print(f"Best score for food_beverages: {-grid_search.best_score_}")

print("Training model for category: health_beauty")
grid_search.fit(X_health_beauty, y_health_beauty)
best_model_health_beauty = grid_search.best_estimator_
print(f"Best parameters for health_beauty: {grid_search.best_params_}")
print(f"Best score for health_beauty: {-grid_search.best_score_}")

print("Training model for category: home_garden")
grid_search.fit(X_home_garden, y_home_garden)
best_model_home_garden = grid_search.best_estimator_
print(f"Best parameters for home_garden: {grid_search.best_params_}")
print(f"Best score for home_garden: {-grid_search.best_score_}")

print("Training model for category: kitchen_dining")
grid_search.fit(X_kitchen_dining, y_kitchen_dining)
best_model_kitchen_dining = grid_search.best_estimator_
print(f"Best parameters for kitchen_dining: {grid_search.best_params_}")
print(f"Best score for kitchen_dining: {-grid_search.best_score_}")

print("Training model for category: office_supplies")
grid_search.fit(X_office_supplies, y_office_supplies)
best_model_office_supplies = grid_search.best_estimator_
print(f"Best parameters for office_supplies: {grid_search.best_params_}")
print(f"Best score for office_supplies: {-grid_search.best_score_}")

print("Training model for category: pet")
grid_search.fit(X_pet, y_pet)
best_model_pet = grid_search.best_estimator_
print(f"Best parameters for pet: {grid_search.best_params_}")
print(f"Best score for pet: {-grid_search.best_score_}")

print("Training model for category: sports_outdoors")
grid_search.fit(X_sports_outdoors, y_sports_outdoors)
best_model_sports_outdoors = grid_search.best_estimator_
print(f"Best parameters for sports_outdoors: {grid_search.best_params_}")
print(f"Best score for sports_outdoors: {-grid_search.best_score_}")

print("Training model for category: tools_home_improvement")
grid_search.fit(X_tools_home_improvement, y_tools_home_improvement)
best_model_tools_home_improvement = grid_search.best_estimator_
print(f"Best parameters for tools_home_improvement: {grid_search.best_params_}")
print(f"Best score for tools_home_improvement: {-grid_search.best_score_}")

print("Training model for category: toys_games")
grid_search.fit(X_toys_games, y_toys_games)
best_model_toys_games = grid_search.best_estimator_
print(f"Best parameters for toys_games: {grid_search.best_params_}")
print(f"Best score for toys_games: {-grid_search.best_score_}")

In [None]:
df_final = pd.DataFrame()

# 7. Optimize

In [None]:
for index, row in df_lastest.iterrows():


    cat_product = df_lastest[index:index+1]['Category'].values[0]
    df_optimize = df_lastest[index:index+1][['dayofweek', 'dayofmonth', 'dayofyear', 'is_weekend',
                                             'weekday_weekend', 'weekofyear', 'weekofmonth', 'month',
                                             'quarter', 'days_in_month', 'year']]
    
    multipliers = [round(x * 0.01, 2) for x in range(-50,51)]
    df_optimize = df_optimize.loc[np.repeat(df_optimize.index.values, len(multipliers))]
    df_optimize['change_price'] = multipliers

    change_price_list = []

    for i in range(df_optimize.shape[0]):
        if row['Category'] == 'Arts and Crafts':
            demand_change = best_model_art_crafts.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Automotive':
            demand_change = best_model_automotive.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Books and Stationery':
            demand_change = best_model_book_stationary.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Clothing and Accessories':
            demand_change = best_model_clothing_acc.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Crafts and Hobbies':
            demand_change = best_model_crafts_hobbies.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Electronics':
            demand_change = best_model_electronics.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Food and Beverages':
            demand_change = best_model_food_beverages.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Health and Beauty':
            demand_change = best_model_health_beauty.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Home and Garden':
            demand_change = best_model_home_garden.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Kitchen and Dining':
            demand_change = best_model_kitchen_dining.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Office Supplies':
            demand_change = best_model_office_supplies.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Pet':
            demand_change = best_model_pet.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Sports and Outdoors':
            demand_change = best_model_sports_outdoors.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Tools and Home Improvement':
            demand_change = best_model_tools_home_improvement.predict(df_optimize.iloc[i:i+1])[0]
        elif row['Category'] == 'Toys and Games':
            demand_change = best_model_toys_games.predict(df_optimize.iloc[i:i+1])[0]
        else:
            demand_change = None

        change_price_list.append(demand_change)

    df_optimize['change_qty'] = change_price_list

    initial_price = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Price'].iloc[0]
    initial_cost_price = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Cost_price'].iloc[0]
    initial_quantity = df_lastest[df_lastest['StockCode'] == row['StockCode']]['Quantity'].iloc[0]

    df_optimize['price_new'] = initial_price * (1 + df_optimize['change_price'])
    df_optimize['qty_new'] = initial_quantity * (1 + df_optimize['change_qty'])


    df_optimize['additional_profit'] = df_optimize['qty_new'] * (df_optimize['price_new'] - initial_cost_price)
    df_optimize = df_optimize[df_optimize['additional_profit'] == df_optimize['additional_profit'].max()]

    df_final_append = df_lastest[index:index+1][['StockCode', 'Category', 'Description', 'yyyymmdd', 'Quantity', 'Price']]
    df_final_append['change_price'] = df_optimize['change_price'].values[0]
    df_final_append['change_qty'] = df_optimize['change_qty'].values[0]
    df_final_append['price_new'] = df_optimize['price_new'].values[0]
    df_final_append['qty_new'] = df_optimize['qty_new'].values[0]
    df_final_append['additional_profit'] = df_optimize['additional_profit'].values[0]

    df_final = pd.concat([df_final, df_final_append], ignore_index=True)

In [None]:
df_final.head()

In [None]:
rev_old = (df_final['Quantity']*df_final['Price']).sum()

In [None]:
rev_new = (df_final['qty_new']*df_final['price_new']).sum()

In [None]:
(rev_new-rev_old)/rev_old