In [None]:
## HED00089
## https://hedonism.co.uk/product/springbank-24-year-old-single-cask-uk-customers-1994-whisky

In [2]:
# Objective

# 1. Removing duplicates from the stocks table
# --- Check row count by import date
# --- Identify duplicates and de-duplicate them by averaging availability, use stocks_table schema
# --- Export new_28_df to correct_rows_2024_03_28.csv
# --- Delete the relevant codes from the stocks table

# 2. Re-creating the csv file of 2024-03-28 so it doesn't contain duplicates
# --- Insert the new correct_rows_2024_03_28.csv with the de-duplicated rows into the stocks table
# --- Export the stocks table for the 28th into its own CSV, remember to use the original schema and column names
# --- UPDATE import_dates
# --- Check URLs have been populated by the main.py process

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import duckdb


# Location of the on-disk DuckDB database file used throughout this notebook.
db_path = '/Users/MacUser/hedonism-wines_fresh/database.db'  # adjust to your local checkout

# Open a read-write connection to that database file. Because a path is
# supplied, this is a file-backed database, not an in-memory one.
conn = duckdb.connect(
    database=db_path,
    read_only=False,
)


In [6]:
	results = conn.execute("""
                       WITH todays_items AS (             
                        SELECT code, title, url, price_gbp, availability, import_date 
                        FROM whisky_stocks_table
                        WHERE import_date = CURRENT_DATE()
                       ),
                       yesterdays_items AS (
                        SELECT code, title, url, price_gbp, availability, import_date  
                        FROM whisky_stocks_table
                        WHERE import_date = CURRENT_DATE() -1
                       )
                       SELECT CAST(CURRENT_DATE() AS DATE) AS import_date, a.code,
						a.title, 
						a.url, 
						a.price_gbp,
						a.today_availability availability,
                        CAST(a.yesterday_availability AS FLOAT) - CAST(a.today_availability AS FLOAT) units_sold
                       FROM 
                       (
                       SELECT 
                       CAST (y.code AS STRING) ||'-'|| CAST (y.availability AS STRING) yesterday_code_availability,
                       CAST (t.code AS STRING) ||'-'|| CAST (t.availability AS STRING) today_code_availability,
                       y.code,
                       y.title,
                       y.url,
                       y.price_gbp,
                       y.availability yesterday_availability,
                       t.availability today_availability
                       FROM yesterdays_items y LEFT OUTER JOIN todays_items t
                       ON y.code = t.code
                       ) a
                       WHERE a.today_code_availability <> yesterday_code_availability
					   AND CAST(a.yesterday_availability AS FLOAT) - CAST(a.today_availability AS FLOAT) > 0
                       ORDER BY price_gbp DESC
						    """).fetchdf()

	# Convert the results to a DataFrame
	df = pd.DataFrame(results)
df

Unnamed: 0,import_date,code,title,url,price_gbp,availability,units_sold
0,2025-01-07,HED89678,Macallan 18 Year Old Sherry Oak 1997,https://hedonism.co.uk/product/macallan-18-yea...,1200.0,2.0,1.0
1,2025-01-07,HED00263,Glenfarclas 24 Year Old Millennium Cask (Third...,https://hedonism.co.uk/product/glenfarclas-24-...,510.0,17.0,1.0
2,2025-01-07,HED3140,Bowmore 25 Year Old,https://hedonism.co.uk/product/bowmore-25-year...,450.0,3.0,2.0
3,2025-01-07,HED40141,Macallan 18 Year Old Double Cask,https://hedonism.co.uk/product/macallan-18-yea...,310.0,32.0,1.0
4,2025-01-07,HED69377,Hazelburn 10 Year Old Single Cask,https://hedonism.co.uk/product/hazelburn-10-ye...,295.0,3.0,1.0
5,2025-01-07,HED79481,Daftmill Winter Distillation 2007,https://hedonism.co.uk/product/daftmill-winter...,295.0,1.0,1.0
6,2025-01-07,HED21889,Bladnoch 23 Year Old Fresh Amontillado Cask (E...,https://hedonism.co.uk/product/bladnoch-23-yea...,295.0,229.0,3.0
7,2025-01-07,HED47173,Springbank 12 Year Old Cask Strength 57.2%,https://hedonism.co.uk/product/springbank-12-y...,275.0,15.0,1.0
8,2025-01-07,HED00230,Bladnoch 22 Year Old Canasta Sherry Butt 2001,https://hedonism.co.uk/product/bladnoch-22-yea...,250.0,1.0,1.0
9,2025-01-07,HED3110,Balvenie 21 Year Old Portwood,https://hedonism.co.uk/product/balvenie-21-yea...,243.0,12.0,6.0


In [None]:
    # Get today's date
from datetime import datetime
import os

# Suffix such as "_2025_01_07", built from the local date at run time.
today_date_file_name = datetime.now().strftime("_%Y_%m_%d")

# Output file name, e.g. "sales_2025_01_07.csv".
filename = f"sales{today_date_file_name}.csv"

# Folder that receives the daily sales exports.
folder_path = "/Users/MacUser/hedonism-wines_fresh/sales_data"

# Make sure the target folder exists before writing into it.
os.makedirs(folder_path, exist_ok=True)

# Build the path with os.path.join. Plain string concatenation dropped the
# separator and wrote ".../sales_datasales_<date>.csv" beside the folder
# instead of "sales_<date>.csv" inside it.
df.to_csv(os.path.join(folder_path, filename), index=False)

# Normalise import_date to a plain "YYYY-MM-DD" string.
# NOTE(review): this runs after the export, so it only affects later uses of
# `df`, not the CSV written above - confirm that ordering is intended.
df['import_date'] = pd.to_datetime(df['import_date'])
df['import_date'] = df['import_date'].dt.date
df['import_date'] = df['import_date'].astype(str).str[:10]

: 

In [3]:
	results = conn.execute("""
                       WITH todays_items AS (             
                        SELECT code, title, url, price_gbp, availability, import_date 
                        FROM whisky_stocks_table
                        WHERE import_date = CURRENT_DATE()
                       ),
                       yesterdays_items AS (
                        SELECT code, title, url, price_gbp, availability, import_date  
                        FROM whisky_stocks_table
                        WHERE import_date = CURRENT_DATE() -1
                       )
                       SELECT CAST(CURRENT_DATE() AS DATE) AS import_date, a.code,
						a.title, 
						a.url, 
						a.price_gbp,
						a.today_availability availability,
                        CAST(a.yesterday_availability AS FLOAT) - CAST(a.today_availability AS FLOAT) units_sold
                       FROM 
                       (
                       SELECT 
                       CAST (y.code AS STRING) ||'-'|| CAST (y.availability AS STRING) yesterday_code_availability,
                       CAST (t.code AS STRING) ||'-'|| CAST (t.availability AS STRING) today_code_availability,
                       y.code,
                       y.title,
                       y.url,
                       y.price_gbp,
                       y.availability yesterday_availability,
                       t.availability today_availability
                       FROM yesterdays_items y LEFT OUTER JOIN todays_items t
                       ON y.code = t.code
                       ) a
                       WHERE a.today_code_availability <> yesterday_code_availability
					   AND CAST(a.yesterday_availability AS FLOAT) - CAST(a.today_availability AS FLOAT) > 0
                       ORDER BY price_gbp DESC
                """).fetchdf()

	# Convert the results to a DataFrame
	df = pd.DataFrame(results)

In [4]:
# Preview the first rows of whatever `df` currently holds.
df.head()

Unnamed: 0,import_date,code,title,url,price_gbp,availability,units_sold


In [12]:
# Inspect the rows of whisky_stocks_table that have no price_incl_vat, showing
# the two raw price columns next to their COALESCE-d fallbacks.
null_vat_price_sql = """                
                        SELECT price_gbp, price_incl_vat,
                       COALESCE(price_gbp, price_incl_vat) AS price_gbp,
                       COALESCE(price_incl_vat, 0) price_incl_vat
                       FROM whisky_stocks_table WHERE price_incl_vat IS NULL
            """
results = conn.execute(null_vat_price_sql).fetchdf()

# Re-wrap the fetched frame and preview it.
df = pd.DataFrame(data=results)
df.head()

Unnamed: 0,price_gbp,price_incl_vat,price_gbp_1,price_incl_vat_1
0,520.0,,520.0,0.0
1,2400.0,,2400.0,0.0
2,7250.0,,7250.0,0.0
3,195.0,,195.0,0.0
4,4895.0,,4895.0,0.0


: 

In [21]:
# Drop every backtick from the titles; regex=False spells out that the
# pattern is a literal character rather than a regular expression.
df['title'] = df['title'].str.replace("`", '', regex=False)

In [22]:
# Display the title column to check the result of the backtick removal.
df['title'] 

0                           Lossit Archivists Selection
1                                  Ichiros Malt & Grain
2                  Ushers Old Vatted Glenlivet c. 1920s
3                               Borderers Blend c. 1910
4                               Bowmore Devils Cask III
                            ...                        
71           Lagavulin Distillers Edition LG.4/490 1986
72    Balvenie 15 Year Old Craftsmans Reserve No.1 C...
73           Lagavulin Distillers Edition LG.4/501 1995
74    Unnamed Islay (LA) 31 Year Old Symingtons Choi...
75    Dalmore 16 Year Old Luminary No.2 2024 Edition...
Name: title, Length: 76, dtype: object

In [25]:
# Look at the first 30 titles in full rather than the truncated Series repr.
df['title'].head(30)

0                           Lossit Archivists Selection
1                                  Ichiros Malt & Grain
2                  Ushers Old Vatted Glenlivet c. 1920s
3                               Borderers Blend c. 1910
4                               Bowmore Devils Cask III
5                               Ballantines 40 Year Old
6                            Glenlivet Founders Reserve
7                   Ichiros Malt Hanyu 15 Year Old 2000
8     Bowmore 42 Year Old Black The Trilogy Edition ...
9     Ardbeg 22 Year Old Managers Choice Sherry Butt...
10                                     Ichiros Malt MWR
11                       Ichiros Malt Wine Wood Reserve
12                                      Tomintoul TLath
13                  Knockando 12 Year Old Managers Dram
14                           Bains Cape Mountain Whisky
15             Teaninich 17 Year Old Managers Dram 2001
16     Ichiros Malt Hanyu Ten Diamonds 21 Year Old 1990
17                                  Bowmore Devi

In [26]:
import re

# Compiled once: matches any character that is neither a word character
# (letters, digits, underscore) nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    A punctuation character is anything that is not a word character
    (letter, digit, underscore) and not whitespace. Characters are deleted,
    not replaced, so whitespace on either side of a removed symbol is kept
    as-is (``"Malt & Grain"`` becomes ``"Malt  Grain"``).

    Parameters
    ----------
    text : str or any
        The value to clean. Non-string values (``None``, ``NaN`` from a
        pandas column with missing titles, ...) are returned unchanged
        instead of raising ``TypeError``, so the function is safe to use with
        ``Series.apply`` on columns that contain missing data.

    Returns
    -------
    str or any
        The cleaned string, or the original object when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _PUNCTUATION_RE.sub('', text)

# Example usage:
#text_with_punctuation = "Hello, world! How are you doing today?"
#clean_text = remove_punctuation(text_with_punctuation)
#print(clean_text)
# Clean every title element-wise with remove_punctuation.
df['title'] = df['title'].map(remove_punctuation)

In [27]:
df['title']

0                           Lossit Archivists Selection
1                                   Ichiros Malt  Grain
2                   Ushers Old Vatted Glenlivet c 1920s
3                                Borderers Blend c 1910
4                               Bowmore Devils Cask III
                            ...                        
71             Lagavulin Distillers Edition LG4490 1986
72    Balvenie 15 Year Old Craftsmans Reserve No1 Co...
73             Lagavulin Distillers Edition LG4501 1995
74    Unnamed Islay LA 31 Year Old Symingtons Choice...
75    Dalmore 16 Year Old Luminary No2 2024 Edition ...
Name: title, Length: 76, dtype: object

: 

In [8]:
# 2. Build the replacement rows for 2024-03-28: take every code that occurs more
#    than once on that date and collapse it by averaging availability (rounded)
#    and price, grouped on the descriptive columns.
dedup_sql = """                
                        WITH duplicates as (
                                              SELECT COUNT (*) duplicate_count, code	  
                                            FROM stocks_table
                                            WHERE import_date = '2024-03-28'
                                            GROUP BY code
                                            HAVING COUNT (*) > 1)
                       SELECT 
                       s.code, --Code
                       s.title, -- Title
                       s.size, -- Size
                       s.style, --Style
                       s.country, --Country
                       s.type, --Group
                       ROUND(AVG(s.availability),0) availability, --Available
                       AVG(s.price_gbp) price_gbp --"Price (GBP)"
                       FROM stocks_table s JOIN duplicates d on s.code = d.code
                       WHERE import_date = '2024-03-28'
                       GROUP BY 
                      s.code,
                       s.title,
                       s.size,
                       s.style,
                       s.country,
                       s.type
            """
results = conn.execute(dedup_sql).fetchdf()

# Keep the collapsed rows under their own name and display them.
new_28_df = pd.DataFrame(data=results)
new_28_df

Unnamed: 0,code,title,size,style,country,type,availability,price_gbp
0,HED0112,Highland Park 18 Year Old,70cl,Spirits,Scotland,Whisky,12.0,125.000000
1,HED0544,Leoville Las Cases 2005,75cl,Red,France,Bordeaux,3.0,340.000000
2,HED0581,Margaux 2005,75cl,Red,France,Bordeaux,10.0,1050.000000
3,HED0699,Petrus 1995,75cl,Red,France,Bordeaux,20.0,4980.000000
4,HED1798,Krug Clos du Mesnil 2000,75cl,White,France,Champagne,5.0,1380.000000
...,...,...,...,...,...,...,...,...
2616,HED87539,Rieussec Half 2007,37.5cl,White,France,Sauternes & Barsac,5.0,40.000000
2617,HED21088,Sheringham Distillery Seaside Gin,70cl,Spirits,Canada,Gin,2.0,38.900002
2618,HED96656,Puligny Montrachet Champs Gains Borgeot 2002,75cl,White,France,Burgundy,2.0,72.099998
2619,HED24318,Von Buhl Riesling Bone Dry QbA Magnum 2022,150cl,White,Germany,Germany,6.0,40.900002


In [9]:
# 3. Persist the collapsed (previously duplicated) rows to their own CSV file,
#    without the DataFrame index.
new_28_df.to_csv(
    path_or_buf='/Users/MacUser/hedonism-wines_app/correct_rows_2024_03_28.csv',
    index=False,
)

In [10]:
# 4. Execute DELETE queries and commit the changes
#
# Removes from stocks_table every row dated 2024-03-28 whose code appears more
# than once on that date. The `duplicates` CTE finds those codes; the `main` CTE
# re-derives the averaged rows, but only its `code` column is consumed by the
# DELETE's IN (...) filter.
# DESTRUCTIVE: all copies of a duplicated code are deleted, not just the extras.
# The f-prefix on the string is inert - the query contains no placeholders.
delete_query_1 = f"""

                        WITH duplicates as (
                                              SELECT COUNT (*) duplicate_count, code	  
                                            FROM stocks_table
                                            WHERE import_date = '2024-03-28'
                                            GROUP BY code
                                            HAVING COUNT (*) > 1),
                       main AS (
                       SELECT 
                       s.code, --Code
                       s.title, -- Title
                       s.size, -- Size
                       s.style, --Style
                       s.country, --Country
                       s.type, --Group
                       ROUND(AVG(s.availability),0) availability, --Available
                       AVG(s.price_gbp) price_gbp --"Price (GBP)"
                       FROM stocks_table s JOIN duplicates d on s.code = d.code
                       WHERE import_date = '2024-03-28'
                       GROUP BY 
                      s.code,
                       s.title,
                       s.size,
                       s.style,
                       s.country,
                       s.type)
                    DELETE FROM 
                    stocks_table 
                    WHERE import_date = '2024-03-28'
                    AND code IN (
                        SELECT code from main)
                        """
conn.execute(delete_query_1)

<duckdb.duckdb.DuckDBPyConnection at 0x7fb6228c2cb0>

In [11]:
# Commit on the shared connection after the DELETE above.
conn.commit()

<duckdb.duckdb.DuckDBPyConnection at 0x7fb6228c2cb0>

In [13]:
# 5. Load the de-duplicated rows back from the CSV written earlier so they can
#    be re-inserted into the main table, and preview them.
df = pd.read_csv(
    filepath_or_buffer='/Users/MacUser/hedonism-wines_app/correct_rows_2024_03_28.csv',
)
df.head()

Unnamed: 0,code,title,size,style,country,type,availability,price_gbp
0,HED0112,Highland Park 18 Year Old,70cl,Spirits,Scotland,Whisky,12.0,125.0
1,HED0544,Leoville Las Cases 2005,75cl,Red,France,Bordeaux,3.0,340.0
2,HED0581,Margaux 2005,75cl,Red,France,Bordeaux,10.0,1050.0
3,HED0699,Petrus 1995,75cl,Red,France,Bordeaux,20.0,4980.0
4,HED1798,Krug Clos du Mesnil 2000,75cl,White,France,Champagne,5.0,1380.0


In [14]:
    # Get the column names from the DataFrame
# The DataFrame's own column order drives both the column list and the number
# of "?" placeholders, so the INSERT always matches the frame's shape.
columns = df.columns.tolist()
column_names = ", ".join(columns)
parameter_placeholders = ", ".join(["?"] * len(columns))

# One plain Python list of values per DataFrame row.
records = df.values.tolist()

# Target table for the re-insert.
table_name = 'stocks_table'

# Assemble the parameterised INSERT statement.
sql_insert = "INSERT INTO {} ({}) VALUES ({})".format(
    table_name, column_names, parameter_placeholders
)

# Run the INSERT once per record.
conn.executemany(sql_insert, records)

<duckdb.duckdb.DuckDBPyConnection at 0x7fb6228c2cb0>

In [17]:
# Back-fill import_date with 2024-03-28 on rows that have none.
# NOTE: the WHERE clause matches every stocks_table row whose import_date is
# NULL, not only the rows re-inserted by this notebook.
import_date_update_query = """
                        UPDATE stocks_table
                        SET import_date = '2024-03-28'
                        WHERE import_date IS NULL
                        """
conn.execute(import_date_update_query)

<duckdb.duckdb.DuckDBPyConnection at 0x7fb6228c2cb0>

In [18]:
# Commit on the shared connection after the INSERT and UPDATE above.
conn.commit()

<duckdb.duckdb.DuckDBPyConnection at 0x7fb6228c2cb0>

In [42]:
# Read back the full 2024-03-28 snapshot from stocks_table, aliasing each column
# to the header used in the original stock-list CSV (Code, Title, ..., "Price (GBP)").
export_sql = """                
                       SELECT
                      s.code Code,
                      s.title Title,
                      s.size Size,
                      s.style Style,
                      s.country Country,
                      s.type AS Group,
                      s.availability Available,
                      s.price_gbp "Price (GBP)"
                       FROM stocks_table s
                       WHERE import_date = '2024-03-28'
            """
results = conn.execute(export_sql).fetchdf()

# Re-wrap the fetched frame and display it.
df = pd.DataFrame(data=results)
df

Unnamed: 0,Code,Title,Size,Style,Country,Group,Available,Price (GBP)
0,HED0037,Louis XIII,70cl,Spirits,France,Cognac,4.0,2950.000000
1,HED0059,Ardbeg 17 Year Old,70cl,Spirits,Scotland,Whisky,2.0,595.000000
2,HED0062,Ardbeg Lord of the Isles,70cl,Spirits,Scotland,Whisky,2.0,2400.000000
3,HED0068,Balvenie 40 Year Old,70cl,Spirits,Scotland,Whisky,4.0,7250.000000
4,HED0097,Glenfiddich 40 Year Old Cumulative Time,70cl,Spirits,Scotland,Whisky,1.0,3700.000000
...,...,...,...,...,...,...,...,...
10798,HED87539,Rieussec Half 2007,37.5cl,White,France,Sauternes & Barsac,5.0,40.000000
10799,HED21088,Sheringham Distillery Seaside Gin,70cl,Spirits,Canada,Gin,2.0,38.900002
10800,HED96656,Puligny Montrachet Champs Gains Borgeot 2002,75cl,White,France,Burgundy,2.0,72.099998
10801,HED24318,Von Buhl Riesling Bone Dry QbA Magnum 2022,150cl,White,Germany,Germany,6.0,40.900002


In [43]:
# Write the 2024-03-28 snapshot to its own stock-list CSV, without the index.
df.to_csv(
    path_or_buf='/Users/MacUser/hedonism-wines_app/full-stock-list_2024_03_28.csv',
    index=False,
)

In [44]:
# Close the DuckDB connection.
# NOTE: a later cell in this notebook calls conn.execute() again; when running
# top to bottom, the connection has to be re-opened before that cell.
conn.close()

In [5]:
pip install vega_datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting vega_datasets
  Downloading vega_datasets-0.9.0-py3-none-any.whl.metadata (5.5 kB)
Downloading vega_datasets-0.9.0-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.8/210.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: vega_datasets
Successfully installed vega_datasets-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Objective: find which whiskies have been sold the day before today
# 1. Items where the availability has gone down
# 2. Codes that no longer exist, i.e. availability was = 1 and now they're no longer in the dataset after being sold

In [5]:
# Find which whiskies were sold between yesterday's import and today's:
#   1. items whose availability went down, and
#   2. items that were in yesterday's snapshot but are missing today (sold out).
# Case 2 used to be lost: the LEFT JOIN left today's columns NULL, and
# `NULL <> ...` is never true, so those rows were filtered out. A missing item is
# now counted as 0 available. Rows whose availability did not go down (e.g.
# restocks) are excluded so units_sold is always positive.
results = conn.execute("""
                       WITH todays_items AS (
                           SELECT code, availability
                           FROM whisky_stocks_table
                           WHERE import_date = CURRENT_DATE()
                       ),
                       yesterdays_items AS (
                           SELECT code, title, url, price_gbp, availability
                           FROM whisky_stocks_table
                           WHERE import_date = CURRENT_DATE() - 1
                       ),
                       compared AS (
                           SELECT
                               y.code,
                               y.title,
                               y.url,
                               y.price_gbp,
                               CAST(y.availability AS FLOAT) AS yesterday_availability,
                               -- No matching row today: the item left the catalogue, so 0 remain.
                               CASE
                                   WHEN t.code IS NULL THEN 0
                                   ELSE CAST(t.availability AS FLOAT)
                               END AS today_availability
                           FROM yesterdays_items y
                           LEFT OUTER JOIN todays_items t ON y.code = t.code
                       )
                       SELECT
                           CAST(CURRENT_DATE() AS DATE) AS import_date,
                           c.code,
                           c.title,
                           c.url,
                           c.price_gbp,
                           c.today_availability AS availability,
                           c.yesterday_availability - c.today_availability AS units_sold
                       FROM compared c
                       -- Guard: with no rows imported today, every item would look sold out.
                       WHERE EXISTS (SELECT 1 FROM todays_items)
                         AND c.yesterday_availability - c.today_availability > 0
                       ORDER BY c.price_gbp DESC
            """).fetchdf()

# fetchdf() already returns a pandas DataFrame, so no re-wrapping is needed.
df = results

# NOTE(review): the file name is fixed to 2024_04_17 while the query uses
# CURRENT_DATE(); re-running on another day overwrites this file with that
# day's data - confirm whether the name should follow the run date.
df.to_csv('/Users/MacUser/hedonism-wines_app/sales_2024_04_17.csv',index=False)
df

Unnamed: 0,import_date,code,title,url,price_gbp,availability,units_sold
0,2024-04-17,HED46498,A Trail of Smoke 42 Year Old House of Hazelwoo...,https://hedonism.co.uk/product/a-trail-of-smok...,1900.0,2.0,1.0
1,2024-04-17,HED21580,Macallan Oscuro,https://hedonism.co.uk/product/macallan-oscuro...,1450.0,1.0,1.0
2,2024-04-17,HED46453,Bowmore 22 Year Old Aston Martin Master`s Sele...,https://hedonism.co.uk/product/bowmore-22-year...,425.0,3.0,2.0
3,2024-04-17,HED46709,Macallan Rare Cask (2023 Release),https://hedonism.co.uk/product/macallan-rare-c...,310.0,12.0,6.0
4,2024-04-17,HED21064,Speyside 18 Year Old (M) Cask Strength Collect...,https://hedonism.co.uk/product/speyside-18-yea...,212.0,35.0,1.0
5,2024-04-17,HED3161,Yamazaki 12 Year Old,https://hedonism.co.uk/product/yamazaki-12-yea...,145.0,34.0,2.0
6,2024-04-17,HED21048,Caol Ila 16 Year Old Cask Strength Collection ...,https://hedonism.co.uk/product/caol-ila-16-yea...,145.0,4.0,2.0
7,2024-04-17,HED78927,Glen Scotia 18 Year Old,https://hedonism.co.uk/product/glen-scotia-18-...,128.0,3.0,2.0
8,2024-04-17,HED47122,Kilchoman Batch Strength,https://hedonism.co.uk/product/kilchoman-batch...,73.0,3.0,3.0
9,2024-04-17,HED21147,Port Charlotte Islay Barley 2014,https://hedonism.co.uk/product/port-charlotte-...,69.5,2.0,1.0


: 