In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import os
import sys
from pyspark.sql.functions import *
from calendar import month_name
from pyspark.ml.feature import *
from pyspark.sql.types import *
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
import re

In [2]:
spark = SparkSession.builder \
    .appName('app_name') \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.pyspark.enabled', True) \
    .config('spark.sql.session.timeZone', 'UTC') \
    .config('spark.driver.memory','32G') \
    .config('spark.ui.showConsoleProgress', True) \
    .config('spark.sql.repl.eagerEval.enabled', True) \
    .getOrCreate()

+-----+
|hello|
+-----+
|spark|
+-----+



In [3]:
liquor_data_df = spark.read.csv("DE_CaseStudy_Dataset/Sales_Data/Liquor_Sales.csv", header=True, inferSchema=True)

liquor_data_df.printSchema()

root
 |-- Invoice/Item Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Store Number: integer (nullable = true)
 |-- Store Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: string (nullable = true)
 |-- Store Location: string (nullable = true)
 |-- County Number: integer (nullable = true)
 |-- County: string (nullable = true)
 |-- Category: integer (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Vendor Number: integer (nullable = true)
 |-- Vendor Name: string (nullable = true)
 |-- Item Number: string (nullable = true)
 |-- Item Description: string (nullable = true)
 |-- Pack: integer (nullable = true)
 |-- Bottle Volume (ml): integer (nullable = true)
 |-- State Bottle Cost: double (nullable = true)
 |-- State Bottle Retail: double (nullable = true)
 |-- Bottles Sold: integer (nullable = true)
 |-- Sale (Dollars): double (nullable = true)
 |-- Volume Sold (Liters): doub

In [4]:
liquor_data_df.show()

+-------------------+----------+------------+--------------------+--------------------+---------------+--------+--------------------+-------------+-------------+--------+--------------------+-------------+--------------------+-----------+--------------------+----+------------------+-----------------+-------------------+------------+--------------+--------------------+---------------------+
|Invoice/Item Number|      Date|Store Number|          Store Name|             Address|           City|Zip Code|      Store Location|County Number|       County|Category|       Category Name|Vendor Number|         Vendor Name|Item Number|    Item Description|Pack|Bottle Volume (ml)|State Bottle Cost|State Bottle Retail|Bottles Sold|Sale (Dollars)|Volume Sold (Liters)|Volume Sold (Gallons)|
+-------------------+----------+------------+--------------------+--------------------+---------------+--------+--------------------+-------------+-------------+--------+--------------------+-------------+---------

In [5]:
liquor_mapped_df = liquor_data_df.select(
    col("Invoice/Item Number").alias("invoice_item_number"),
    col("Date").alias("date"),
    col("Store Number").alias("store_number"),
    col("Store Name").alias("store_name"),
    col("Address").alias("address"),
    col("City").alias("city"),
    col("Zip Code").alias("zip_code"),
    col("Store Location").alias("store_location"),
    col("County Number").alias("county_number"),
    col("County").alias("county"),
    col("Category").alias("category"),
    col("Category Name").alias("category_name"),
    col("Vendor Number").alias("vendor_number"),
    col("Vendor Name").alias("vendor_name"),
    col("Item Number").alias("item_number"),
    col("Item Description").alias("item_description"),
    col("Pack").alias("pack"),
    col("Bottle Volume (ml)").alias("bottle_volume_ml"),
    col("State Bottle Cost").alias("state_bottle_cost"),
    col("State Bottle Retail").alias("state_bottle_retail"),
    col("Bottles Sold").alias("bottles_sold"),
    col("Sale (Dollars)").alias("sale_dollars"),
    col("Volume Sold (Liters)").alias("volume_sold_liters"),
    col("Volume Sold (Gallons)").alias("volume_sold_gallons")
)
liquor_mapped_df.show()

+-------------------+----------+------------+--------------------+--------------------+---------------+--------+--------------------+-------------+-------------+--------+--------------------+-------------+--------------------+-----------+--------------------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+
|invoice_item_number|      date|store_number|          store_name|             address|           city|zip_code|      store_location|county_number|       county|category|       category_name|vendor_number|         vendor_name|item_number|    item_description|pack|bottle_volume_ml|state_bottle_cost|state_bottle_retail|bottles_sold|sale_dollars|volume_sold_liters|volume_sold_gallons|
+-------------------+----------+------------+--------------------+--------------------+---------------+--------+--------------------+-------------+-------------+--------+--------------------+-------------+--------------------+----

In [6]:
liquor_data_clean_df = liquor_mapped_df.na.drop("any")

In [7]:
# Check for nulls in each row
columns_with_nulls = [col(column).isNull().alias(column) for column in liquor_data_clean_df.columns]

print(columns_with_nulls)

[Column<'(invoice_item_number IS NULL) AS invoice_item_number'>, Column<'(date IS NULL) AS date'>, Column<'(store_number IS NULL) AS store_number'>, Column<'(store_name IS NULL) AS store_name'>, Column<'(address IS NULL) AS address'>, Column<'(city IS NULL) AS city'>, Column<'(zip_code IS NULL) AS zip_code'>, Column<'(store_location IS NULL) AS store_location'>, Column<'(county_number IS NULL) AS county_number'>, Column<'(county IS NULL) AS county'>, Column<'(category IS NULL) AS category'>, Column<'(category_name IS NULL) AS category_name'>, Column<'(vendor_number IS NULL) AS vendor_number'>, Column<'(vendor_name IS NULL) AS vendor_name'>, Column<'(item_number IS NULL) AS item_number'>, Column<'(item_description IS NULL) AS item_description'>, Column<'(pack IS NULL) AS pack'>, Column<'(bottle_volume_ml IS NULL) AS bottle_volume_ml'>, Column<'(state_bottle_cost IS NULL) AS state_bottle_cost'>, Column<'(state_bottle_retail IS NULL) AS state_bottle_retail'>, Column<'(bottles_sold IS NULL

In [8]:
df_with_nulls = liquor_data_clean_df.withColumn("contains_null", lit(0))
for column_with_nulls in columns_with_nulls:
    df_with_nulls = df_with_nulls.withColumn("contains_null", col("contains_null").cast("boolean") | column_with_nulls)

rows_with_nulls = df_with_nulls.filter(
    (col("contains_null") == True)
)

rows_with_nulls.show()

sales_df_cleaned = df_with_nulls.filter(
    (col("contains_null") == False) 
)

liquor_data_clean_df = liquor_data_clean_df.drop("contains_null")

liquor_data_clean_df.show()

+-------------------+----+------------+----------+-------+----+--------+--------------+-------------+------+--------+-------------+-------------+-----------+-----------+----------------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+-------------+
|invoice_item_number|date|store_number|store_name|address|city|zip_code|store_location|county_number|county|category|category_name|vendor_number|vendor_name|item_number|item_description|pack|bottle_volume_ml|state_bottle_cost|state_bottle_retail|bottles_sold|sale_dollars|volume_sold_liters|volume_sold_gallons|contains_null|
+-------------------+----+------------+----------+-------+----+--------+--------------+-------------+------+--------+-------------+-------------+-----------+-----------+----------------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+-------------+
+-------------------+-

In [9]:
# liquor_data_df.describe().show()

In [10]:
liquor_data_clean_df = liquor_data_clean_df.withColumn("date", to_timestamp("date", "MM/dd/yyyy"))
result = liquor_data_clean_df.select("date")

In [11]:
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_year", year("date"))
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_month", month("date"))
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_day", dayofmonth("date"))
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_hour", hour("date"))
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_minute", minute("date"))
liquor_data_clean_df = liquor_data_clean_df.withColumn("order_second", second("date"))

In [12]:
liquor_data_clean_df.show()

+-------------------+-------------------+------------+--------------------+--------------------+---------------+--------+--------------------+-------------+-------------+--------+--------------------+-------------+--------------------+-----------+--------------------+----+----------------+-----------------+-------------------+------------+------------+------------------+-------------------+----------+-----------+---------+----------+------------+------------+
|invoice_item_number|               date|store_number|          store_name|             address|           city|zip_code|      store_location|county_number|       county|category|       category_name|vendor_number|         vendor_name|item_number|    item_description|pack|bottle_volume_ml|state_bottle_cost|state_bottle_retail|bottles_sold|sale_dollars|volume_sold_liters|volume_sold_gallons|order_year|order_month|order_day|order_hour|order_minute|order_second|
+-------------------+-------------------+------------+------------------

In [13]:
distinct_values_count = liquor_data_clean_df.groupBy('order_year').count()

distinct_values_count.show()

+----------+-------+
|order_year|  count|
+----------+-------+
|      2015|1984108|
|      2013|1872144|
|      2014|1909019|
|      2012|1884232|
|      2016|1962632|
|      2017|2028323|
|      2018|2129085|
|      2019|2158261|
|      2020|1760082|
+----------+-------+



In [13]:
liquor_distinct_names = liquor_data_clean_df.select("category_name").distinct()

liquor_distinct_names.show(truncate=False)

+----------------------------+
|category_name               |
+----------------------------+
|IMPORTED SCHNAPPS           |
|PEACH BRANDIES              |
|AMERICAN ALCOHOL            |
|IMPORTED VODKA - MISC       |
|VODKA 80 PROOF              |
|RASPBERRY SCHNAPPS          |
|American Sloe Gins          |
|BUTTERSCOTCH SCHNAPPS       |
|Straight Rye Whiskies       |
|PEPPERMINT SCHNAPPS         |
|American Cordials & Liqueurs|
|CANADIAN WHISKIES           |
|AMERICAN COCKTAILS          |
|MISCELLANEOUS SCHNAPPS      |
|APRICOT BRANDIES            |
|CREME DE ALMOND             |
|CINNAMON SCHNAPPS           |
|Spiced Rum                  |
|AMERICAN AMARETTO           |
|TEQUILA                     |
+----------------------------+
only showing top 20 rows



In [14]:
null_or_empty_rows = liquor_distinct_names.filter((col('category_name').isNull()) | (col('category_name') == ''))

null_or_empty_rows.show()

+-------------+
|category_name|
+-------------+
+-------------+



In [15]:
# def capitalize_first_letter(s):
#     print("Input value:", s)
#     result = s.title()
#     print("Output value:", result)
#     return result

# capitalize_udf = udf(capitalize_first_letter, StringType())

# liquor_distinct_names_cleaned = liquor_distinct_names.withColumn("category_name_cleaned", capitalize_udf(col("category_name")))

# liquor_distinct_names_cleaned.show()

In [16]:
# from nltk.stem import WordNetLemmatizer
# def lemmatize_words(category_name):
#     lemmatizer = WordNetLemmatizer()
#     return ' '.join([lemmatizer.lemmatize(word) for word in category_name.split()])

In [17]:
# category_names = ["cars vehicles automobile", "dogs puppies", "cats feline"]
# lemmatized_words = [lemmatize_words(category_name) for category_name in category_names]

# print(lemmatized_words)

In [18]:
# lemmatize_udf = udf(lemmatize_words)
# df_tokens_1 = liquor_distinct_names.withColumn("category_name_2", lemmatize_udf(col("category_name")))

# df_tokens_1.printSchema()

In [19]:
# df_tokens_1.select('category_name').show()

In [20]:
tokenizer = Tokenizer(inputCol="category_name", outputCol="tokens")
df_tokens_1 = tokenizer.transform(liquor_distinct_names)
df_tokens = df_tokens_1
# df_tokens = df_tokens_1.sample(fraction=0.00001, seed=42)


In [21]:
# sampled_row_count = df_tokens.count()
# print("Sampled DataFrame Row Count:", sampled_row_count)

In [22]:
df_tokens_1.printSchema()

root
 |-- category_name: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [23]:
df_tokens = df_tokens_1.cache()
word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="tokens", outputCol="features")
word2vec_model = word2vec.fit(df_tokens)
df_embeddings = word2vec_model.transform(df_tokens)

In [24]:
kmeans = KMeans(k=6, seed=1, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(df_embeddings)
df_clustered = model.transform(df_embeddings)

In [25]:
cluster_centroids = model.clusterCenters()
print("Cluster Centroids:")
for i, centroid in enumerate(cluster_centroids):
    print(f"Cluster {i}: {centroid}")

df_result = df_clustered.select("category_name", "cluster")
df_result.show(truncate= False)

Cluster Centroids:
Cluster 0: [ 1.27737941e-03  1.36509305e-03 -2.39395295e-04 -1.49798466e-03
 -1.98512327e-03  2.41788933e-03 -2.16438714e-04 -2.28344482e-03
  1.16190709e-03 -1.71618906e-03 -1.89368509e-03 -1.19050370e-04
  1.32603436e-03  2.20331801e-03 -5.22802446e-04  2.07670848e-03
 -1.06088116e-03 -1.35432919e-03  1.65550174e-03 -6.92083285e-04
 -1.96945799e-03  3.93802545e-04 -2.43830397e-03 -5.28264144e-04
  1.74664303e-03  2.46748659e-03  1.37852320e-03 -1.83908430e-03
 -1.98960747e-03 -2.36063063e-03  1.51478232e-04 -5.06523561e-05
 -2.08803214e-03  7.60605382e-04 -2.50752094e-04 -1.46276249e-03
 -2.08677266e-03 -2.22870524e-04 -1.58460482e-03  8.45705319e-04
  1.31375888e-03 -3.77004660e-04  3.95285776e-04 -9.71689790e-04
 -1.69601097e-03 -7.95215633e-04  2.78400809e-04 -2.50216606e-03
 -2.67512102e-03 -8.66733823e-04  4.01596225e-04 -1.68599541e-03
 -3.04561203e-04  2.60273102e-03  7.48104981e-04  4.63035302e-04
 -2.57100878e-03  1.88179359e-03  2.45063504e-03  9.28892538

In [26]:
df_clustered.printSchema()

root
 |-- category_name: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- cluster: integer (nullable = false)



In [27]:
df_clustered_cleaned = df_clustered.drop("tokens", "features")

In [28]:
df_clustered_cleaned.printSchema()

root
 |-- category_name: string (nullable = true)
 |-- cluster: integer (nullable = false)



In [29]:
liquor_data_clean_df.printSchema()

root
 |-- invoice_item_number: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- store_number: integer (nullable = true)
 |-- store_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- store_location: string (nullable = true)
 |-- county_number: integer (nullable = true)
 |-- county: string (nullable = true)
 |-- category: integer (nullable = true)
 |-- category_name: string (nullable = true)
 |-- vendor_number: integer (nullable = true)
 |-- vendor_name: string (nullable = true)
 |-- item_number: string (nullable = true)
 |-- item_description: string (nullable = true)
 |-- pack: integer (nullable = true)
 |-- bottle_volume_ml: integer (nullable = true)
 |-- state_bottle_cost: double (nullable = true)
 |-- state_bottle_retail: double (nullable = true)
 |-- bottles_sold: integer (nullable = true)
 |-- sale_dollars: double (nullable = true)
 |-- volume_sold_liters: double 

In [30]:
liquor_data_clean_type = liquor_data_clean_df \
    .withColumn("item_number", col("item_number").cast(IntegerType())) \
    .withColumn("vendor_number", col("vendor_number").cast(IntegerType())) 

In [31]:
jdbc_url = "jdbc:postgresql://localhost:5432/postgres"
table_name = "ecomm.liquor_category"
table_name2 = "ecomm.liquor_sales"
properties = {
    "user": "postgres",
    "password": "postgres",
    "driver": "org.postgresql.Driver"
}

# Write to PostgreSQL
liquor_data_clean_type.write.jdbc(url=jdbc_url, table=table_name2, mode="overwrite", properties=properties)
df_clustered_cleaned.write.jdbc(url=jdbc_url, table=table_name, mode="overwrite", properties=properties)