In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)
# Bare expression so the notebook renders the session object.
spark

In [None]:
# Categories: read the Parquet dataset and print the schema Spark found in it.
categories = spark.read.parquet("../../data/sklep/categories")
categories.printSchema()

In [None]:
# Customers: read the Parquet dataset and print its schema.
customers = spark.read.parquet("../../data/sklep/customers")
customers.printSchema()

In [None]:
# Preview the first five customer rows.
# NOTE(review): customer rows may contain personal data — consider clearing
# this output before sharing the notebook.
customers.show(5)

In [None]:
# Departments: read the Parquet dataset, print its schema, preview five rows.
departments = spark.read.parquet("../../data/sklep/departments")
departments.printSchema()
departments.show(5)

In [None]:
# Order items: read the Parquet dataset and print its schema.
order_items = spark.read.parquet("../../data/sklep/order_items")
order_items.printSchema()
# Preview five rows as a pandas DataFrame. The limit is applied on the Spark
# side first so only those rows are shipped to the driver; calling toPandas()
# on the full table would collect every row into driver memory just to show five.
order_items.limit(5).toPandas()

In [None]:
# Orders: read the Parquet dataset and print its schema.
orders = spark.read.parquet("../../data/sklep/orders")
orders.printSchema()
# head(5) on a Spark DataFrame returns the first five rows as a list of Row
# objects (unlike show(), which prints a formatted table).
orders.head(5)

In [None]:
# Products: read the Parquet dataset and print its schema.
products = spark.read.parquet("../../data/sklep/products")
products.printSchema()
# First five rows as a list of Row objects.
products.head(5)

In [None]:
# Join types compared by the loop in a later cell; each string is passed as
# the `how` argument of DataFrame.join.
join_types = ["inner", "cross", "outer", "full", "left_outer", "right_outer", "left_semi", "left_anti"]

In [None]:
# Join each category to its department. No `how` is given, so the default
# join type (inner) applies.
categories.join(departments, categories.category_department_id == departments.department_id).show(5)

In [None]:
# The join condition is the same for every join type, so build it once.
join_condition = categories.category_department_id == departments.department_id

# Run the categories/departments join once per join type and print how many
# rows each variant produces.
for join_type in join_types:
    cats_on_deps = categories.join(departments, join_condition, join_type)
    count = cats_on_deps.count()
    print(join_type, count)

In [None]:
# Join every order item to its product row (default inner join) and preview
# five rows of the combined result.
order_items.join(products, order_items.order_item_product_id == products.product_id).show(5)

In [None]:
# Display the active SparkSession.
spark

In [None]:
# Register the customers DataFrame as a temp view named "customers" so it can
# be referenced from spark.sql() queries.
customers.createOrReplaceTempView("customers")

In [None]:
# Query the view through SQL; the result is again a Spark DataFrame.
df = spark.sql("SELECT * FROM customers")

In [None]:
# Preview the first five rows of the SQL result.
df.show(5)

In [None]:
# Preview five rows as a pandas DataFrame. Limit on the Spark side first so
# only those rows reach the driver; toPandas() on the full result would
# collect every row into driver memory just to display five.
df.limit(5).toPandas()

In [None]:
# Register the other tables as temp views so they can be queried with
# spark.sql(). Each DataFrame is paired with the view name at the same
# position in data_frames_names.
data_frames = [categories, departments, order_items, orders, products]
data_frames_names = ["categories", "departments", "order_items", "orders", "products"]
# Dedicated loop names are used on purpose: iterating with `df` would silently
# rebind the notebook-level `df` (the customers query result) to `products`.
for table, view_name in zip(data_frames, data_frames_names):
    table.createOrReplaceTempView(view_name)

In [None]:
# All columns of the categories view, first five rows.
spark.sql("select * from categories").show(5)

In [None]:
# Projection: only the id and name columns.
spark.sql("select category_id, category_name from categories").show(5)

In [None]:
# Filter to a single category id; DISTINCT drops duplicate (id, name) pairs.
spark.sql("select distinct category_id, category_name from categories where category_id = 1").show(5)

In [None]:
# Read the SQL template text from disk; later cells fill it in with
# str.format(category_id=...).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale-default encoding.
with open("../../src/sql/select_category.sql", "r", encoding="utf-8") as f:
    category_query_template = f.read()


In [None]:
# Show the raw template text exactly as read from the file.
category_query_template

In [None]:
# Fill the template's category_id placeholder to get the final SQL string.
category_query_template.format(category_id=1)

In [None]:
# Run the formatted query and preview up to five rows.
spark.sql(category_query_template.format(category_id=1)).show(5)

In [None]:
def get_query_template(query_name: str, sql_dir: str = "../../src/sql") -> str:
    """Return the raw text of a SQL template file.

    Args:
        query_name: File name of the template without the ``.sql`` extension.
        sql_dir: Directory that holds the ``.sql`` files. Defaults to the
            notebook-relative ``../../src/sql``.

    Returns:
        The file contents, unformatted (any ``{placeholders}`` are left as-is).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Explicit encoding: without it the text is decoded with the platform's
    # locale default, which can differ between machines.
    with open(f"{sql_dir}/{query_name}.sql", "r", encoding="utf-8") as f:
        return f.read()

def get_query(query_name: str, **kwargs) -> str:
    """Load the named SQL template and fill in its placeholders.

    Args:
        query_name: Template name, passed through to ``get_query_template``.
        **kwargs: Values substituted into the template via ``str.format``.

    Returns:
        The formatted SQL text.
    """
    template = get_query_template(query_name)
    return template.format(**kwargs)

In [None]:
# Same category lookup as before, now built through the get_query helper.
spark.sql(get_query("select_category", category_id=1)).show(5)

In [None]:
# Preview the first five orders.
orders.show(5)

In [None]:
# Distinct order statuses via the DataFrame API.
orders.select("order_status").distinct().show()

In [None]:
# The same result expressed as SQL against the "orders" temp view.
spark.sql("select distinct order_status from orders").show()

In [None]:
# Keep the result as a DataFrame (rebinds `df`) instead of printing it; the
# bare expression displays the DataFrame object, not its rows.
df = spark.sql("select distinct order_status from orders")
df

In [None]:
df.collect()

In [None]:
df.collect()[0]

In [None]:
df.collect()[0].order_status

In [None]:
df.collect()[0]["order_status"]

In [None]:
df.collect()[0][0]

In [None]:
df.collect()[0].asDict()

In [None]:
# Distinct statuses again, now sorted by the query so the output order is stable.
df = spark.sql("select distinct order_status from orders order by order_status")
df.show()

In [None]:
# Equivalent DataFrame-API version: select, de-duplicate, sort, display.
orders.select("order_status").distinct().orderBy("order_status").show()

In [None]:
# Ten categories with the most order items: order_items -> products ->
# categories, grouped by category name, sorted by the count descending.
# count(order_item_quantity) counts rows with a non-null quantity; it does not
# add the quantities up.
# NOTE(review): if the goal is units sold, sum(order_item_quantity) would be
# needed instead — confirm the intent.
# NOTE(review): the trailing `;` inside the string may be rejected by the
# spark.sql() parser on older Spark versions — confirm the target version.
query = """
select c.category_name, count(order_item_quantity) as count
from order_items oi
inner join products p on oi.order_item_product_id = p.product_id
inner join categories c on c.category_id = p.product_category_id
group by c.category_name
order by count desc
limit 10;
"""

In [None]:
# Run the inline query and display the result.
spark.sql(query).show()

In [None]:
# Load the "top_categories" SQL template via the helper (no format arguments)
# and display the query text.
# NOTE(review): presumably the same query as the inline version above — verify
# against the .sql file.
query = get_query("top_categories")
query

In [None]:
# Run the file-based query and display the result.
spark.sql(query).show()

In [None]:
# Revenue per product: sum of order item subtotals, joined to orders so that
# items from orders with status CANCELED or SUSPECTED_FRAUD are excluded.
# NOTE(review): the subtotal is cast to `float` before summing; if the column
# holds monetary values, single-precision rounding may show up in the totals —
# consider double/decimal after confirming the source column type.
query = """
select
    oi.order_item_product_id,
    sum(cast(oi.order_item_subtotal as float)) as revenue
from order_items oi
inner join orders o on oi.order_item_order_id = o.order_id
where o.order_status <> 'CANCELED' and o.order_status <> 'SUSPECTED_FRAUD'
group by order_item_product_id;
"""

In [None]:
# Run the revenue query and display the result (show() prints up to 20 rows by default).
spark.sql(query).show()

In [None]:
# Display the active SparkSession.
spark

In [None]:
# Write the categories DataFrame out as Parquet; mode("overwrite") replaces
# any existing output at that path.
categories.write.mode("overwrite").parquet("../../data/categories.parquet")

In [None]:
# Write the customers DataFrame out as CSV, replacing any existing output.
# NOTE(review): no `header` option is set, so column names are presumably not
# written to the files — confirm this is intended.
customers.write.mode("overwrite").csv("../../data/customers.csv")