In [None]:
%%pyspark

spark.sql("create database iowa")

In [37]:
%%pyspark

df = spark.read.load('abfss://iowabyyear@eightfive.dfs.core.windows.net/IowaLiquorSalesdf2022.parquet', format='parquet')
df.write.mode("overwrite").saveAsTable("iowa.Iowa2022")
iowa2022 = sqlContext.table("iowa.iowa2022")

StatementMeta(efsynapsespark, 24, 37, Finished, Available)

In [38]:
%%pyspark
from pyspark.sql.types import StringType, DecimalType, IntegerType

iowaPopulation = spark.read.load('abfss://iowafiles@eightfive.dfs.core.windows.net/IowaPopulation.csv', format='csv', header=True)


StatementMeta(efsynapsespark, 24, 38, Finished, Available)

In [43]:
from pyspark.sql.functions import (
    regexp_replace, col, substring, length, split, lit, when,
)
from pyspark.sql.types import IntegerType

# County label: drop the first character of the raw value, then keep only the
# text that precedes " County".
county_without_first_char = col("County").substr(lit(2), length(col("County")))
county_name = split(county_without_first_char, " County").getItem(0)

# Population: remove the comma separators and cast the result to an integer.
population_as_int = regexp_replace(col("Population"), ",", "").astype(IntegerType())

# Size band: "Over" when the population exceeds 15000, otherwise "Below".
# This is applied after "Population" has been replaced by its integer form.
population_band = when(col("Population") > 15000, "Over").otherwise("Below")

iowaPopulationRefined = (
    iowaPopulation
        .withColumn("County", county_name)
        .withColumn("Population", population_as_int)
        .withColumn("PopulationVolume", population_band)
)

# Persist the cleaned data as the table iowa.iowapopulation, replacing any previous copy.
iowaPopulationRefined.write.mode("overwrite").saveAsTable("iowa.iowapopulation")

StatementMeta(efsynapsespark, 24, 43, Finished, Available)

In [None]:
# Preview five rows of the sales table, then print its column names and types.
iowa2022_preview = iowa2022.limit(5)
display(iowa2022_preview)
iowa2022.printSchema()

In [None]:
# Preview five rows of the cleaned population data, then print its column names and types.
iowaPopulation_preview = iowaPopulationRefined.limit(5)
display(iowaPopulation_preview)
iowaPopulationRefined.printSchema()

In [None]:
#Data source: Data Frame

# NOTE: this import rebinds the Python builtins `sum` and `round` to the Spark
# column functions of the same name for the rest of the session.
from pyspark.sql.functions import lower, sum, avg, col, initcap, round, date_format, trunc, first, row_number
from pyspark.sql.window import Window

# Both windows are applied after the monthly aggregation below, so "Date" here
# is the month-truncated date.
rowNumberWindow = Window.partitionBy(col("Date")).orderBy(col("County"))
sumWindow1 = Window.partitionBy(col("Date"))

# Monthly sales per county for 2022 onwards, joined to the population data,
# restricted to counties starting with "M" whose monthly total lies in [10, 1000000].
iowa2022agg = (
    iowa2022
        .withColumn("County", initcap("County"))
        .withColumn("Date", trunc(col("Date"), "month"))
        .filter( (col("Date").isNotNull()) & (col("Date") >= "2022-01-01") )
        .join(iowaPopulationRefined, on="County", how="leftouter")
        .groupBy(col("Date"), col("County"))
        .agg(
            round(sum("SaleDollars")).alias("Sum of Sales"),
            round(avg("SaleDollars"), 2).alias("Avg of Sales"),
            first(iowaPopulationRefined["Population"]).alias("Population"),
            )
        .filter( (col("Sum of Sales").between(10, 1000000)) & (col("County").like("M%")) )
        # Position of the county within its month, ordered by county name.
        .withColumn("Rank", row_number().over(rowNumberWindow))
        # Total of the (rounded) monthly sums across the remaining counties of that month.
        .withColumn("Sum per Month", sum(col("Sum of Sales")).over(sumWindow1))
        .orderBy(col("Date").desc(), col("County").asc())
)

# DataFrame.show() returns None, so it is called separately; chaining it into
# the assignment above would leave iowa2022agg bound to None.
iowa2022agg.show(20)

In [None]:
%%sql
--Data source: Spark tables iowa.iowa2022 and iowa.iowapopulation
--SQL counterpart of the DataFrame aggregation: monthly sales per county for
--2022 onwards, limited to counties starting with 'M'.
SELECT
   TRUNC(i22.Date, 'MM') AS `Date`
  ,INITCAP(i22.County) AS `County`
  ,ROUND(SUM(i22.SaleDollars)) AS `Sum of Sales`
  ,ROUND(AVG(i22.SaleDollars), 2) AS `Avg of Sales`
  ,FIRST(ip.Population) AS `Population`
  ,ROW_NUMBER() OVER(PARTITION BY TRUNC(i22.Date, 'MM') ORDER BY INITCAP(i22.County)) AS `Rank`
  --Sum the rounded monthly totals (same value as `Sum of Sales`), matching the
  --DataFrame version, rather than rounding every sale before summing.
  ,SUM(ROUND(SUM(i22.SaleDollars))) OVER(PARTITION BY TRUNC(i22.Date, 'MM')) AS `Sum per Month`
FROM iowa.iowa2022 i22
LEFT JOIN iowa.iowapopulation ip
    ON INITCAP(i22.County) = ip.County
--Filter on the source column explicitly; the unqualified name collides with the `Date` alias.
WHERE i22.Date IS NOT NULL AND i22.Date >= '2022-01-01'
GROUP BY TRUNC(i22.Date, 'MM'), INITCAP(i22.County)
HAVING `Sum of Sales` BETWEEN 10 AND 1000000 AND `County` LIKE 'M%'
ORDER BY `Date` DESC, `County` ASC


In [None]:
# Show a single City value from the sales data using the `first` aggregate.
first_city = first(col("City"))
iowa2022.select(first_city).show()

In [None]:
# Fetch two rows, print the type of the returned container, then print each row.
sample_rows = iowaPopulationRefined.take(2)
print(type(sample_rows))

for row in sample_rows:
    print(row)