In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import arrays_overlap, split, when, col

# Project-local connection helper (ConnectionConfig.py). setupEnvironment() is called
# once up front, before the Spark session is created in the next code cell; what it
# configures is defined in that module.
import ConnectionConfig as cc
cc.setupEnvironment()

## Start the cluster
See the `getActiveSession()` method in `ConnectionConfig.py`: it returns the active session, adds the Delta package to the session, and adds the extra JARs that are needed to connect to the SQL Server database.

In [2]:
# Create the Spark session through the project helper.
# NOTE(review): "DIM_WEER" is presumably the application name and 4 a parallelism
# setting (cores/partitions) — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_WEER",4)
# Last expression of the cell: echoes the active session as a quick sanity check.
spark.getActiveSession()

In [3]:
# Load the weather-type lookup CSV: the first row is treated as the header and the
# column types are inferred from the data.
# NOTE(review): this path is spelled "FileStore" while the JSON load uses
# "./Filestore" — confirm the casing on a case-sensitive filesystem.
weather_csv = spark.read.csv(
    "FileStore/tables/weather_types.csv",
    header=True,
    inferSchema=True,
)

# Expose the lookup to Spark SQL under the view name "weathercsv".
weather_csv.createOrReplaceTempView("weathercsv")

In [4]:
# Print the column names and inferred types of the CSV lookup.
weather_csv.printSchema()

root
 |-- weatherTypeID: integer (nullable = true)
 |-- weatherType: string (nullable = true)


In [5]:
# Read every JSON file in the weather folder; multi-line mode lets a single JSON
# document span several lines instead of requiring one record per line.
weather_json = spark.read.json(
    "./Filestore/weather/*.json",
    multiLine=True,
)

# Expose the raw observations to Spark SQL under the view name "weatherjson".
weather_json.createOrReplaceTempView("weatherjson")

In [6]:
# Print the nested schema Spark inferred from the JSON files.
weather_json.printSchema()

root
 |-- base: string (nullable = true)
 |-- clouds: struct (nullable = true)
 |    |-- all: long (nullable = true)
 |-- cod: long (nullable = true)
 |-- coord: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)
 |-- dt: long (nullable = true)
 |-- id: long (nullable = true)
 |-- main: struct (nullable = true)
 |    |-- feels_like: double (nullable = true)
 |    |-- grnd_level: long (nullable = true)
 |    |-- humidity: long (nullable = true)
 |    |-- pressure: long (nullable = true)
 |    |-- sea_level: long (nullable = true)
 |    |-- temp: double (nullable = true)
 |    |-- temp_max: double (nullable = true)
 |    |-- temp_min: double (nullable = true)
 |-- name: string (nullable = true)
 |-- rain: struct (nullable = true)
 |    |-- 1h: double (nullable = true)
 |-- snow: struct (nullable = true)
 |    |-- 1h: double (nullable = true)
 |-- sys: struct (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- id: l

In [7]:
# Project only the fields needed downstream: the weather condition label(s), the zipcode,
# the temperature pulled out of the nested `main` struct, the timestamp, the station name
# and the rain struct.
projection_query = """
select weather.main, zipcode, main.temp, dt, name, rain from weatherjson
"""
weather_json = spark.sql(projection_query)

# NOTE(review): this re-registers the *projected* frame under the same view name the
# query reads from. After this cell has run, the view no longer has `weather` or
# `main.temp`, so re-running this cell without first re-running the JSON load cell
# is expected to fail.
weather_json.createOrReplaceTempView("weatherjson")

### ridesFactFromSource = ridesFactFromSource.withColumn("feeling", when(col("feeling").isNull(), "weertype onbekend").otherwise(col("feeling")))


In [8]:
# Temperature at or above which dry weather is labelled "aangenaam".
# NOTE(review): 288.15 is presumably Kelvin (288.15 K = 15 °C) — confirm the unit of `temp`.
PLEASANT_TEMP_THRESHOLD = 288.15

# "Dry" means the observation carries no `rain` struct at all.
is_dry = col("rain").isNull()

# Classify every observation into one of three comfort labels:
#   dry and temp >= threshold -> "aangenaam"
#   dry and temp <  threshold -> "neutraal"
#   anything else             -> "onaangenaam"
# Rows with rain present, and dry rows whose temp is null (both comparisons evaluate
# to null), fall through to the "onaangenaam" branch.
weather_json = weather_json.withColumn(
    "weather_descriptor",
    when(is_dry & (col("temp") >= PLEASANT_TEMP_THRESHOLD), "aangenaam")
    .when(is_dry & (col("temp") < PLEASANT_TEMP_THRESHOLD), "neutraal")
    .otherwise("onaangenaam")
)

In [9]:
# Remove the helper columns so that only zipcode, dt and weather_descriptor remain.
columns_to_drop = ("main", "name", "temp", "rain")
weather_json = weather_json.drop(*columns_to_drop)

In [10]:
# Preview up to 36 rows of the finished dimension.
weather_json.show(n=36)

+-------+----------+------------------+
|zipcode|        dt|weather_descriptor|
+-------+----------+------------------+
|   2060|1580711134|       onaangenaam|
|   2060|1580936892|       onaangenaam|
|   2140|1727689398|       onaangenaam|
|   2140|1727689398|       onaangenaam|
|   2140|1727689398|       onaangenaam|
|   2170|1727689425|       onaangenaam|
|   2170|1727689425|       onaangenaam|
|   2170|1727689425|       onaangenaam|
|   2660|1595608826|         aangenaam|
|   2600|1587845201|       onaangenaam|
|   2018|1577124744|       onaangenaam|
|   2020|1577045792|       onaangenaam|
|   2100|1583248713|       onaangenaam|
|   2018|1577131973|       onaangenaam|
|   2050|1579113249|       onaangenaam|
|   2000|1577134598|       onaangenaam|
|   2050|1577790936|       onaangenaam|
|   2610|1590106342|       onaangenaam|
|   2610|1590490186|       onaangenaam|
|   2610|1590947279|         aangenaam|
|   2600|1584365486|         aangenaam|
|   2030|1577040467|          neutraal|


In [11]:
# Persist the weather dimension as a Delta table.
(
    weather_json
    .coalesce(1)            # collapse to a single partition before writing
    .write
    .format("delta")
    .mode("overwrite")      # replace whatever an earlier run stored at this path
    .save("./spark-warehouse/dim_weather")
)

In [12]:
# Shut down the Spark session; cells above cannot be re-run until the cluster is started again.
spark.stop()