In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Later cells read from and write to paths under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Environment setup for the Colab VM: refresh the apt package index, install a
# headless Java 8 JDK, download and unpack a Spark 3.4.4 distribution, and
# install the Python packages (findspark, pyspark, py4j) with pip.
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check https://dlcdn.apache.org/spark/ for the latest download link.
# The Spark version is hard-coded in both the URL and the archive name below,
# so the two lines must be updated together.
!wget -q https://dlcdn.apache.org/spark/spark-3.4.4/spark-3.4.4-bin-hadoop3.tgz
!tar xf spark-3.4.4-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
import os
import sys
# NOTE(review): the two environment variables below are left commented out, so
# findspark.init() is called without an explicit Spark location. Presumably it
# then resolves whatever installation it can discover on its own -- confirm
# that this is the intended one (downloaded tarball vs. pip-installed pyspark).
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.4.4-bin-hadoop3"
import findspark
findspark.init()
# The return value of find() is neither stored nor displayed here.
findspark.find()
import pyspark

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,628 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,845 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [4,266 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,150 

In [3]:
import pandas as pd
from pyspark.sql import functions as f

In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every later cell.
#
# The session time zone is pinned to UTC explicitly. Later cells parse the raw
# `timestamp` values, round them through unix_timestamp()/from_unixtime(), and
# render them with date_format() to build `utc_date`; all of those functions
# use the session time zone. Leaving it implicit would make the results depend
# on the default time zone of whatever machine runs the notebook.
spark = (
    SparkSession.builder
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)
spark

In [14]:
from pyspark.sql import types as T

# Explicit schema for the raw time-series records. Only `value` and
# `annotations` are declared nullable; `annotations` is kept as a raw string.
data_schema = (
    T.StructType()
    .add('contract_id', T.StringType(), False)
    .add('timestamp', T.TimestampType(), False)
    .add('value', T.DoubleType(), True)
    .add('value_source', T.StringType(), False)
    .add('annotations', T.StringType(), True)
)

# Read the JSON input from Drive using that schema.
df_raw_time_series = (
    spark.read
    .format('json')
    .schema(data_schema)
    .load('/content/drive/MyDrive/Colab Notebooks/project/raw_time_series/json')
)

# Rebuild the DataFrame from its RDD with the same schema; the schema printed
# below then reports `nullable = false` for the fields declared non-nullable.
df_raw_time_series = spark.createDataFrame(df_raw_time_series.rdd, schema=data_schema)

df_raw_time_series.printSchema()
df_raw_time_series.show()

root
 |-- contract_id: string (nullable = false)
 |-- timestamp: timestamp (nullable = false)
 |-- value: double (nullable = true)
 |-- value_source: string (nullable = false)
 |-- annotations: string (nullable = true)

+-------------------+-------------------+--------------------+------------+--------------------+
|        contract_id|          timestamp|               value|value_source|         annotations|
+-------------------+-------------------+--------------------+------------+--------------------+
| 04_02_111 _ CHR12 |2023-01-01 06:00:00| 0.02591860654732236| measurement|{"region":"Europe...|
| 04 _02_111 _CHR12 |2023-01-01 17:00:00| 0.07385444264936832| measurement|{"region":"Europe...|
| 04_02_111 _ CHR12 |2023-01-01 17:30:22| 0.08180149515221906| measurement|{"region":"Europe...|
| 04 _02_111 _CHR12 |2023-01-01 21:30:00| 0.08670661371854547| measurement|{"region":"Europe...|
|04 _ 02 _111_CHR12 |2023-01-02 00:30:00| 0.03597601881331959| measurement|{"region":"Europe...|
|04 

Data clean-up step. Run this cell only once per data load: if it is run more than once, all values will be turned to NULL.

In [15]:
# Clean-up of the raw frame:
#   * strip the stray spaces from `contract_id`;
#   * label rows without a `value` as 'missing' in `value_source`;
#   * round `timestamp` to the nearest 15 minutes and store it as a
#     'yyyy-MM-dd HH:mm' string.
QUARTER_HOUR_SECONDS = 15 * 60

# Cast to timestamp first so the expression also works when `timestamp` already
# holds the 'yyyy-MM-dd HH:mm' string produced by a previous run of this cell.
# Without the cast, unix_timestamp() parses a string column with its default
# pattern 'yyyy-MM-dd HH:mm:ss', which that string does not match, and the
# column turns into NULL on the second run.
epoch_seconds = f.unix_timestamp(f.col('timestamp').cast('timestamp'))
rounded_epoch_seconds = (
    f.round(epoch_seconds / QUARTER_HOUR_SECONDS) * QUARTER_HOUR_SECONDS
).cast('long')

df_raw_time_series = (
    df_raw_time_series
    .withColumn('contract_id', f.regexp_replace(f.col('contract_id'), ' ', ''))
    .withColumn('value_source', f.when(f.col('value').isNull(), 'missing').otherwise(f.col('value_source')))
    # from_unixtime() formats directly to the target pattern, so no separate
    # date_format() pass is needed.
    .withColumn('timestamp', f.from_unixtime(rounded_epoch_seconds, 'yyyy-MM-dd HH:mm'))
)
df_raw_time_series.show()

+---------------+----------------+--------------------+------------+--------------------+
|    contract_id|       timestamp|               value|value_source|         annotations|
+---------------+----------------+--------------------+------------+--------------------+
|04_02_111_CHR12|2023-01-01 06:00| 0.02591860654732236| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-01 17:00| 0.07385444264936832| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-01 17:30| 0.08180149515221906| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-01 21:30| 0.08670661371854547| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-02 00:30| 0.03597601881331959| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-02 05:30| 0.03638379308965683| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-03 10:45|            0.931575| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-03 18:30| 0.08816273670606922| measurement|{"region":"Europe...|
|04_02_111

Extracting a new `region` column from `annotations`

In [16]:
# Schema used to parse the `annotations` JSON string; only the `region` key is
# read at this stage.
json_schema = T.StructType().add('region', T.StringType(), False)

# Parse `annotations` and expose its `region` field as a top-level column.
df_raw_time_series = df_raw_time_series.withColumn(
    'region',
    f.from_json(f.col('annotations'), json_schema)['region'],
)
df_raw_time_series.show()

# List the distinct region values to see which ones occur in the data.
df_raw_time_series.select('region').distinct().show()

+---------------+----------------+--------------------+------------+--------------------+-------------+
|    contract_id|       timestamp|               value|value_source|         annotations|       region|
+---------------+----------------+--------------------+------------+--------------------+-------------+
|04_02_111_CHR12|2023-01-01 06:00| 0.02591860654732236| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-01 17:00| 0.07385444264936832| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-01 17:30| 0.08180149515221906| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-01 21:30| 0.08670661371854547| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-02 00:30| 0.03597601881331959| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-02 05:30| 0.03638379308965683| measurement|{"region":"Europe...|Europe/Berlin|
|04_02_111_CHR12|2023-01-03 10:45|            0.931575| measurem

Customers with invalid regions will be removed from the database and saved on disk in a separate location.
- Assumption: a valid region has the format `continent/city`.

In [17]:
# A region is treated as invalid when it is NULL, blank, or does not match the
# `continent/city` pattern (letters only on both sides of a single slash).
is_invalid_region = (
    f.col('region').isNull()
    | (f.trim(f.col('region')) == '')
    | (f.regexp_extract(f.col('region'), r'^[A-Za-z]+/[A-Za-z]+$', 0) == '')
)

# Rows with an invalid region are shown and saved to a separate location.
df_invalid_region = df_raw_time_series.filter(is_invalid_region)

df_invalid_region.show()
df_invalid_region.write.json('/content/drive/MyDrive/Colab Notebooks/project/invalid_regions', mode="overwrite")

# Remove those rows from the working frame as well. Previously they were only
# written to disk and stayed in `df_raw_time_series`, so values that are not
# time-zone IDs (e.g. 'WakaWaka') would still reach from_utc_timestamp() later.
# The predicate never evaluates to NULL (a NULL region makes the first operand
# true), so negating it keeps exactly the rows that were not saved above.
# NOTE: re-running this cell after the filter has been applied finds no invalid
# rows and overwrites the saved output with an empty result; re-run from the
# load cell instead.
df_raw_time_series = df_raw_time_series.filter(~is_invalid_region)

+---------------+----------------+--------------------+------------+--------------------+--------+
|    contract_id|       timestamp|               value|value_source|         annotations|  region|
+---------------+----------------+--------------------+------------+--------------------+--------+
|01_02_155_CHR98|2023-01-01 00:15| 0.24727942049503326| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 01:00|                NULL|     missing|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 01:30| 0.09512702375650406| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 02:00| 0.26903796195983887| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 03:45| 0.10639405995607376| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 04:30|  0.1394430249929428| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155_CHR98|2023-01-01 06:00| 0.05136726051568985| measurement|{"region":"WakaWa...|WakaWaka|
|01_02_155

Extract date from the `timestamp` column in a new `utc_date` column

In [18]:
# Add `utc_date`: the calendar-date part of `timestamp`, rendered as a
# 'yyyy-MM-dd' string.
df_raw_time_series = df_raw_time_series.withColumn(
    'utc_date',
    f.date_format(f.col('timestamp'), 'yyyy-MM-dd'),
)
df_raw_time_series.show()

+---------------+----------------+--------------------+------------+--------------------+-------------+----------+
|    contract_id|       timestamp|               value|value_source|         annotations|       region|  utc_date|
+---------------+----------------+--------------------+------------+--------------------+-------------+----------+
|04_02_111_CHR12|2023-01-01 06:00| 0.02591860654732236| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|
|04_02_111_CHR12|2023-01-01 17:00| 0.07385444264936832| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|
|04_02_111_CHR12|2023-01-01 17:30| 0.08180149515221906| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|
|04_02_111_CHR12|2023-01-01 21:30| 0.08670661371854547| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|
|04_02_111_CHR12|2023-01-02 00:30| 0.03597601881331959| measurement|{"region":"Europe...|Europe/Berlin|2023-01-02|
|04_02_111_CHR12|2023-01-02 05:30| 0.03638379308965683| measurement|{"region":"E

Convert the UTC `timestamp` to the local date and time of each row's region, in a new `local_timestamp` column

In [19]:
# Add `local_timestamp`: `timestamp` interpreted as UTC and shifted into the
# time zone named by each row's `region`.
df_raw_time_series = (
    df_raw_time_series
    .withColumn("local_timestamp", f.from_utc_timestamp(f.col("timestamp"), f.col("region")))
)

df_raw_time_series.show()

+---------------+----------------+--------------------+------------+--------------------+-------------+----------+-------------------+
|    contract_id|       timestamp|               value|value_source|         annotations|       region|  utc_date|    local_timestamp|
+---------------+----------------+--------------------+------------+--------------------+-------------+----------+-------------------+
|04_02_111_CHR12|2023-01-01 06:00| 0.02591860654732236| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 07:00:00|
|04_02_111_CHR12|2023-01-01 17:00| 0.07385444264936832| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 18:00:00|
|04_02_111_CHR12|2023-01-01 17:30| 0.08180149515221906| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 18:30:00|
|04_02_111_CHR12|2023-01-01 21:30| 0.08670661371854547| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 22:30:00|
|04_02_111_CHR12|2023-01-02 00:30| 0.03597601881331959|

Extract date from the `local_timestamp` column in a new `local_date` column

In [20]:
# Add `local_date`: the date part of `local_timestamp`.
df_raw_time_series = (
    df_raw_time_series
    .withColumn("local_date", f.to_date(f.col("local_timestamp")))
)

df_raw_time_series.show()


+---------------+----------------+--------------------+------------+--------------------+-------------+----------+-------------------+----------+
|    contract_id|       timestamp|               value|value_source|         annotations|       region|  utc_date|    local_timestamp|local_date|
+---------------+----------------+--------------------+------------+--------------------+-------------+----------+-------------------+----------+
|04_02_111_CHR12|2023-01-01 06:00| 0.02591860654732236| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 07:00:00|2023-01-01|
|04_02_111_CHR12|2023-01-01 17:00| 0.07385444264936832| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 18:00:00|2023-01-01|
|04_02_111_CHR12|2023-01-01 17:30| 0.08180149515221906| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01 18:30:00|2023-01-01|
|04_02_111_CHR12|2023-01-01 21:30| 0.08670661371854547| measurement|{"region":"Europe...|Europe/Berlin|2023-01-01|2023-01-01

Extract from the `annotations` column the energy sent to the electric vehicle (EV), to the battery (BATTERY_IN) and to the electric grid (GRID_SELL) into the `sent_to_ev`, `sent_to_battery` and `sent_to_grid` columns. If a value is missing, the consumption is 0.

Extract from the `annotations` column the energy received from the solar panels (PV) and from the battery (BATTERY_OUT) into the `received_from_pv` and `received_from_battery` columns. If a value is missing, the energy received is 0.

In [41]:
# Print up to 300 untruncated `annotations` values to inspect which keys occur
# besides `region`.
(
    df_raw_time_series
    .select("annotations")
    .show(n=300, truncate=False)
)

+--------------------------------------------------------------------------------------+
|annotations                                                                           |
+--------------------------------------------------------------------------------------+
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Berlin"}                                                            |
|{"region":"Europe/Be

In [None]:
# Schema for parsing `annotations`: a nested `events` object whose five keys
# are all read as strings.
json_schema = T.StructType([
    T.StructField("events", T.StructType([
        T.StructField(event_name, T.StringType())
        for event_name in ("EV", "PV", "BATTERY_IN", "BATTERY_OUT", "GRID_SELL")
    ]))
])

# Output column -> key inside `events`, listed in the order the columns are added.
event_columns = [
    ("sent_to_ev", "EV"),
    ("received_from_pv", "PV"),
    ("sent_to_battery", "BATTERY_IN"),
    ("received_from_battery", "BATTERY_OUT"),
    ("sent_to_grid", "GRID_SELL"),
]

# Parse the JSON string into a temporary struct column.
df_raw_time_series = df_raw_time_series.withColumn(
    "annotation_json",
    f.from_json(f.col("annotations"), json_schema),
)

# One double column per event; a missing or non-numeric value becomes 0.
for target_column, event_key in event_columns:
    df_raw_time_series = df_raw_time_series.withColumn(
        target_column,
        f.coalesce(
            f.col(f"annotation_json.events.{event_key}").cast(T.DoubleType()),
            f.lit(0),
        ),
    )

# The temporary struct column is no longer needed.
df_raw_time_series = df_raw_time_series.drop("annotation_json")

df_raw_time_series.show(300, truncate=False)


+---------------+----------------+--------------------+------------+-------------+----------+-------------------+----------+-----------------+-----------------+-----------------+----------------+---------------------+
|contract_id    |timestamp       |value               |value_source|region       |utc_date  |local_timestamp    |local_date|sent_to_ev       |sent_to_battery  |sent_to_grid     |received_from_pv|received_from_battery|
+---------------+----------------+--------------------+------------+-------------+----------+-------------------+----------+-----------------+-----------------+-----------------+----------------+---------------------+
|04_02_111_CHR12|2023-01-01 06:00|0.02591860654732236 |measurement |Europe/Berlin|2023-01-01|2023-01-01 07:00:00|2023-01-01|0.0              |0.0              |0.0              |0.0             |0.0                  |
|04_02_111_CHR12|2023-01-01 17:00|0.07385444264936832 |measurement |Europe/Berlin|2023-01-01|2023-01-01 18:00:00|2023-01-01|0.0 