In [None]:
# Install the notebook's Python dependencies into the kernel's environment.
# Only pyspark is pinned to an exact version (3.5.0); every other package
# resolves to whatever version pip selects at install time.
# NOTE(review): consider pinning the remaining packages as well so a fresh
# environment reproduces the same results.
%pip install pandas
%pip install pyspark==3.5.0
# -q keeps pip's output quiet for this one install.
%pip install -q findspark
%pip install py4j
%pip install pyarrow
%pip install numpy

In [1]:
# For local runs: point JAVA_HOME at a JDK before Spark is started.
# setdefault() keeps a JAVA_HOME that is already configured on this machine and
# only falls back to the hard-coded local JDK 11 path when the variable is unset,
# so the notebook no longer clobbers an existing Java setup.
# NOTE(review): the fallback is a Windows-specific install path; on other
# machines JAVA_HOME must be set in the environment beforehand.
import os
os.environ.setdefault('JAVA_HOME', "C:\\Program Files\\Java\\jdk-11")

import pandas as pd
from pyspark.sql import functions as f

In [2]:
from pyspark.sql import SparkSession
# Obtain a SparkSession: getOrCreate() hands back the session that is already
# active, or builds a new one (no extra builder options are set here).
spark = SparkSession.builder.getOrCreate()
# Bare expression as the last line so the notebook displays the session.
spark

In [3]:
from pyspark.sql import types as T

# Explicit schema for the raw time series records: (column, Spark type, nullable).
# Note: the printSchema() output further down reports every column as
# "nullable = true", so the False flags below are not enforced by this read.
data_schema = T.StructType([
    T.StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ('contract_id', T.StringType(), False),
        ('timestamp', T.TimestampType(), False),
        ('value', T.DoubleType(), True),
        ('value_source', T.StringType(), False),
        ('annotations', T.StringType(), True),
    ]
])

# Load the single JSON part file using the schema above instead of inferring one.
df_raw_time_series = spark.read.load(
    'project_data/project/raw_time_series/json/part-00000-3992c3d5-97c8-4e7d-8328-b7c8379135e8-c000.json',
    format='json',
    schema=data_schema,
)
# Preview the loaded rows in the cell output.
df_raw_time_series.show()

+--------------------+-------------------+-------------------+------------+--------------------+
|         contract_id|          timestamp|              value|value_source|         annotations|
+--------------------+-------------------+-------------------+------------+--------------------+
|  04 _02 _111 _CHR12|2023-01-01 03:15:00|0.01887007418980177| measurement|{"region":"Europe...|
|  04_02 _ 111 _CHR12|2023-01-01 16:15:00|           1.266225| measurement|{"region":"Europe...|
|  04_02_ 111 _CHR12 |2023-01-01 23:45:27|               NULL| measurement|{"region":"Europe...|
|  04 _02 _111 _CHR12|2023-01-02 03:30:00|0.02093419915470171| measurement|{"region":"Europe...|
|   04 _02_111_CHR12 |2023-01-02 09:45:00| 0.0142526590154003| measurement|{"region":"Europe...|
|  04_02 _ 111 _CHR12|2023-01-02 18:45:00|0.07874463371047623| measurement|{"region":"Europe...|
|   04 _02_111_CHR12 |2023-01-03 00:45:00| 0.1950477376343327| measurement|{"region":"Europe...|
|  04_02_ 111 _CHR12 |2023-01-

In [4]:
# Explicit schema for the customer tariff records: (column, Spark type, nullable).
# This rebinds `data_schema`, the same name the raw time series load used for
# its own schema, so from here on the name refers to the tariff layout.
data_schema = T.StructType([
    T.StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ('contract_id', T.StringType(), False),
        ('target_local_start_timestamp', T.TimestampType(), False),
        ('target_local_end_timestamp', T.TimestampType(), False),
        ('tariff_name', T.StringType(), True),
        ('charge_type', T.StringType(), False),
        ('price', T.DoubleType(), True),
    ]
])

# Load the single JSON part file using the schema above instead of inferring one.
df_customer_tariff = spark.read.load(
    'project_data/project/customer_tariff/json/part-00000-44d37ba9-fccd-40f4-b939-d8b47af99c95-c000.json',
    format='json',
    schema=data_schema,
)
# Preview the loaded rows in the cell output.
df_customer_tariff.show()

+---------------+----------------------------+--------------------------+-----------------+-----------+------+
|    contract_id|target_local_start_timestamp|target_local_end_timestamp|      tariff_name|charge_type| price|
+---------------+----------------------------+--------------------------+-----------------+-----------+------+
|04_02_111_CHR39|         2022-12-01 02:00:00|       2023-02-16 02:00:00|Electric Constant|        buy|0.1166|
|04_02_111_CHR39|         2023-02-16 02:00:00|       2024-05-19 03:00:00|Electric Constant|        buy|0.1797|
|04_02_111_CHR39|         2024-05-19 03:00:00|       2024-08-31 03:00:00|     Eco Electric|        buy|0.1911|
|04_02_111_CHR39|         2024-08-31 03:00:00|       2024-10-09 03:00:00|Electric Constant|        buy|0.1938|
|04_02_111_CHR39|         2024-10-09 03:00:00|       2024-11-09 02:00:00|     Eco Electric|        buy| 0.201|
|04_02_111_CHR39|         2024-11-09 02:00:00|       2024-12-11 02:00:00| Digital Electric|        buy|0.2447|
|

In [5]:
# Print the schema Spark attached to each loaded frame, time series first and
# tariffs second, to compare against the schemas declared at load time.
for loaded_frame in (df_raw_time_series, df_customer_tariff):
    loaded_frame.printSchema()

root
 |-- contract_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- value: double (nullable = true)
 |-- value_source: string (nullable = true)
 |-- annotations: string (nullable = true)

root
 |-- contract_id: string (nullable = true)
 |-- target_local_start_timestamp: timestamp (nullable = true)
 |-- target_local_end_timestamp: timestamp (nullable = true)
 |-- tariff_name: string (nullable = true)
 |-- charge_type: string (nullable = true)
 |-- price: double (nullable = true)



Data clean-up process - if this cell is run more than once, all values will be turned to NULL

In [6]:
# Clean-up of the loaded time series:
#   * contract_id  - every space character is removed
#   * value_source - set to 'missing' on rows whose value is NULL
#   * timestamp    - snapped to the nearest 15-minute mark and stored as a
#                    'yyyy-MM-dd HH:mm' string
# The cell overwrites df_raw_time_series, so it has to tolerate being re-run on
# its own output.
QUARTER_HOUR_SECONDS = 15 * 60
TIMESTAMP_FORMAT = 'yyyy-MM-dd HH:mm'

# On the first run `timestamp` is a real timestamp column and the pattern is not
# needed to read it. On a re-run the column is already the minute-precision
# string written below; parsing it with the default 'yyyy-MM-dd HH:mm:ss'
# pattern fails and produces NULL, so the matching pattern is passed explicitly.
epoch_seconds = f.unix_timestamp('timestamp', TIMESTAMP_FORMAT)
# Divide into 15-minute units, round to the nearest whole unit, scale back.
rounded_epoch_seconds = f.round(epoch_seconds / QUARTER_HOUR_SECONDS) * QUARTER_HOUR_SECONDS

df_raw_time_series = (
    df_raw_time_series
    .withColumn('contract_id', f.regexp_replace(f.col('contract_id'), ' ', ''))
    .withColumn('value_source', f.when(f.col('value').isNull(), 'missing').otherwise(f.col('value_source')))
    # from_unixtime renders straight into the target pattern, replacing the
    # previous from_unixtime + date_format pair with a single step.
    .withColumn('timestamp', f.from_unixtime(rounded_epoch_seconds, TIMESTAMP_FORMAT))
)
# Preview the cleaned rows in the cell output.
df_raw_time_series.show()

+---------------+----------------+-------------------+------------+--------------------+
|    contract_id|       timestamp|              value|value_source|         annotations|
+---------------+----------------+-------------------+------------+--------------------+
|04_02_111_CHR12|2023-01-01 03:15|0.01887007418980177| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-01 16:15|           1.266225| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-01 23:45|               NULL|     missing|{"region":"Europe...|
|04_02_111_CHR12|2023-01-02 03:30|0.02093419915470171| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-02 09:45| 0.0142526590154003| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-02 18:45|0.07874463371047623| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-03 00:45| 0.1950477376343327| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023-01-03 01:00| 0.1494432740331963| measurement|{"region":"Europe...|
|04_02_111_CHR12|2023