# Header splitters

In [17]:
import re
def head_split(header):
    """Split a header line into column names and their start offsets.

    Every whitespace-delimited token of ``header`` is treated as a column
    name, except a bare ``#`` token, which is skipped.

    Args:
        header: The header line to split.

    Returns:
        A ``(positions, items)`` tuple of two lists of equal length.
        ``items[i]`` is the i-th column name and ``positions[i]`` the offset
        in ``header`` at which it starts; the first offset is always 0.
        Both lists are empty when the header holds no column name.
    """
    columns = [
        (match.start(), match.group())
        for match in re.finditer(r"\S+", header)
        if match.group() != "#"
    ]
    positions = [start for start, _ in columns]
    items = [name for _, name in columns]

    # Correct for the initial "#": the first column is made to start at the
    # beginning of the line. Guarded so that a header without any column
    # name yields two empty lists instead of raising IndexError.
    if positions:
        positions[0] = 0

    return positions, items

In [18]:
from enum import Enum
class ColumnTypes(Enum):
    """Target types that cast_string_value_to_type can convert a raw field to."""

    # converted with datetime.strptime and the format "%Y-%m-%d %H:%M:%S"
    TimeStampType = 0
    # converted with str()
    String = 1
    # converted with int()
    Integer = 2
    # converted with float()
    Float = 3

# Row splitters

In [19]:
from datetime import datetime as dt
from numba import jit

def cast_string_value_to_type(var_value, column_type):
    """Convert the raw text of one field to the Python type of its column.

    Args:
        var_value: The field text.
        column_type: The ``ColumnTypes`` member describing the target type.

    Returns:
        The converted value, or ``None`` when ``var_value`` is the empty
        string or ``column_type`` is none of the known members.
    """
    # An empty field has no value, whatever the requested type.
    if var_value == "":
        return None

    if column_type == ColumnTypes.TimeStampType:
        return dt.strptime(var_value, "%Y-%m-%d %H:%M:%S")
    elif column_type == ColumnTypes.Float:
        return float(var_value)
    elif column_type == ColumnTypes.String:
        return str(var_value)
    elif column_type == ColumnTypes.Integer:
        return int(var_value)

    # Unknown column type: nothing to convert to.
    return None


def row_split(r, positions, items, column_names, column_types):
    """Extract and convert the requested columns from one fixed-width row.

    Args:
        r: The text of the row.
        positions: Start offset of every column, as returned by head_split.
        items: Name of every column, parallel to ``positions``.
        column_names: Names of the columns to extract.
        column_types: Mapping from column name to its ``ColumnTypes`` member.

    Returns:
        A list with the converted values of the requested columns, in the
        order in which those columns occur in ``items``.
    """
    wanted = len(column_names)
    values = []

    # Every column but the last ends where the next column starts.
    for idx, (start, stop) in enumerate(zip(positions, positions[1:])):
        if len(values) == wanted:
            break

        name = items[idx]
        if name in column_names:
            field = r[start:stop].strip()
            values.append(cast_string_value_to_type(field, column_types[name]))

    # The last column runs to the end of the row.
    last_name = items[-1]
    if last_name in column_names:
        field = r[positions[-1] :].strip()
        values.append(cast_string_value_to_type(field, column_types[last_name]))

    return values



# File parser

In [20]:
class FileParser:
    """Turn a fixed-width text file with a marked header into an RDD of rows.

    The file is loaded with ``spark_context.textFile``. Rows that start with
    ``header_symbol`` form the header; the last of them names the columns and
    fixes the offset at which each column starts.
    """

    def __init__(
        self,
        file_path,
        spark_context,
        header_symbol,
        header_estimated_length,
        column_names,
        column_types,
    ):
        """Load ``file_path`` and sample its first rows.

        Args:
            file_path: Path handed to ``spark_context.textFile``.
            spark_context: Object providing ``textFile``.
            header_symbol: Prefix that marks a header row, e.g. ``"#"``.
            header_estimated_length: Number of leading rows to sample; it
                must exceed the number of header rows (checked by parse).
            column_names: Names of the columns to extract.
            column_types: Mapping from column name to ``ColumnTypes`` member.
        """
        self.rdd = spark_context.textFile(file_path)
        self.file_path = file_path

        self.header_symbol = header_symbol
        self.header_estimated_length = header_estimated_length
        self.column_names = column_names
        self.column_types = column_types

        # The header is searched in this sample only.
        self.first_rows = self.rdd.take(self.header_estimated_length)

    def get_temporal_extend(self):
        """Call ``self.header_extract_period_function`` on the sampled rows.

        NOTE(review): ``header_extract_period_function`` is not assigned
        anywhere in this class; unless it is set from outside, calling this
        method raises AttributeError. The result of the call is discarded.
        """
        self.header_extract_period_function(self.first_rows)

    def parse(self):
        """Build the RDD of converted rows.

        Returns:
            ``self.rdd`` without empty and header lines, each remaining line
            mapped through ``row_split``.

        Raises:
            ValueError: If the file yields no rows, if the sampled rows end
                inside the header, if no header row is found, or if a
                requested column is missing from the header.
        """
        header_symbol = self.header_symbol

        if not self.first_rows:
            raise ValueError("No rows could be read from " + self.file_path)

        # The header is assumed to be contained in the sampled first rows; if
        # the last sampled row is still a header row the sample was too short.
        if self.first_rows[-1].startswith(header_symbol):
            raise ValueError(
                "Estimated length of the header is too small, please increase it"
            )

        # startswith (rather than row[0]) also copes with empty rows.
        num_header_rows = sum(
            1 for row in self.first_rows if row.startswith(header_symbol)
        )
        if num_header_rows == 0:
            raise ValueError("No header row found in " + self.file_path)

        # The last header row is the one that names the columns.
        header = self.first_rows[num_header_rows - 1]
        positions, items = head_split(header)
        all_items_found = all(item in items for item in self.column_names)

        if not all_items_found:
            raise ValueError("Not all required columns ar found for " + self.file_path)

        # Bind plain locals so the lambdas below do not capture ``self``.
        column_names = self.column_names
        column_types = self.column_types
        return self.rdd.filter(
            # Drop empty lines first, then header lines.
            lambda line: bool(line) and not line.startswith(header_symbol)
        ).map(lambda x: row_split(x, positions, items, column_names, column_types))

# Utilities

In [21]:
def pairwise_union(dataframes_list):
    """Union all dataframes of a list, merging them two by two.

    In each round neighbouring dataframes are combined with
    ``union(...).distinct()``; with an odd count the last dataframe is
    carried over unchanged to the next round, so no input is dropped.

    Args:
        dataframes_list: Non-empty sequence of objects offering ``union``
            and ``distinct``. The sequence itself is not modified.

    Returns:
        The de-duplicated union of all inputs.

    Raises:
        ValueError: If ``dataframes_list`` is empty.
    """
    if not dataframes_list:
        raise ValueError("pairwise_union needs at least one dataframe")

    frames = list(dataframes_list)
    if len(frames) == 1:
        # Nothing to merge; still de-duplicate so the result has the same
        # guarantee as for longer inputs.
        return frames[0].distinct()

    while len(frames) > 1:
        merged = [
            left.union(right).distinct()
            for left, right in zip(frames[::2], frames[1::2])
        ]
        # zip() leaves the last frame unpaired when the count is odd.
        if len(frames) % 2 == 1:
            merged.append(frames[-1])
        frames = merged
    return frames[0]

# Test application

In [22]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import (
    FloatType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

In [23]:
# Create (or reuse) the Spark session for this notebook.
# NOTE(review): a later cell calls getOrCreate() again with the app name
# "compute_heat_waves"; presumably that returns this same session - confirm
# which app name is intended.
spark = SparkSession.builder.appName('gdd').getOrCreate()
# Disabled tuning / diagnostic experiments:
#spark.conf.set("spark.executor.memory", "10g")
#spark.conf.set("spark.executor.cores", "4")
#spark_context = spark.sparkContext
#n_workers =  len([executor.host() for executor in spark_context.statusTracker().getExecutorInfos() ]) -1
#print(spark.SparkConf())

In [24]:
spark = SparkSession.builder.appName("compute_heat_waves").getOrCreate()
spark_context = spark.sparkContext

# Columns to extract from every file; the parser types and the dataframe
# schema below are listed in this same order.
column_names = ["DTG", "NAME", "TX_DRYB_10"]

# Parser-side type of each column.
column_types = dict(
    zip(
        column_names,
        (ColumnTypes.TimeStampType, ColumnTypes.String, ColumnTypes.Float),
    )
)

# Spark-side type of each column; every field is nullable.
data_frame_schema = [
    StructField(name, spark_type(), True)
    for name, spark_type in zip(column_names, (TimestampType, StringType, FloatType))
]
schema = StructType(data_frame_schema)

In [25]:
from os.path import isfile, join
import os
from os import listdir

# Directory holding the input files, relative to the working directory.
dir_path = "../../data/uncompressed"
abs_dir_path = os.path.abspath(dir_path)

# Absolute paths of the regular files directly inside that directory.
all_files_path = list(
    filter(isfile, (join(abs_dir_path, f) for f in listdir(abs_dir_path)))
)

In [27]:
from pyspark.sql.functions import count, to_date, countDistinct, col, row_number
from pyspark.sql.functions import max as pyspark_max
from pyspark.sql.functions import min as pyspark_min
from pyspark.sql.window import Window
# One filtered dataframe per successfully parsed input file.
dfs = []
for iteration, file_path in enumerate(all_files_path):
    try:
        file_parser = FileParser(
            file_path=file_path,
            spark_context=spark_context,
            header_symbol="#",
            header_estimated_length=100,
            column_names=column_names,
            column_types=column_types,
        )
        parsed_rows = file_parser.parse()
        typed_df = parsed_rows.toDF(schema=schema)
        # Keep only the rows whose NAME column equals 'De Bilt'.
        df = typed_df.filter("NAME=='De Bilt'")
        print("Iteration {}".format(iteration))
        dfs.append(df)
    except ValueError:
        # The parser reports unusable files with ValueError; skip the file.
        print("error found for file {}, iteration {}".format(file_path, iteration))

Iteration 0
Iteration 1


# Set parameters

In [28]:
# Threshold the heat-wave scan compares the 'last_days_min_temp' column against.
temperature = 15
# Number of days passed to timedelta() when the scan back-dates the start of
# a warm stretch.
duration = 5
# NOTE(review): not referenced in the visible cells - the heat-wave scan
# hard-codes 35 for its maximum-temperature test; confirm which is intended.
max_temperature = 16

## Perform the union of dataframes

In [29]:
# Merge the per-file dataframes collected in dfs into a single dataframe.
union_df = pairwise_union(dfs)

## Add a dates column, count the distinct timestamps within each day, find the maximum and minimum temperature, order by date

In [30]:
# Per calendar day: number of distinct timestamps plus the highest and the
# lowest TX_DRYB_10 value, sorted by day.
union_df = (
    union_df.withColumn("Dates", to_date(col("DTG")))
    .groupBy("Dates")
    .agg(
        countDistinct("DTG"),
        pyspark_max("TX_DRYB_10"),
        pyspark_min("TX_DRYB_10"),
    )
    .orderBy("Dates")
)

In [31]:
#union_df_final.count()

## Convert to pandas

In [32]:
# Convert the aggregated Spark dataframe to pandas, indexed by the "Dates" column.
union_df_final_pd = union_df.toPandas().set_index("Dates")

## Save to csv

In [32]:
# Write the per-day table, including its header row, to the working directory.
union_df_final_pd.to_csv("reduced_panda_df.csv", header=True)

## Now display the per-day table

In [33]:
# Bare expression: the notebook renders the dataframe as the cell output.
union_df_final_pd

Unnamed: 0_level_0,count(DTG),max(TX_DRYB_10),min(TX_DRYB_10)
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-04-01,143,13.200000,-0.2
2003-04-02,144,10.000000,3.6
2003-04-03,144,9.300000,0.5
2003-04-04,144,10.800000,-2.2
2003-04-05,144,11.600000,5.3
...,...,...,...
2003-05-28,144,23.700001,9.3
2003-05-29,144,25.900000,10.6
2003-05-30,144,27.500000,12.8
2003-05-31,144,25.100000,13.5


In [20]:
from datetime import timedelta
# Scan the per-day rows in index order and track stretches during which the
# 'last_days_min_temp' column stays above `temperature`.
#
# NOTE(review): this cell reads names that are not defined in the visible
# cells - the columns 'last_days_min_temp' / 'last_days_max_temp', the lists
# heat_wave_start / heat_wave_end and the flag is_in_heat_wave. Presumably
# they come from cells that were removed or run earlier; confirm before
# relying on the output.
is_high_temp = False
high_temp_days_start = None
high_temp_days_end = None
running_max_temp = -1000.0
for index, row in union_df_final_pd.iterrows():
    # A warm stretch opens on the first row above the threshold.
    if row['last_days_min_temp'] > temperature and not is_high_temp:
        # NOTE(review): the running maximum is only updated on the row that
        # opens the stretch, not on the following rows - confirm intent.
        running_max_temp = max(running_max_temp, row['last_days_max_temp']) 
        # The start is back-dated by `duration` days.
        high_temp_days_start = index - timedelta(duration)
        high_temp_days_end = index
        is_high_temp = True
    # The stretch closes on the first row below the threshold.
    if row['last_days_min_temp'] < temperature and is_high_temp:
        high_temp_days_end = index
        # Only stretches whose recorded maximum exceeded 35 are kept; the
        # [start, end] pair is appended to heat_wave_end.
        if running_max_temp > 35:
            heat_wave_end.append([high_temp_days_start,high_temp_days_end ])
        # Reset the tracking state for the next stretch.
        running_max_temp= -1000.0
        high_temp_days_start = None
        high_temp_days_end = None
        is_high_temp = False

# If a heat wave is still in progress after the last row, close it with the
# last index value.
# NOTE(review): the flag tested here is is_in_heat_wave, while the loop above
# tracks is_high_temp - confirm which one is meant.
if is_in_heat_wave:
    heat_wave_end.append(union_df_final_pd.index[-1])
        
print(heat_wave_start)
print(heat_wave_end)

[]
[datetime.date(2003, 6, 1)]
