# Header tokenizers

In [3]:
import re
def head_tokenizer(string_value, header_charater):
    """Tokenize a header row and map each column name to its character span.

    The row is split on whitespace. Tokens equal to ``header_charater`` are
    discarded; every remaining token is treated as a column name. A column
    spans from the start of its name up to one character before the start of
    the next name. The first column is widened to start at index 0 and the
    last column runs to the end of the row.

    Args:
        string_value (str): The header row to tokenize.
        header_charater (str): The token marking a header row; it is never
            reported as a column.

    Returns:
        item_positions (dict): Column names as keys and ``[start, end]`` lists
            as values. Empty when the row holds no column names. If a name
            occurs more than once, the span of its last occurrence is kept.
    """
    items = []
    positions = []
    for match in re.finditer(r"\S+", string_value):
        token = match.group()
        if token != header_charater:
            items.append(token)
            positions.append(match.start())

    # A blank row, or one holding only the header marker, has no columns.
    if not items:
        return {}

    # The first column always starts at 0, also when it is the only column.
    # NOTE(review): presumably because the header marker sits where the first
    # field of a data row begins - confirm against the input files.
    starts = [0] + positions[1:]
    # Every column ends one character before the next name; the last column
    # extends to the end of the row.
    ends = [next_start - 1 for next_start in positions[1:]]
    ends.append(len(string_value))

    return {name: [start, end] for name, start, end in zip(items, starts, ends)}

In [4]:
from enum import Enum
class ValueTypes(Enum):
    """Target types that a raw text field can be converted to.

    The members are compared against in cast_string_value_to_type to choose
    the conversion applied to a field.
    """

    # Converted with datetime.strptime using the "%Y-%m-%d %H:%M:%S" format.
    TimeStamp = 0
    # Converted with str().
    String = 1
    # Converted with int().
    Integer = 2
    # Converted with float().
    Float = 3

# Row tokenizers

In [5]:
from datetime import datetime as dt

def cast_string_value_to_type(string_value, string_type):
    """Convert a raw text field into the Python value requested by its type.

    Args:
        string_value (str): The text to convert.
        string_type (ValueTypes): The conversion to apply.

    Returns:
        The converted value: a datetime for TimeStamp (format
        "%Y-%m-%d %H:%M:%S"), a float for Float, a str for String and an int
        for Integer. None when ``string_value`` is empty or ``string_type``
        matches none of these members.
    """
    # An empty field carries no value, whatever the requested type.
    if string_value == "":
        return None

    if string_type == ValueTypes.Integer:
        return int(string_value)

    if string_type == ValueTypes.String:
        return str(string_value)

    if string_type == ValueTypes.Float:
        return float(string_value)

    if string_type == ValueTypes.TimeStamp:
        return dt.strptime(string_value, "%Y-%m-%d %H:%M:%S")

    # Unknown type: nothing to convert to.
    return None


def row_tokenizer(string_value, item_positions, column_names, column_types):
    """Extract and convert the requested columns from one fixed-width row.

    Args:
        string_value (str): The row to tokenize.
        item_positions (dict): Column names as keys; each value holds the
            start and end index of that column within the row.
        column_names (list): The names of the columns to extract, in output
            order.
        column_types: Indexed by column name to obtain the ValueTypes member
            used to convert that column.

    Returns:
        list: One converted value per requested column that has an entry in
            ``item_positions``. Columns without an entry are skipped, so the
            result can be shorter than ``column_names``.
    """
    tokens = []

    for name in column_names:
        if name in item_positions:
            start, end = item_positions[name][0], item_positions[name][1]
            # Fixed-width fields are padded; drop the padding before casting.
            field = string_value[start:end].strip()
            tokens.append(cast_string_value_to_type(field, column_types[name]))

    return tokens

# File parser

In [6]:
class FileParser:
    """Parse a large fixed-width text file with PySpark.

    The file starts with header rows marked by a header symbol. The last
    header row names the columns, and the character span of each name is
    used to slice the data rows.
    """

    def __init__(
        self,
        file_path,
        spark_context,
        header_charater,
        filter_column,
        filter_value,
        header_estimated_length,
        column_names,
        column_types,
    ):
        """Open the file as an RDD and sample its first rows.

        Args:
            file_path (str): The file path.
            spark_context (sparkContext): The spark context.
            header_charater (str): The symbol a header row starts with.
            filter_column (str): The column used for filtering the file rows.
            filter_value (str): Rows are kept only when the filter column
                equals this value.
            header_estimated_length (int): How many leading rows to sample;
                must exceed the real number of header rows.
            column_names (list): The names of the columns to extract.
            column_types: Indexed by column name to obtain the ValueTypes
                member of each column.
        """

        self.rdd = spark_context.textFile(file_path)
        self.file_path = file_path

        self.header_charater = header_charater
        self.header_estimated_length = header_estimated_length
        self.column_names = column_names
        self.column_types = column_types
        self.filter_column = filter_column
        self.filter_value = filter_value

        # Sampled eagerly so that parse() can locate the header.
        self.first_rows = self.rdd.take(self.header_estimated_length)

    def parse(self):
        """Locate the header, then filter and tokenize the data rows.

        The last sampled row starting with the header symbol is taken as the
        row naming the columns and is tokenized into column spans. Data rows
        are kept when their filter column equals the filter value, and each
        kept row is converted into a list of typed values.

        Returns:
            The RDD of tokenized rows (one list of values per kept row).

        Raises:
            ValueError: If the file yields no rows, the sample ends inside
                the header, no header row is found, or a requested column or
                the filter column is missing from the header.
        """
        header_symbol = self.header_charater

        if not self.first_rows:
            raise ValueError("No rows could be read from " + self.file_path)

        # The header is assumed to fit in the sampled rows: a sample whose
        # last row is still a header row was too short.
        if self.first_rows[-1].startswith(header_symbol):
            raise ValueError(
                "Estimated length of the header is too small, please increase it"
            )

        # startswith() also copes with empty rows, unlike indexing row[0].
        header_rows = [row for row in self.first_rows if row.startswith(header_symbol)]
        if not header_rows:
            raise ValueError("No header row found for " + self.file_path)

        item_positions = head_tokenizer(header_rows[-1], header_symbol)

        # The filter column is sliced on every row, so it is required too.
        required_columns = list(self.column_names) + [self.filter_column]
        if not all(name in item_positions for name in required_columns):
            raise ValueError("Not all required columns are found for " + self.file_path)

        # Copy what the closures need into locals.
        # NOTE(review): presumably so the functions shipped to Spark do not
        # capture self, which holds the RDD - confirm.
        column_names = self.column_names
        column_types = self.column_types
        filter_value = self.filter_value
        filter_start, filter_end = item_positions[self.filter_column]

        return self.rdd.filter(
            lambda line: not line.startswith(header_symbol)
            and line[filter_start:filter_end].strip() == filter_value
        ).map(lambda x: row_tokenizer(x, item_positions, column_names, column_types))

# Utilities

In [7]:
def pairwise_union(dataframes_list):
    """Merge dataframes into one without duplicate rows, pair by pair.

    Neighbouring dataframes are unioned and de-duplicated; the merged results
    are paired up again until a single dataframe remains. With an odd number
    of dataframes the unpaired last one is carried into the next round, so no
    input is dropped.

    Args:
        dataframes_list (list): Objects exposing ``union(other)`` and
            ``distinct()``, as used here on the per-file dataframes.

    Returns:
        The de-duplicated union of all dataframes in the list.

    Raises:
        ValueError: If ``dataframes_list`` is empty.
    """
    if not dataframes_list:
        raise ValueError("pairwise_union requires at least one dataframe")

    remaining = list(dataframes_list)

    # A single dataframe has nothing to merge with; de-duplicate it so the
    # result has the same guarantee as a merged one.
    if len(remaining) == 1:
        return remaining[0].distinct()

    while len(remaining) > 1:
        merged = [
            left.union(right).distinct()
            for left, right in zip(remaining[::2], remaining[1::2])
        ]
        # zip() leaves the last element unpaired when the count is odd.
        if len(remaining) % 2 == 1:
            merged.append(remaining[-1])
        remaining = merged

    return remaining[0]

# Test application

In [8]:
import os
# Environment for PySpark; set before the Spark session is created below.
pyspark_submit_args = "--executor-memory 8g pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args
# Raw string: the backslashes of the Windows path must not be read as escape
# sequences (a plain literal triggers invalid-escape warnings for \A and \j).
os.environ["JAVA_HOME"] = r"D:\Apps\java_jre_1_8_0_301"

In [9]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.types import (
    FloatType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)
# Obtain the Spark session for the "compute_heat_waves" application and the
# context used to read the raw text files.
spark = (
    SparkSession.builder
    .appName("compute_heat_waves")
    .getOrCreate()
)
spark_context = spark.sparkContext

In [10]:
# Columns to extract from every file, in output order.
column_names = ["DTG", "NAME", "TX_DRYB_10"]

# Conversion applied to each column when a row is tokenized.
column_types = dict(
    zip(
        column_names,
        (ValueTypes.TimeStamp, ValueTypes.String, ValueTypes.Float),
    )
)

# Matching Spark schema; every field is nullable.
data_frame_schema = [
    StructField(name, spark_type, True)
    for name, spark_type in zip(
        column_names, (TimestampType(), StringType(), FloatType())
    )
]
schema = StructType(data_frame_schema)

In [13]:
from os.path import isfile, join
import os
from os import listdir

# Directory holding the raw input files, relative to the working directory.
dir_path = "../../data/raw_data"
abs_dir_path = os.path.abspath(dir_path)

# Absolute paths of the regular files directly inside that directory
# (sub-directories are left out).
all_files_path = list(
    filter(isfile, (join(abs_dir_path, f) for f in listdir(abs_dir_path)))
)

In [14]:
from pyspark.sql.functions import count, to_date, countDistinct, col, row_number
from pyspark.sql.functions import max as pyspark_max
from pyspark.sql.functions import min as pyspark_min
from pyspark.sql.window import Window
# Parse every raw file into a Spark dataframe holding the "De Bilt" rows.
# Files rejected with a ValueError are reported and skipped.
dfs = []
for iteration, file_path in enumerate(all_files_path):
    try:
        file_parser = FileParser(
            file_path=file_path,
            spark_context=spark_context,
            header_charater="#",
            filter_column="NAME",
            filter_value="De Bilt",
            header_estimated_length=100,
            column_names=column_names,
            column_types=column_types,
        )
        # The former CSV read of the same file was removed here: its result
        # was discarded, so it contributed nothing to dfs.
        df = file_parser.parse().toDF(schema=schema)
    except ValueError:
        print("error found for file {}, iteration {}".format(file_path, iteration))
    else:
        print("Iteration {}".format(iteration))
        dfs.append(df)

Iteration 0


## Perform the union of dataframes

In [29]:
# Merge the per-file dataframes into a single dataframe with pairwise_union.
# This replaces an earlier sequential version that folded every dataframe
# into a running result with union(...).distinct().
union_df = pairwise_union(dfs)

## Add a dates column, count the distinct timestamps within each day, find the maximum and minimum temperature, order by dates

In [25]:
# Reduce the observations to one row per calendar day: the number of distinct
# timestamps plus the highest and lowest TX_DRYB_10, sorted by date.
union_df = (
    union_df.withColumn("Dates", to_date(col("DTG")))
    .groupBy("Dates")
    .agg(
        countDistinct("DTG"),
        pyspark_max("TX_DRYB_10"),
        pyspark_min("TX_DRYB_10"),
    )
    .orderBy("Dates")
)

In [26]:
#union_df_final.count()

## Convert to pandas

In [27]:
# Bring the aggregated Spark dataframe into pandas and index it by "Dates".
union_df_final_pd = union_df.toPandas().set_index("Dates")

## Save to csv

In [20]:
# Write the per-day table, including its header row, to reduced_panda_df.csv
# (a relative path, so it lands in the current working directory).
union_df_final_pd.to_csv("reduced_panda_df.csv", header=True)

## now print the dates

In [21]:
# Bare expression: the notebook renders the per-day table as the cell output.
union_df_final_pd

Unnamed: 0_level_0,count(DTG),max(TX_DRYB_10),min(TX_DRYB_10)
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-04-01,143,13.200000,-0.2
2003-04-02,144,10.000000,3.6
2003-04-03,144,9.300000,0.5
2003-04-04,144,10.800000,-2.2
2003-04-05,144,11.600000,5.3
...,...,...,...
2019-03-28,144,11.200000,3.2
2019-03-29,144,17.100000,0.4
2019-03-30,144,18.299999,4.0
2019-03-31,138,12.500000,3.7


In [None]:
# Thresholds used by the heat-wave detection cell that follows.
# NOTE(review): values look like degrees Celsius of the daily maximum - confirm.
temperature = 25  # a day is hot when its daily maximum is above this
duration = 5  # number of consecutive hot days required
duration_max_temperature = 3  # a hot spell is recorded only with more tropical days than this
max_temperature = 30  # a day is tropical when its daily maximum is above this

In [None]:
from collections import deque
# Detect heat waves in the per-day table.
#
# A heat wave is a run of at least `duration` adjacent rows whose daily
# maximum is above `temperature`, containing more than
# `duration_max_temperature` days whose maximum is above `max_temperature`.
# NOTE(review): the strict comparisons are kept from the original draft;
# confirm whether the thresholds were meant to be inclusive.
# Rows adjacent in the index are treated as consecutive days; calendar gaps
# are not checked.
duration = 5

# Results: one [first_day, last_day] pair per heat wave, and the number of
# tropical days inside each of those heat waves (same order).
heat_waves_durations = []
tropical_days_in_heat_wave = []

# Not updated by this cell; left defined in case later cells read them.
tropical_days = []
max_temp_hot_days = -1000

start_hot_days = None
end_hot_days = None
num_tropical_days = 0

# Read from the pandas table: the Spark dataframe `df` has no .loc accessor.
daily_max = union_df_final_pd["max(TX_DRYB_10)"]

# Split the daily maxima into runs of adjacent hot days. A missing value
# compares as not hot and therefore ends the current run.
hot_runs = []
current_run = []
for date, day_max in daily_max.items():
    if day_max > temperature:
        current_run.append((date, day_max))
    elif current_run:
        hot_runs.append(current_run)
        current_run = []
if current_run:
    # The data ends during a hot spell: keep it as a candidate as well.
    hot_runs.append(current_run)

for run in hot_runs:
    # Counted per run, so tropical days of earlier spells never leak in.
    num_tropical_days = sum(1 for _, day_max in run if day_max > max_temperature)
    if len(run) >= duration and num_tropical_days > duration_max_temperature:
        start_hot_days = run[0][0]
        end_hot_days = run[-1][0]
        heat_waves_durations.append([start_hot_days, end_hot_days])
        tropical_days_in_heat_wave.append(num_tropical_days)

print(heat_waves_durations)
print(tropical_days_in_heat_wave)