# imports

In [1]:
# imports

from config import *
from sys import getsizeof

import os
import sys
import re
import json
import pickle
import datetime
import importlib
import pandas as pd
import dateutil.parser as dp

# datasets

In [None]:
# ! Functions to load datasets (uncomment to run)

def load_datasets(dfs: dict[str: pd.DataFrame]) -> None:
    for name, path in DATASETS_ABS_PATHS.items():
        print(f"{name} {path}")
        if path.endswith(".xlsx"):
            dfs[name] = pd.read_excel(path)
        elif path.endswith(".csv"):
            dfs[name] = pd.read_csv(path, low_memory=False)


def normilize_datasets_timestamps(dfs: dict[str: pd.DataFrame]) -> None:
    dfs["BWQAS"]["measurement_timestamp"] = pd.to_datetime(dfs["BWQAS"]["measurement_timestamp"], format="%m/%d/%Y %I:%M:%S %p")
    dfs["BWSAS"]["measurement_timestamp"] = pd.to_datetime(dfs["BWSAS"]["measurement_timestamp"], format="%m/%d/%Y %I:%M:%S %p")
    dfs["IOTTEMP"]["noted_date"] = pd.to_datetime(dfs["IOTTEMP"]["noted_date"], format="%d-%m-%Y %H:%M")
    dfs["IOT1"]["created_at"] = pd.to_datetime(dfs["IOT1"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z").dt.tz_convert(None)
    dfs["IOT3"]["created_at"] = pd.to_datetime(dfs["IOT3"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z").dt.tz_convert(None)
    dfs["IOT8"]["created_at"] = pd.to_datetime(dfs["IOT8"]["created_at"], format="%Y-%m-%dT%H:%M:%S%z").dt.tz_convert(None)
    dfs["IOT9"]["created_at"] = pd.to_datetime(dfs["IOT9"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z").dt.tz_convert(None)
    # dfs["IOT2"]["created_at"] = pd.to_datetime(dfs["IOT2"]["created_at"].map({'%Y-%m-%d': '%Y-%m-%d', '%d/%m/%Y': '%d/%m/%Y', '%b %d, %Y': '%b %d, %Y'}), format="%Y-%m-%d %H:%M:%S %Z")
    # dfs["IOT4"]["created_at"] = pd.to_datetime(dfs["IOT4"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z")
    # dfs["IOT6"]["created_at"] = pd.to_datetime(dfs["IOT6"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z")
    # dfs["IOT7"]["created_at"] = pd.to_datetime(dfs["IOT7"]["created_at"], format="%Y-%m-%dT%H:%M:%S%z")
    # dfs["IOT10"]["created_at"] = pd.to_datetime(dfs["IOT10"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z")
    # dfs["IOT11"]["created_at"] = pd.to_datetime(dfs["IOT11"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z")
    # dfs["IOT12"]["created_at"] = pd.to_datetime(dfs["IOT12"]["created_at"], format="%Y-%m-%d %H:%M:%S %Z")


def normilize_datasets_columns_names(dfs: dict[str: pd.DataFrame]) -> None:
    for df_name in dfs:
        dfs[df_name].columns = dfs[df_name].columns.str.strip().str.rstrip('-').str.lower()
        dfs[df_name].columns = [re.sub(r'[^\w\s]', '', col.replace(' ', '_').replace('-', '_')) for col in dfs[df_name].columns]


def normilize_datasets_dtypes(dfs: dict[str: pd.DataFrame]) -> None:
    for df_name in dfs:
        dfs[df_name] = dfs[df_name].convert_dtypes()


def normilize_datasets(dfs: dict[str: pd.DataFrame]) -> None:
    normilize_datasets_columns_names(dfs)
    normilize_datasets_dtypes
    normilize_datasets_timestamps(dfs)
    

# dfs = {}
# load_datasets(dfs)
# normilize_datasets(dfs)

In [None]:
# ! Import pickled datasets (and save them if you want) 

# with open(".\\static\\pickles\\dfs.pickle", "wb") as f:
#     pickle.dump(dfs, f)
with open(".\\static\\pickles\\dfs.pickle", "rb") as f:
    dfs = pickle.load(f)


In [None]:
# Function to automatically convert datetime dtype in datasets (works poorly) 

def try_to_datetime(df: pd.DataFrame):
    for col in df.columns:
        if df[col].dtype == "string[python]" and any(char in df[col].iloc[0] for char in ['-', '/', ':']):
            try:
                df[col] = pd.to_datetime(df[col], format="mixed")
            except:
                pass


In [None]:
# Example of loading single dataset (outdated)

# df = pd.read_csv(DATASETS_ABS_PATHS["BWSAS"])
# df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True) # Replace spaces with underscores in column names
# df.Measurement_Timestamp = pd.to_datetime(df.Measurement_Timestamp, format="%m/%d/%Y %I:%M:%S %p")
# df = df.convert_dtypes()
# df_dict = df.to_dict(orient="records")
# df.dtypes

# proto files

## creating proto template

In [None]:
# ! Function to create ptorobuf template

def create_proto_template(package_name: str, df: pd.DataFrame):
    d = []
    timestamp_flag = False
    
    for name, dtype in df.dtypes.to_dict().items():
        match dtype:
            case "string":
                d.append(["string", name])
            case "Float64":
                d.append(["float", name])
            case "Int64":
                d.append(["int64", name])
            case "datetime64[ns]":
                timestamp_flag = True
                d.append(["google.protobuf.Timestamp", name])
    
    template = f'\
syntax = "proto3";\n\
{'import "google/protobuf/timestamp.proto";\n' if timestamp_flag else ''}\
\npackage {package_name};\n\
message {package_name}_message {{\n\
'

    template += "\n".join(f"    {e[0]} {e[1]} = {d.index(e) + 1};" for e in d)
    template += "\n}"
    return template

In [None]:
# Function to create ptorobuf template (Does NOT work (yoinked from net))

import pandas as pd
from google.protobuf.descriptor_pb2 import FieldDescriptorProto
from google.protobuf.descriptor_pb2 import FileDescriptorProto
from google.protobuf.compiler.plugin_pb2 import CodeGeneratorRequest
from google.protobuf.compiler.plugin_pb2 import CodeGeneratorResponse

def create_protobuf_template(df):
    # Create a FileDescriptorProto
    file_descriptor_proto = FileDescriptorProto()

    # Create a message type for the DataFrame
    message_type = file_descriptor_proto.message_type.add()
    message_type.name = "DataFrame"

    # Iterate over the columns of the DataFrame
    for column_name, column_type in df.dtypes.items():
        # Create a field descriptor for each column
        field_descriptor = message_type.field.add()
        field_descriptor.name = column_name

        # Map Pandas dtype to Protobuf field type
        if column_type == "int64":
            field_descriptor.type = FieldDescriptorProto.TYPE_INT64
        elif column_type == "float64":
            field_descriptor.type = FieldDescriptorProto.TYPE_DOUBLE
        elif column_type == "bool":
            field_descriptor.type = FieldDescriptorProto.TYPE_BOOL
        elif column_type == "object":
            field_descriptor.type = FieldDescriptorProto.TYPE_STRING
        elif column_type == "datetime64[ns]":
            field_descriptor.type = FieldDescriptorProto.TYPE_STRING
        else:
            field_descriptor.type = FieldDescriptorProto.TYPE_STRING

    # Return the serialized FileDescriptorProto
    return file_descriptor_proto.SerializeToString()

In [None]:
# Example of creating proto template
# package_name = "m_beach"

# protobuf_template = create_proto_template(package_name, df)
# print(protobuf_template)

## compiling proto file

In [None]:
# ! Function to compile proto file from template file (creates {proto_filename}_pb2.py file)
def compile_proto(proto_filename):
    proto_abs_path = os.path.join(PROTOS_FOLDER, proto_filename)
    compile_proto_command = f"{PROTOC_EXE} --python_out=. {proto_abs_path} "
    os.system(compile_proto_command)

In [None]:
# Example of compiling proto file

# compile_proto(proto_filename)

## loading data into python descriptor of proto message 

In [None]:
# ! Util functions for timestamps calculaitons, conversion and validation
def str2unix(dt: str) -> tuple[int, int]:
    # Converts ISO8601 timstamp string into POSIX timestamp tuple (seconds, nanoseconds)
    dt_unix = dp.parse(dt).timestamp()
    
    seconds = int(dt_unix)
    nanos   = int(dt_unix % 1 * 1e9)

    return (seconds, nanos)


def pdTimestamp2unix(dt: pd.Timestamp) -> tuple[int, int]:
    return str2unix(dt.isoformat())


def datetime_valid(dt_str):
    try:
        datetime.datetime.fromisoformat(dt_str)
    except:
        return False
    return True


# json encoder and decoder for timestamp conversion handling
class _JSONDecoder(json.JSONDecoder):
    def __init__(self, *args, **kwargs):
        json.JSONDecoder.__init__(
            self, object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj):
        ret = {}
        for key, value in obj.items():
            if key in {'timestamp', 'whatever'}:
                ret[key] = datetime.fromisoformat(value) 
            else:
                ret[key] = value
        return ret

    
class _JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime, pd.Timestamp, )):
            return obj.isoformat()
        return json.JSONEncoder.default(obj)

In [None]:
# ! Functions to fill package_message(s) with data from DataFrame(s)
def fill_package_from_df(package_name: str, df_dict: list[dict], item: int):
    # df_dict - df.to_dict(orient="records"); item - number of record in df
    package = importlib.import_module(PROTOS_FOLDER + "." + package_name + "_pb2")
    package_message = getattr(package, package_name + "_message")()

    for field, value in df_dict[item].items():
        if value is None:
            continue
        if type(value) is pd.Timestamp:
            seconds, nanos = pdTimestamp2unix(value)
            setattr(getattr(package_message, field), "seconds", seconds)
            setattr(getattr(package_message, field), "nanos", nanos)
        else:
            try:
                setattr(package_message, field, value)
            except Exception as e:
                print(f"Exception: {e} | field: {field} | value: {value}")
    
    return package_message


def fill_packages_from_df(package_name: str, df_dict: list[dict]) -> list:
    package = importlib.import_module(PROTOS_FOLDER + "." + package_name + "_pb2")
    package_messages = []
    
    for item in range(len(df_dict)):
        package_message = None
        package_message = getattr(package, package_name + "_message")()
        for field, value in df_dict[item].items():
            
            if value is None:
                continue
            if type(value) is pd.Timestamp:
                seconds, nanos = pdTimestamp2unix(value)
                setattr(getattr(package_message, field), "seconds", seconds)
                setattr(getattr(package_message, field), "nanos", nanos)
            else:
                try:
                    setattr(package_message, field, value)
                except Exception as e:
                    print(f"Exception: {e} | field: {field} | value: {value}")
        
        package_messages.append(package_message)
    
    return package_messages
    


In [None]:
# ! Function to fill package_message with data from dict
def fill_package_from_dict(package_name: str, message: dict):
    # message - dict (json)
    package = importlib.import_module(PROTOS_FOLDER + "." + package_name + "_pb2")
    package_message = getattr(package, package_name + "_message")()

    for field, value in message.items():
        if value is None:
            continue
        if datetime_valid(value):
            seconds, nanos = str2unix(value)
            setattr(getattr(package_message, field), "seconds", seconds)
            setattr(getattr(package_message, field), "nanos", nanos)
        elif type(value) is pd.Timestamp:
            seconds, nanos = pdTimestamp2unix(value)
            setattr(getattr(package_message, field), "seconds", seconds)
            setattr(getattr(package_message, field), "nanos", nanos)
        else:
            try:
                setattr(package_message, field, value)
            except Exception as e:
                print(f"Exception: {e} | field: {field} | value: {value}")
    
    return package_message

In [None]:
# Example of filling message with data from df and dicts with pd.Timestamp and timestamp in str 
# 
# NOTE: json format does not specify timestamp format, so i used most commonly used - ISO8601, 
# using custom JSONEncoder/JSONDecoder

# item = 0
# message = dfs_dict["BWQAS"][item] # dict with pd.Timestamp as timestamp
# message_json = json.dumps(message, cls=_JSONEncoder)

# package_message = fill_package_from_df(package_name, df_dict, item)
# package_message1 = fill_package_from_dict(package_name, message)
# # message = json.loads(message_json) # dict with str ISO8601 as timestamp
# package_message2 = fill_package_from_dict(package_name, json.loads(message_json))

# print(f"Same?: {package_message == package_message1 == package_message2}")
# print(# message, message_json)

## serializing/deserializing + io

In [None]:
# Util funcitons for bit <-> byte conversion and size finding

def to_bits(byte_string: bytes) -> str:
    # Convert byte string into bit string
    return bin(int.from_bytes(byte_string, byteorder='big'))[2:]  # Remove '0b' prefix from binary string


def to_bytes(bit_string: str) -> bytes:
    # Convert bit string into byte string
    return int(bit_string, 2).to_bytes((len(bit_string) + 7) // 8, byteorder='big')


def bitsize(data: bytes | str) -> int:
    if type(data) == bytes:
        return len(to_bits(data))
    elif type(data) == str and set(data) <= {'0', '1'}:
        return len(data)
    else:
        return 0

In [None]:
# ! Serialize/deserialize functions

def proto_serialize(package_message):
    return package_message.SerializeToString()


def proto_deserialize(package_message_serialized, package_name: str):
    package = importlib.import_module(PROTOS_FOLDER + "." + package_name + "_pb2")
    package_class = getattr(package, package_name + "_message")

    package_message = package_class.FromString(package_message_serialized)
    
    return package_message

In [None]:
# ! IO funcitons for package_message 

# writing package_message to the file
def write_package(package_message, package_bin_filename):
    with open(package_bin_filename, "wb") as f:
        package_message_serialized = proto_serialize(package_message)
        f.write(package_message_serialized)
    
    return package_message_serialized
    


# reading package_message from the file
def read_package(package_bin_filename, package_name):
    with open(package_bin_filename, "rb") as f:
        package_message_serialized = f.read()
        
        package_message = proto_deserialize(package_message_serialized, package_name)

    return package_message

In [None]:
# Example usage of serialization, dumping json, bit/byte conversion + size finding

# package_bin_filename = package_name + ".bin"

# package_message_serialized   = write_package(package_message, package_bin_filename)
# package_message_deserialized = read_package(package_bin_filename, package_name)

# package_json_filename = package_name + ".json"
# with open(package_name + ".json", "w") as f:
#     json.dump(message, f, cls=_JSONEncoder)
# with open(package_name + "1.json", "w") as f:
#     f.write(message_json)

# to_bits(package_message_serialized), bitsize(package_message_serialized), to_bytes(to_bits(package_message_serialized)) == package_message_serialized


In [None]:
# Comparison of sizes

# print(f"message size (dict):                {getsizeof(message)}")
# print(f"message_json size (str of json):    {getsizeof(message_json)}")
# print()
# print(f"bin file size:                      {os.path.getsize(package_bin_filename)}")
# print(f"json file size:                     {os.path.getsize(package_json_filename)}")
# print()
# print(f"package_message size:               {getsizeof(package_message)}")
# print(f"package_message_serialized bits:    {bitsize(package_message_serialized)}")
# print()
# print(f"ratio of serialized/json files:     {os.path.getsize(package_json_filename) / os.path.getsize(package_bin_filename)}")
# print(f"package_message == package_message_deserialized: {package_message == package_message_deserialized}")

### etc

In [None]:
# NOTE: package_message have a lot of metadata in their DESCRIPTOR (might be useful)
# print([field.name for field in package_message.DESCRIPTOR.fields])

# compression

## libraries algorithms comparison

In [None]:
# ! Imports of compression libraries 

from collections import Counter

import zlib, gzip, bz2, lzma, lz4.frame, zstd, brotli

In [None]:
# Comparison of compression algorithms (on message serialized with protobuf)

# pb_msg = package_message_serialized

# pb_ratio_zlib     = bitsize(pb_msg) / bitsize(zlib.compress(pb_msg))
# pb_ratio_gzip     = bitsize(pb_msg) / bitsize(gzip.compress(pb_msg))
# pb_ratio_bz2      = bitsize(pb_msg) / bitsize(bz2.compress(pb_msg))
# pb_ratio_lzma     = bitsize(pb_msg) / bitsize(lzma.compress(pb_msg))
# pb_ratio_lz4      = bitsize(pb_msg) / bitsize(lz4.frame.compress(pb_msg))
# pb_ratio_zstd     = bitsize(pb_msg) / bitsize(zstd.compress(pb_msg, 1))
# pb_ratio_brotli   = bitsize(pb_msg) / bitsize(brotli.compress(pb_msg))



# print(f'pb_ratio_zlib:        {pb_ratio_zlib}\
#       \npb_ratio_gzip:        {pb_ratio_gzip}\
#       \npb_ratio_bz2:         {pb_ratio_bz2}\
#       \npb_ratio_lzma:        {pb_ratio_lzma}\
#       \npb_ration_lz4:        {pb_ratio_lz4}\
#       \npb_ration_zstd(1):    {pb_ratio_zstd}\
#       \npb_ration_brotli(11): {pb_ratio_brotli}\
# '
# )

In [None]:
# Comparison of compression algorithms (on message serialized with json)

# j_msg = bytes(message_json, "utf-8")

# j_ratio_zlib      = bitsize(j_msg)  / bitsize(zlib.compress(j_msg))
# j_ratio_gzip      = bitsize(j_msg)  / bitsize(gzip.compress(j_msg))
# j_ratio_bz2       = bitsize(j_msg)  / bitsize(bz2.compress(j_msg)) 
# j_ratio_lzma      = bitsize(j_msg)  / bitsize(lzma.compress(j_msg))
# j_ratio_lz4       = bitsize(j_msg)  / bitsize(lz4.frame.compress(j_msg))
# j_ratio_zstd      = bitsize(j_msg)  / bitsize(zstd.compress(j_msg, 1))
# j_ratio_brotli    = bitsize(j_msg)  / bitsize(brotli.compress(j_msg))

# print(f'j_ratio_zlib:         {j_ratio_zlib}\
#       \nj_ratio_gzip:         {j_ratio_gzip}\
#       \nj_ratio_bz2:          {j_ratio_bz2}\
#       \nj_ratio_lzma:         {j_ratio_lzma}\
#       \nj_ration_lz4:         {j_ratio_lz4}\
#       \nj_ration_zstd(1):     {j_ratio_zstd}\
#       \nj_ration_brotli(11):  {j_ratio_brotli}\
# '
# )

## Huffman

In [None]:
# Huffman encoding algorithm

import heapq
from collections import defaultdict

class Node:
    def __init__(self, char=None, freq=0, left=None, right=None):
        self.char = char
        self.freq = freq
        self.left = left
        self.right = right

    def __lt__(self, other):
        if self.freq == other.freq:
            return self.char < other.char if self.char and other.char else False
        return self.freq < other.freq

    def __eq__(self, other):
        return self.freq == other.freq and self.char == other.char

def build_huffman_tree(freq_dict):
    priority_queue = [Node(char, freq) for char, freq in freq_dict.items()]
    heapq.heapify(priority_queue)

    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)
        new_node = Node(freq=left.freq + right.freq, left=left, right=right)
        heapq.heappush(priority_queue, new_node)

    return priority_queue[0]


def build_frequency_dict(data):
    freq_dict = defaultdict(int)
    for char in data:
        freq_dict[char] += 1
    return freq_dict

def build_codewords(node, current_code="", code_dict=None):
    if code_dict is None:
        code_dict = {}

    if node.char is not None:
        code_dict[node.char] = current_code
        return code_dict

    code_dict = build_codewords(node.left, current_code + "0", code_dict)
    code_dict = build_codewords(node.right, current_code + "1", code_dict)

    return code_dict

def huffman_encode(data):
    freq_dict = build_frequency_dict(data)
    huffman_tree = build_huffman_tree(freq_dict)
    codewords = build_codewords(huffman_tree)

    encoded_data = "".join(codewords[char] for char in data)
    return encoded_data, huffman_tree

def huffman_decode(encoded_data, huffman_tree):
    decoded_data = bytearray()
    current_node = huffman_tree

    for bit in encoded_data:
        if bit == "0":
            current_node = current_node.left
        else:
            current_node = current_node.right

        if current_node.char is not None:
            decoded_data.append(ord(current_node.char))
            current_node = huffman_tree

    return bytes(decoded_data)


def serialize_tree(node):
    if node.char is not None:
        return '1' + bin(node.char)[2:].zfill(8)
    else:
        return '0' + serialize_tree(node.left) + serialize_tree(node.right)


def deserialize_tree(data):
    def helper(index):
        if data[index] == '1':
            char = chr(int(data[index + 1:index + 9], 2))
            return Node(char=char), index + 9
        else:
            left, index = helper(index + 1)
            right, index = helper(index)
            return Node(left=left, right=right), index

    root, _ = helper(0)
    return root


In [None]:
# example of huffman usage

# encoded_data, huffman_tree = huffman_encode(package_message_serialized)

# # Serialize the Huffman tree
# serialized_tree = serialize_tree(huffman_tree)

# # Deserialize the Huffman tree
# deserialized_tree = deserialize_tree(serialized_tree)

# # The rest of the code remains the same...
# # encoded_data = "".join(build_codewords(deserialized_tree)[char] for char in package_message_serialized)
# decoded_data = huffman_decode(encoded_data, deserialized_tree)

# print(f"Original data:      {package_message_serialized}")
# print(f"Decoded data:       {decoded_data}")
# print(f"Decoded data == Original data: {package_message_serialized == decoded_data}")
# print(f"Encoded data:       {encoded_data}")
# print(f"Serialized tree:    {serialized_tree}")
# print()
# print(f"Encoded data bit length:    {bitsize(encoded_data)}")
# print(f"Serialized tree bit length: {bitsize(serialized_tree)}")
# print(f"Compression ratio:          {(bitsize(encoded_data) + bitsize(serialized_tree)) / bitsize(package_message_serialized)}")

## rANS PyComP

In [None]:
# from collections import Counter
# counter = Counter(pb_msg)

# pb_msg_chr = ''.join([chr(i) for i in counter.keys()])

In [None]:
# from libs.PyComP import ANS
# msg_counter = Counter(pb_msg_chr)
# ans = ANS.rANS(list(msg_counter.keys()), list(msg_counter.values()))
# msg_enc, final_state = ans.encode([chr(i) for i in pb_msg], 0)
# msg_dec = ''.join(ans.decode(msg_enc, final_state))


In [None]:
# pb_msg_chr, pb_msg_chr, msg_enc[2:], len(msg_enc[2:])


# ! modeling

In [None]:
# Creating proto templates and compiling them for each dataset 

packages = {}
packages = {df: {"package_name": df+"_package"} for df in dfs}

for df in dfs:
    packages[df]["protobuf_template"] = create_proto_template(packages[df]["package_name"], dfs[df])

for df in dfs:
    packages[df]["proto_filename"] = packages[df]["package_name"] + ".proto"
    with open(os.path.join(PROTOS_FOLDER_ABS_PATH, packages[df]["proto_filename"]), "w") as f:
        f.write(packages[df]["protobuf_template"])
    compile_proto(packages[df]["proto_filename"])

packages

{'ASCCDCV': {'package_name': 'ASCCDCV_package',
  'protobuf_template': 'syntax = "proto3";\n\npackage ASCCDCV_package;\nmessage ASCCDCV_package_message {\n    float a_no2 = 1;\n    float a_ufp = 2;\n    float a_bc = 3;\n    float n_no2 = 4;\n    float n_ufp = 5;\n    float n_bc = 6;\n    float v_no2 = 7;\n    float v_ufp = 8;\n    float v_bc = 9;\n    float n_core_no2 = 10;\n    float n_core_ufp = 11;\n    float n_core_bc = 12;\n}',
  'proto_filename': 'ASCCDCV_package.proto'},
 'ARGAZAL': {'package_name': 'ARGAZAL_package',
  'protobuf_template': 'syntax = "proto3";\n\npackage ARGAZAL_package;\nmessage ARGAZAL_package_message {\n    float z1ufp_nw = 1;\n    float z1bc_nw = 2;\n    float z1no2_nw = 3;\n    float z1ufp_ew = 4;\n    float z1bc_ew = 5;\n    float z1no2_ew = 6;\n    float z1ufp_sw = 7;\n    float z1bc_sw = 8;\n    float z1no2_sw = 9;\n    float z2ufp_nw = 10;\n    float z2bc_nw = 11;\n    float z2no2_nw = 12;\n    float z2ufp_ew = 13;\n    float z2bc_ew = 14;\n    float z2

In [None]:
# Converting datasets from pd.DataFrame to dict for easier messages filling (~40 secs)

dfs_dict = {df: dfs[df].to_dict(orient="records") for df in dfs}

In [None]:
# Filling messages with data (might take a while (~2 min))

for df in dfs:
    print(df)
    packages[df]["package_messages"] = []
    packages[df]["package_messages"] = fill_packages_from_df(packages[df]["package_name"], dfs_dict[df])
    # for row in range(dfs[df].shape[0]):
    #     packages[df]["package_messages"].append(fill_package_from_df(packages[df]["package_name"], dfs_dict[df], row))
        

ASCCDCV
ARGAZAL
ARMAZAL
BWQAS
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_timestamp | value: NaT
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_timestamp | value: NaT
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_timestamp | value: NaT
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_timestamp | value: NaT
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_timestamp | value: NaT
Exception: Assignment not allowed to message, map, or repeated field "measurement_timestamp" in protocol message object. | field: measurement_tim

In [None]:
# Serializing messages with protobuf

for package in packages:
    packages[package]["package_messages_proto"] = []
    for package_message in packages[package]['package_messages']:
        packages[package]["package_messages_proto"].append(proto_serialize(package_message))
        

In [None]:
for package in packages:
    packages[package]["package_messages_json"] = 

In [None]:
# Example of filling message with data from df and dicts with pd.Timestamp and timestamp in str 
# 
# NOTE: json format does not specify timestamp format, so i used most commonly used - ISO8601, 
# using custom JSONEncoder/JSONDecoder

item = 0
message = dfs_dict["BWQAS"][item] # dict with pd.Timestamp as timestamp
message_json = json.dumps(message, cls=_JSONEncoder)

# package_message = fill_package_from_df(package_name, df_dict, item)
# package_message1 = fill_package_from_dict(package_name, message)
# message = json.loads(message_json) # dict with str ISO8601 as timestamp
# package_message2 = fill_package_from_dict(package_name, json.loads(message_json))

# print(f"Same?: {package_message == package_message1 == package_message2}")
print(getsizeof(message), message)
print(getsizeof(message_json), len(message_json), message_json)
with open('testWB.json', 'wb') as f:
    json.dump(message, f)

272 {'beach_name': 'Montrose Beach', 'measurement_timestamp': Timestamp('2013-08-30 08:00:00'), 'water_temperature': 20.3, 'turbidity': 1.18, 'transducer_depth': 0.891, 'wave_height': 0.08, 'wave_period': 3, 'battery_life': 9.4, 'measurement_timestamp_label': '8/30/2013 8:00 AM', 'measurement_id': 'MontroseBeach201308300800'}
353 312 {"beach_name": "Montrose Beach", "measurement_timestamp": "2013-08-30T08:00:00", "water_temperature": 20.3, "turbidity": 1.18, "transducer_depth": 0.891, "wave_height": 0.08, "wave_period": 3, "battery_life": 9.4, "measurement_timestamp_label": "8/30/2013 8:00 AM", "measurement_id": "MontroseBeach201308300800"}


TypeError: a bytes-like object is required, not 'str'

In [None]:
print(type(message))
with open('testWB.json', 'w') as f:
    json.dump(message, f, cls=_JSONEncoder)

<class 'dict'>


In [None]:
for package in packages:
    print(f"{package}: {len(packages[package]['package_messages_proto'])}")

ASCCDCV: 294938
ARGAZAL: 55139
ARMAZAL: 27074
BWQAS: 34923
BWSAS: 59144
IOTNL: 477426
IOTTEMP: 97606
IOT1: 83126
IOT2: 172249
IOT3: 169185
IOT4: 89844
IOT6: 91050
IOT7: 279612
IOT8: 70744
IOT9: 151785
IOT10: 620
IOT11: 3165
IOT12: 3590
TAZAW: 55139


In [None]:
import pprint

In [None]:
pprint.PrettyPrinter(depth=2).pprint(packages)

{'ARGAZAL': {'package_messages': [...],
             'package_messages_serialized': [...],
             'package_name': 'ARGAZAL_package',
             'proto_filename': 'ARGAZAL_package.proto',
             'protobuf_template': 'syntax = "proto3";\n'
                                  '\n'
                                  'package ARGAZAL_package;\n'
                                  'message ARGAZAL_package_message {\n'
                                  '    float z1ufp_nw = 1;\n'
                                  '    float z1bc_nw = 2;\n'
                                  '    float z1no2_nw = 3;\n'
                                  '    float z1ufp_ew = 4;\n'
                                  '    float z1bc_ew = 5;\n'
                                  '    float z1no2_ew = 6;\n'
                                  '    float z1ufp_sw = 7;\n'
                                  '    float z1bc_sw = 8;\n'
                                  '    float z1no2_sw = 9;\n'
                   