# 1. Data Producer

## **Attention**

This project simulates a streaming workflow that runs from data ingestion through prediction to visualization. As the first step of the pipeline, a producer publishes data from the local system to Kafka.

---

## **1.1 Library**

In [5]:
import time
import numpy as np
import re
import pandas as pd

In [6]:
from kafka3 import KafkaProducer,KafkaAdminClient
from json import dumps
from pandas import Series

---

## **1.2 Pre-processing**

In [None]:
## ===================================================== Explanation
# Sorting and imputing the data timeline
## ===================================================== Implementation
## ----------------------------------------------------- Params definition
# Directory holding the weather data, the raw input file inside it, and the
# name under which the sorted copy is written back to the same directory.
file_path, file_name, save_name = (
    "../DataStorage/Weather/",
    "weather.csv",
    "weather_sorted.csv",
)
## ----------------------------------------------------- Loading data
def to_hour(targetSeries:Series) -> Series:
    """Scale a timestamp down by 1e9, then by 60 twice.

    For an epoch value expressed in nanoseconds this yields hours
    (nanoseconds -> seconds -> minutes -> hours).
    """
    in_seconds = targetSeries / 1e09
    in_minutes = in_seconds / 60
    return in_minutes / 60
def to_millisecond(targetSeries:Series) -> Series:
    """Scale an hourly value up by 1e9, then by 60 twice.

    This is the inverse of dividing by 1e9 / 60 / 60: for an input in hours
    the result is in nanoseconds (1e9 per second), despite the function name.
    """
    scaled_once = targetSeries * 1e09
    scaled_twice = scaled_once * 60
    return scaled_twice * 60
# Load the raw weather records from file_path + file_name.
data = pd.read_csv(file_path + file_name)
## ----------------------------------------------------- Convert timestamp to integer
# The column is replaced outright instead of via ``data.loc[:, "timestamp"] = ...``:
# since pandas 2.0 a ``.loc[:, col]`` assignment first tries to write into the
# existing column in place, so the column would not reliably become float64.
# "int64" is spelled out because plain 'int' can resolve to a 32-bit integer
# on some platforms, which cannot hold epoch values.
data["timestamp"] = (
    pd.to_datetime(data["timestamp"])
    .astype("int64")
    .astype("float")
)
## ----------------------------------------------------- Get the minimum timestamp
# The overall minimum equals the minimum of the per-site minima, so no
# groupby is needed.
min_time_stamp = to_hour(data["timestamp"].min())
## ----------------------------------------------------- Get the maximum timestamp
max_time_stamp = to_hour(data["timestamp"].max())
## ----------------------------------------------------- Create full timeline
# NOTE(review): the number of sites is hard-coded; this assumes site ids are
# exactly 0..15 -- confirm against the dataset.
site_count = 16
# ``+ 1`` makes the end inclusive: np.arange excludes its stop value, which
# previously left the last hour out of the generated timeline.
full_time_line = to_millisecond(np.arange(min_time_stamp, max_time_stamp + 1, 1.))
_N = full_time_line.shape[0]
# One copy of the timeline per site: site ids are repeated block-wise while
# the timeline is repeated end-to-end, so the two columns line up.
full_site_id = np.repeat(np.arange(site_count), _N)
full_time_series = pd.DataFrame({
    "timestamp": list(full_time_line) * site_count,
    "site_id": full_site_id,
})
## ----------------------------------------------------- Join to impute time series
# An outer join keeps every original record AND adds a row for each
# (site_id, timestamp) pair of the grid that has no record. The previous left
# join from ``data`` returned ``data`` unchanged, so nothing was imputed.
# Added rows carry NaN in every non-key column.
data_full_time = (
    pd.merge(
        data, full_time_series,
        on=["site_id", "timestamp"],
        how="outer",
    )
    ## ------------------------------------------------- Sorting
    .sort_values(
        by=["site_id", "timestamp"],
        ascending=[True, True],
        ignore_index=True,
    )
)
## ----------------------------------------------------- Transform back to string type
# Direct column assignment again, because the dtype changes from float to str.
data_full_time["timestamp"] = pd.to_datetime(
    data_full_time["timestamp"].astype("int64")
).astype("str")
# ----------------------------------------------------- Add extra zero
def add_zero(target):
    """Return ``target`` with the literal suffix ".000" appended."""
    fraction_suffix = ".000"
    return target + fraction_suffix
# Run add_zero over every timestamp value and store the result back.
suffixed_timestamps = data_full_time.loc[:,"timestamp"].apply(add_zero)
data_full_time.loc[:,"timestamp"] = suffixed_timestamps
## ----------------------------------------------------- File saving
# Written next to the input file, without the DataFrame index column.
output_path = file_path + save_name
data_full_time.to_csv(output_path, index=False)

---

## **1.4 Producing**

In [8]:
## ===================================================== Explanation
# Create the admin client that is used later to delete a topic for testing purposes, refer to:
#   https://kafka-python.readthedocs.io/en/master/apidoc/KafkaAdminClient.html
## ===================================================== Implementation
## ----------------------------------------------------- Admin Client Creation
# Settings for the admin connection; the client is used by the send loop to
# delete a topic when the run is interrupted.
admin_settings = {
    "bootstrap_servers": "kafka:9092",
    "api_version": (0, 10),
    "client_id": "topic-cleaner",
}
admin = KafkaAdminClient(**admin_settings)

In [14]:
## ===================================================== Explanation
# Producing the data to Kafka
## ===================================================== Implementation
# Destination topic, input file (the sorted CSV written earlier) and broker host.
topic = "stream_weather_data"
file_name = save_name
host_ip = "kafka"
## ----------------------------------------------------- Producer Creation
def _to_ascii_json(payload):
    """Serialise a message to JSON text and encode it as ASCII bytes."""
    return dumps(payload).encode('ascii')
producer = KafkaProducer(
    bootstrap_servers=[host_ip + ":9092"],
    value_serializer=_to_ascii_json,
    api_version=(0, 10),
)
## ----------------------------------------------------- Sending
# Running count of 24-row groups read from the file by the send loop.
counter = 0
def extract_value(targetString:str) -> list:
    """Split one comma-separated line into its fields.

    Every newline character is removed first; the remaining text is split on
    each comma, so an empty input yields a list with one empty string.
    """
    without_newlines = targetString.replace("\n", "")
    return without_newlines.split(",")
# Stream the CSV to Kafka in batches of up to 5 "days" (24 rows each), stamping
# every row of a day with the wall-clock second at which that day was read.
with open(file=file_path + file_name, mode="rt") as f:
    # The first line holds the column names used as message keys.
    header = f.readline()
    header = extract_value(header)
    # Set once readline() returns "" (end of file). Without this check the loop
    # kept reading, failed with an uncaught IndexError on the empty line and
    # never sent the rows already collected for the current batch.
    end_of_file = False
    while not end_of_file:
        try:
            # Messages collected for the current batch
            value = []
            # Number of the first day in this batch, for the progress line
            first_day = counter + 1
            # 5 days
            for i in range(5):
                time.sleep(1)
                # Get sending time
                current_time = int(time.time())
                if i == 0:
                    start_time_stamp = current_time
                rows_in_day = 0
                # 24 hours per day
                for _ in range(24):
                    # Read a line
                    line = f.readline()
                    if not line:
                        end_of_file = True
                        break
                    # Splitting
                    line = extract_value(line)
                    # Form dictionary from the first 8 columns plus the send time
                    line_dict = {header[k]: line[k] for k in range(8)}
                    line_dict["weather_ts"] = current_time
                    # Update value list
                    value.append(line_dict)
                    rows_in_day += 1
                # A day counts as soon as it contributed at least one row, so
                # a final, shorter day is still reported.
                if rows_in_day:
                    counter += 1
                if end_of_file:
                    break
            # Sending (skipped when the file ended exactly on a batch boundary)
            if value:
                for message in value:
                    producer.send(
                        topic = topic,
                        value = message
                    )
                producer.flush()
                # Progress line
                print(
                    f"day range: {first_day:03d} - {counter:03d} | "
                    f"start timestamp: {start_time_stamp:10d} | "
                    f"last timestamp: {current_time:10d} | "
                    f"number of message: {len(value)}"
                )
        except KeyboardInterrupt:
            # NOTE(review): the topic deleted here is not the `topic` that is
            # produced to above -- confirm this is intended.
            admin.delete_topics(topics=["FIT5202_ASS2B_reader"])
            print("delete topics")
            time.sleep(0.5)
            break

day range: 001 - 005 | start timestamp: 1761568788 | last timestamp: 1761568792 | number of message: 120
day range: 006 - 010 | start timestamp: 1761568793 | last timestamp: 1761568797 | number of message: 120
day range: 011 - 015 | start timestamp: 1761568798 | last timestamp: 1761568802 | number of message: 120
day range: 016 - 020 | start timestamp: 1761568803 | last timestamp: 1761568807 | number of message: 120


Node 1 connection failed -- refreshing metadata


delete topics


---