In [30]:
import hopsworks
import json
import os
import pandas as pd

from confluent_kafka import Producer

In [31]:
# Log in to Hopsworks; the returned project handle supplies the project name
# and the Kafka API used by the cells below.
project = hopsworks.login()

Connection closed.
2024-10-17 22:39:29,429 INFO: Python Engine initialized.

Logged in to project, explore it here https://demo.hops.works/p/123


In [32]:
# Register an Avro schema for live transactions and create the Kafka topic bound to it.
KAFKA_TOPIC_NAME = f"{project.name}_real_time_live_transactions"
SCHEMA_NAME = "live_transactions_schema"

kafka_api = project.get_kafka_api()

# Every field is a nullable union. `account_id` is declared as a string (not a
# long) because the transaction records produced by this notebook carry it as
# a hex-string identifier, e.g. '015443a3f5aa7916f14c66d3d3bfb0d5'.
schema = {
    "type": "record",
    "name": SCHEMA_NAME,
    "namespace": "ai.hopsworks.examples.feldera.fraud",
    "fields": [
        {"name": "tid", "type": ["null", "string"]},
        {"name": "date_time", "type": ["null", "string"]},
        {"name": "account_id", "type": ["null", "string"]},
        {"name": "amount", "type": ["null", "double"]},
    ],
}

kafka_api.create_schema(SCHEMA_NAME, schema)
# Topic uses version 1 of the schema registered above; kept as the last
# expression so the created topic is shown as the cell output.
kafka_api.create_topic(KAFKA_TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)

KafkaTopic('payment_fraud_real_time_live_transactions')

In [33]:
# Set up the Kafka producer from the default client configuration exposed by
# the project's Kafka API.
kafka_config = kafka_api.get_default_config()

# Print the configuration for inspection before the producer is constructed
# (the captured output shows SSL file locations and the bootstrap servers).
print(kafka_config)
producer = Producer(kafka_config)

{'security.protocol': 'SSL', 'ssl.ca.location': '/tmp/ca_chain.pem', 'ssl.certificate.location': '/tmp/client_cert.pem', 'ssl.key.location': '/tmp/client_key.pem', 'client.id': 'jupyter-payment-fraud--fabio000-778d467fdb-htxcp', 'group.id': 'my-group-id', 'ssl.endpoint.identification.algorithm': 'none', 'bootstrap.servers': 'kafka-cluster-kafka-0.kafka-cluster-kafka-brokers.hopsworks.svc:9092'}


In [34]:
# Load the historical transactions CSV from the project directory and keep
# only the four columns named in the topic schema.
csv_path = f"{os.environ['PROJECT_PATH']}/Jupyter/RawData/historical_transactions.csv"
wanted_columns = ['tid', 'date_time', 'account_id', 'amount']
transactions_pdf = pd.read_csv(csv_path)[wanted_columns]

In [35]:
# Display the selected columns as the cell output to eyeball the loaded data.
transactions_pdf

Unnamed: 0,tid,date_time,account_id,amount
0,f2747ffa038b12b8568a8cdf60504e0b,2024-04-20 17:18:14,16d0762b977594acf65b570b685766a4,70.78
1,0596a96da829a9a343b1ba526e8556a6,2024-04-20 17:20:14,015443a3f5aa7916f14c66d3d3bfb0d5,39.42
2,5f140eb04ba7257d4afdee345a7ed5e5,2024-04-20 17:24:31,cafdc9010b1145e3ee5f3aba1f96cc6a,44.22
3,66009ba1b5ef90a5b2be7cc75312a093,2024-04-20 17:28:45,73bb37cd9253957f537f580aa71be0e5,26.52
4,239727d9e0e67190d1f98dac2896eeab,2024-04-20 17:38:37,bbd0bbf42aadedbcd902580d170df617,69.33
...,...,...,...,...
71317,7efceeff1e5b6d90d7064ed3324d6353,2024-08-18 04:39:46,e6cacaf243f3aeb4cbc988ac3f0d62e6,505.11
71318,836f1040738d8802d664533b8d0b094f,2024-08-14 02:39:46,e6cacaf243f3aeb4cbc988ac3f0d62e6,531.13
71319,97b6ee0d90e093bff11ff48fe2e0ed48,2024-08-10 00:39:46,e6cacaf243f3aeb4cbc988ac3f0d62e6,83.65
71320,7ab38e90f7e6669115bdbecb281e52cb,2024-08-05 22:39:46,e6cacaf243f3aeb4cbc988ac3f0d62e6,5684.61


In [36]:
# Serialise the frame with pandas' JSON writer (one object per row), then
# parse it back so `data` is a list of plain Python dicts.
records_json = transactions_pdf.to_json(orient="records")
data = json.loads(records_json)

In [37]:
# Spot-check a single record to see the dict that will be JSON-encoded per message.
data[1]

{'tid': '0596a96da829a9a343b1ba526e8556a6',
 'date_time': '2024-04-20 17:20:14',
 'account_id': '015443a3f5aa7916f14c66d3d3bfb0d5',
 'amount': 39.42}

In [38]:
# Publish every transaction to the Kafka topic as a JSON-encoded message.
FLUSH_EVERY = 10  # number of messages to enqueue between blocking flushes

for sent, t in enumerate(data, start=1):
    producer.produce(KAFKA_TOPIC_NAME, json.dumps(t))

    # Periodically wait for the queued messages to be delivered.
    if sent % FLUSH_EVERY == 0:
        producer.flush()

# Flush once more after the loop: when len(data) is not a multiple of
# FLUSH_EVERY, the trailing messages would otherwise remain in the producer's
# local queue with no guarantee that they are ever delivered.
producer.flush()