In [1]:
!pip install joblib

[0m

In [2]:
!pip install scikit-learn

[0m

In [3]:
import json
import random
import time
from datetime import datetime
import numpy as np
import logging
import socket
from confluent_kafka import Producer, Consumer
import random
from joblib import dump
import numpy as np
from sklearn.ensemble import IsolationForest

In [4]:
# --- Kafka connection ---
KAFKA_BROKER = "kafka:9092"          # passed as "bootstrap.servers" to the clients
TRANSACTIONS_TOPIC = "transactions"  # topic the generated records are produced to

# --- Synthetic data generator ---
DELAY = 2                              # seconds slept between two produced records
OUTLIERS_GENERATION_PROBABILITY = 0.2  # chance that a record is drawn as an outlier

In [5]:
# One seeded generator drives both the data synthesis and the model.
rng = np.random.RandomState(42)

# Training set: 500 Gaussian points (scale 0.3) duplicated into two clusters,
# one shifted by +2 and one by -2 on both coordinates, rounded to 3 decimals.
X = 0.3 * rng.randn(500, 2)
X_train = np.concatenate([X + 2, X - 2], axis=0).round(3)

# Build and fit the isolation forest on the 1000 training points.
clf = IsolationForest(
    n_estimators=50,
    max_samples=500,
    contamination=0.01,
    random_state=rng,
)
clf.fit(X_train)

# Persist the fitted model; the returned list of written paths is the cell output.
dump(clf, './isolation_forest.joblib')

['./isolation_forest.joblib']

Producer

In [6]:
def create_producer():
    """Build the Kafka producer used by this notebook.

    Returns:
        A configured ``Producer``, or ``None`` when building it raises
        (the failure is logged together with its traceback).
    """
    try:
        settings = {
            "bootstrap.servers": KAFKA_BROKER,
            "client.id": socket.gethostname(),
            "enable.idempotence": True,   # EOS processing
            "compression.type": "lz4",
            "batch.size": 64000,
            "linger.ms": 10,
            "acks": "all",                # wait for the leader and all ISR to acknowledge
            "retries": 5,
            "delivery.timeout.ms": 1000,  # total time budget for delivery, retries included
        }
        return Producer(settings)
    except Exception:
        logging.exception("Couldn't create the producer")
        return None


def create_consumer(topic, group_id):
    """Create a Kafka consumer and subscribe it to ``topic``.

    Args:
        topic: Name of the topic to subscribe to.
        group_id: Value used for the consumer's ``group.id``.

    Returns:
        The subscribed ``Consumer``, or ``None`` if creating it or subscribing
        raised (the failure is logged together with its traceback).
    """
    try:
        # Offset settings are given in the top-level config: the nested
        # "default.topic.config" dict is deprecated by the client.
        consumer = Consumer({"bootstrap.servers": KAFKA_BROKER,
                             "group.id": group_id,
                             "client.id": socket.gethostname(),
                             "isolation.level": "read_committed",
                             "auto.offset.reset": "latest",  # Only consume new messages
                             "enable.auto.commit": False})

        consumer.subscribe([topic])
    except Exception:
        logging.exception("Couldn't create the consumer")
        consumer = None

    return consumer

In [7]:
def _log_delivery_failure(err, msg):
    """Delivery callback: log any message the producer failed to deliver."""
    if err is not None:
        logging.error("Delivery to topic %s failed: %s", msg.topic(), err)


_id = 0
producer = create_producer()

if producer is not None:
    try:
        # Emit one synthetic record every DELAY seconds until interrupted.
        while True:
            # Generate some abnormal observations
            if random.random() <= OUTLIERS_GENERATION_PROBABILITY:
                X_test = np.random.uniform(low=-4, high=4, size=(1, 2))
            else:
                # Regular point: Gaussian noise shifted by +2 or -2 on both coordinates
                X = 0.3 * np.random.randn(1, 2)
                X_test = (X + np.random.choice(a=[2, -2], size=1, p=[0.5, 0.5]))

            X_test = np.round(X_test, 3).tolist()

            # ISO 8601 UTC timestamp without an offset suffix
            current_time = datetime.utcnow().isoformat()

            record = {"id": _id, "data": X_test, "current_time": current_time}
            record = json.dumps(record).encode("utf-8")
            print('produce message')
            print(record)

            # Register a delivery callback so failed deliveries are logged
            # instead of being dropped silently.
            producer.produce(topic=TRANSACTIONS_TOPIC,
                             value=record,
                             on_delivery=_log_delivery_failure)
            # flush() waits for outstanding messages and serves their callbacks
            producer.flush()
            _id += 1
            time.sleep(DELAY)
    except KeyboardInterrupt:
        print("Stopped")
    finally:
        # Make sure nothing is left in the local queue on exit
        producer.flush()

produce message
b'{"id": 0, "data": [[-0.044, 2.454]], "current_time": "2025-10-27T09:04:39.650987"}'
produce message
b'{"id": 1, "data": [[-2.127, -2.041]], "current_time": "2025-10-27T09:04:42.685863"}'
produce message
b'{"id": 2, "data": [[-1.84, -1.954]], "current_time": "2025-10-27T09:04:45.430450"}'
produce message
b'{"id": 3, "data": [[-0.409, -2.916]], "current_time": "2025-10-27T09:04:47.462326"}'
produce message
b'{"id": 4, "data": [[-2.368, -1.702]], "current_time": "2025-10-27T09:04:49.497868"}'
produce message
b'{"id": 5, "data": [[3.938, -0.867]], "current_time": "2025-10-27T09:04:51.516150"}'
produce message
b'{"id": 6, "data": [[-2.28, -1.79]], "current_time": "2025-10-27T09:04:53.571008"}'
produce message
b'{"id": 7, "data": [[2.093, 2.935]], "current_time": "2025-10-27T09:04:55.599707"}'
produce message
b'{"id": 8, "data": [[-2.058, -1.861]], "current_time": "2025-10-27T09:04:57.632818"}'
produce message
b'{"id": 9, "data": [[-3.887, -2.682]], "current_time": "2025-10