In [1]:
import json
import uuid
import os
from dotenv import load_dotenv
from pathlib import Path
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewPartitions
from faker import Faker
from time import sleep
import random

In [2]:
# Load environment variables
dotenv_path = Path('/resources/.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [3]:
# Kafka configuration
kafka_host = os.getenv('KAFKA_HOST')
kafka_topic = os.getenv('KAFKA_TOPIC_NAME')

In [4]:
print(f"Kafka Host: {kafka_host}")
print(f"Kafka Topic: {kafka_topic}")

Kafka Host: dataeng-kafka
Kafka Topic: test-topic


In [5]:
# Define the topic with a specific number of partitions
def create_topic_with_partitions(topic_name, num_partitions):
    try:
        # Create a new topic with the specified number of partitions
        topic = NewTopic(name=topic_name, num_partitions=num_partitions, replication_factor=1)
        admin_client.create_topics(new_topics=[topic], validate_only=False)
        print(f"Successfully created topic '{topic_name}' with {num_partitions} partitions.")
    except Exception as e:
        print(f"Error creating topic '{topic_name}': {e}")

# Call the function to create the topic with the desired number of partitions
create_topic_with_partitions(kafka_topic, 2)  # Change the number to your desired partition count

Error creating topic 'test-topic': name 'NewTopic' is not defined


In [6]:
# Initialize Kafka Producer
producer = KafkaProducer(bootstrap_servers=f'{kafka_host}:9092')
faker = Faker()

In [7]:
# Initialize Kafka Admin Client
admin_client = KafkaAdminClient(bootstrap_servers=f"{kafka_host}:9092")

In [8]:
# Validate partitions again to get the final count
topic_metadata = admin_client.describe_topics([kafka_topic])
partition_count = len(topic_metadata[0]['partitions'])
print(f"Final partition count for topic '{kafka_topic}': {partition_count}")

Final partition count for topic 'test-topic': 2


In [9]:
class DataGenerator:
    @staticmethod
    def get_data():
        """Generate random consumer data."""
        return {
            "consumer_id": str(uuid.uuid4()),
            "name": faker.name(),
            "address": faker.address().replace("\n", ", "),
            "email": faker.email(),
            "phone_number": faker.phone_number(),
            "gender": faker.random_element(elements=["Male", "Female"]),
            "birth_date": faker.date_of_birth(minimum_age=18, maximum_age=80).strftime("%Y-%m-%d"),
            "marital_status": faker.random_element(elements=["Single", "Married", "Divorced", "Widowed"]),
            "annual_income": faker.random_int(min=20000, max=200000),
            "signup_date": faker.date_this_decade().strftime("%Y-%m-%d"),
        }

In [10]:
# Number of events to send
num_events = 10  # Change this to the number of events you want to send

# Send a fixed number of events to Kafka
for i in range(num_events):
    # Generate random consumer data
    data = DataGenerator.get_data()
    payload = json.dumps(data).encode("utf-8")

    # Randomly assign partition based on available partitions
    partition = random.choice(range(partition_count))

    try:
        response = producer.send(topic=kafka_topic, value=payload, partition=partition)
        print(f"Sent to partition {partition}: {data}")
    except Exception as e:
        print(f"Error sending to partition {partition}: {e}")
    
    # Sleep for 5 seconds before sending the next event
    sleep(5)

Sent to partition 1: {'consumer_id': '01510de0-4579-4827-9bab-750a56ce2ce1', 'name': 'Patrick Smith', 'address': '8007 Jessica Tunnel Apt. 094, Churchmouth, WI 36464', 'email': 'unewman@example.com', 'phone_number': '(546)735-0604x51941', 'gender': 'Male', 'birth_date': '1979-08-28', 'marital_status': 'Married', 'annual_income': 149821, 'signup_date': '2024-03-04'}
Sent to partition 0: {'consumer_id': '8335b5b2-6d82-4c8d-9ea3-2d0ace851b58', 'name': 'Laurie Smith', 'address': '011 Zachary Track, Smithport, DC 20580', 'email': 'elopez@example.com', 'phone_number': '412.055.2865x6790', 'gender': 'Male', 'birth_date': '1990-12-22', 'marital_status': 'Divorced', 'annual_income': 155198, 'signup_date': '2023-03-15'}
Sent to partition 1: {'consumer_id': '5a0788bf-bf25-42bb-90b2-98095b122f5c', 'name': 'Tim Fernandez', 'address': '651 Alexander Run, New Courtney, DC 77612', 'email': 'olsonashley@example.com', 'phone_number': '001-755-630-4847x204', 'gender': 'Male', 'birth_date': '1976-05-23', 