In [1]:
import json
import uuid
import os
from dotenv import load_dotenv
from pathlib import Path
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic, NewPartitions
from kafka.errors import TopicAlreadyExistsError
from faker import Faker
from time import sleep
import random

In [2]:
# Read the shared .env file so its settings are available through the process
# environment. The bare call on the last line is intentional: the notebook
# displays its return value as the cell output.
dotenv_path = Path('/resources/.env')
load_dotenv(dotenv_path)

True

In [3]:
# Kafka connection settings taken from the process environment.
# Each value is None when its variable is not set.
kafka_host = os.environ.get('KAFKA_HOST')
kafka_topic = os.environ.get('KAFKA_TOPIC_NAME')

In [4]:
# Echo the resolved settings (one per line) as a sanity check before connecting.
print(f"Kafka Host: {kafka_host}", f"Kafka Topic: {kafka_topic}", sep="\n")

Kafka Host: dataeng-kafka
Kafka Topic: test-topic


In [5]:
# Initialize the Kafka admin client used by the topic-management cells below.
# The broker address is the KAFKA_HOST value on the hard-coded port 9092.
admin_client = KafkaAdminClient(bootstrap_servers=f"{kafka_host}:9092")

In [6]:
def create_or_update_topic(topic_name, num_partitions):
    """Make sure ``topic_name`` exists with at least ``num_partitions`` partitions.

    The topic is first created (replication factor 1) through the module-level
    ``admin_client``. If that fails because the topic already exists, its
    current partition count is looked up and raised to ``num_partitions`` when
    it is lower; a topic that already has that many partitions or more is left
    unchanged. Every step is reported with ``print``.

    Args:
        topic_name: Name of the Kafka topic to create or grow.
        num_partitions: Desired number of partitions.

    Returns:
        None.
    """
    try:
        new_topic = NewTopic(name=topic_name, num_partitions=num_partitions, replication_factor=1)
        admin_client.create_topics(new_topics=[new_topic], validate_only=False)
        print(f"Successfully created topic '{topic_name}' with {num_partitions} partitions.")
    except TopicAlreadyExistsError:
        print(f"Topic '{topic_name}' already exists. Checking partitions...")

        # Only one topic was requested, so its description is the first entry.
        description = admin_client.describe_topics([topic_name])[0]
        current_partition_count = len(description['partitions'])
        print(f"Current partition count for topic '{topic_name}': {current_partition_count}")

        # Nothing to do when the topic is already large enough.
        if current_partition_count >= num_partitions:
            print(f"Topic '{topic_name}' already has {current_partition_count} partitions.")
            return

        print(f"Increasing partitions for topic '{topic_name}' to {num_partitions}...")
        requested_growth = {topic_name: NewPartitions(total_count=num_partitions)}
        admin_client.create_partitions(topic_partitions=requested_growth)
        print(f"Successfully updated partitions for topic '{topic_name}'.")

In [7]:
# Create the configured topic, or grow it if it already exists.
# Change num_partitions to your desired partition count.
create_or_update_topic(topic_name=kafka_topic, num_partitions=2)

Topic 'test-topic' already exists. Checking partitions...
Current partition count for topic 'test-topic': 2
Topic 'test-topic' already has 2 partitions.


In [8]:
# Query the topic once more so partition_count holds the final number of
# partitions; the producer loop uses it to choose a target partition.
topic_metadata = admin_client.describe_topics(topics=[kafka_topic])
partition_count = len(topic_metadata[0]['partitions'])
print(f"Final partition count for topic '{kafka_topic}': {partition_count}")

Final partition count for topic 'test-topic': 2


In [9]:
# Shared Faker instance; DataGenerator.get_data() draws its synthetic fields from it.
# NOTE(review): no seed is set in this notebook, so generated values presumably
# differ between runs — seed Faker if reproducible data is needed.
faker = Faker()

In [10]:
class DataGenerator:
    """Builds synthetic consumer records using the module-level ``faker`` instance."""

    @staticmethod
    def get_data():
        """Generate random consumer data.

        Returns:
            dict: One consumer record with the keys ``consumer_id``, ``name``,
            ``address``, ``email``, ``phone_number``, ``gender``,
            ``birth_date``, ``marital_status``, ``annual_income`` and
            ``signup_date``. Both dates are ``YYYY-MM-DD`` strings and the
            address has its line breaks replaced by ``", "``.
        """
        date_format = "%Y-%m-%d"

        # Fields are filled in a fixed order so the key order of the record
        # (and the sequence of Faker calls) stays stable.
        record = {"consumer_id": str(uuid.uuid4())}
        record["name"] = faker.name()
        record["address"] = faker.address().replace("\n", ", ")
        record["email"] = faker.email()
        record["phone_number"] = faker.phone_number()
        record["gender"] = faker.random_element(elements=["Male", "Female"])

        birth_date = faker.date_of_birth(minimum_age=18, maximum_age=80)
        record["birth_date"] = birth_date.strftime(date_format)

        record["marital_status"] = faker.random_element(
            elements=["Single", "Married", "Divorced", "Widowed"]
        )
        record["annual_income"] = faker.random_int(min=20000, max=200000)

        signup_date = faker.date_this_decade()
        record["signup_date"] = signup_date.strftime(date_format)
        return record

In [11]:
# Initialize the Kafka producer against the same host and port (9092) as the admin client.
# No value_serializer is passed here; the send loop encodes each payload to bytes itself.
producer = KafkaProducer(bootstrap_servers=f'{kafka_host}:9092')

In [12]:
# Number of events to send
num_events = 10  # Change this to the number of events you want to send
send_interval_seconds = 5  # Pause between consecutive events
ack_timeout_seconds = 10  # Max time to wait for the broker to acknowledge one event

# Send a fixed number of events to Kafka
for i in range(num_events):
    # Generate random consumer data and serialize it as UTF-8 encoded JSON
    data = DataGenerator.get_data()
    payload = json.dumps(data).encode("utf-8")

    # Randomly assign partition based on available partitions
    partition = random.randrange(partition_count)

    try:
        # send() only queues the record and returns a future. Block on the
        # future so delivery failures are raised (and reported) here, and so
        # the success message is printed only after the broker acknowledged.
        response = producer.send(topic=kafka_topic, value=payload, partition=partition)
        response.get(timeout=ack_timeout_seconds)
        print(f"Sent to partition {partition}: {data}")
    except Exception as e:
        # Best effort: report the failure and continue with the next event
        print(f"Error sending to partition {partition}: {e}")

    # Pause before sending the next event; no need to wait after the last one
    if i < num_events - 1:
        sleep(send_interval_seconds)

# Push out anything still buffered (e.g. a record whose acknowledgement timed out)
producer.flush()

Sent to partition 0: {'consumer_id': '2ad0a0ec-ab22-42f6-83dc-a937f8d31c2f', 'name': 'Valerie Roy', 'address': '50435 Sellers Stravenue Apt. 837, Khanchester, MT 89544', 'email': 'ochoalucas@example.net', 'phone_number': '001-383-912-7920x44977', 'gender': 'Male', 'birth_date': '1948-09-04', 'marital_status': 'Widowed', 'annual_income': 180154, 'signup_date': '2020-05-20'}
Sent to partition 1: {'consumer_id': 'e65b7bca-08ea-42e9-8d3c-df4d28bec5ce', 'name': 'Justin Clayton', 'address': '537 Carter Springs, South Meganport, IL 80550', 'email': 'bradleymoreno@example.org', 'phone_number': '(428)618-6090x3915', 'gender': 'Female', 'birth_date': '1992-03-29', 'marital_status': 'Divorced', 'annual_income': 31570, 'signup_date': '2022-01-10'}
Sent to partition 0: {'consumer_id': 'f336df85-8d12-40d2-a82b-7859ede8347f', 'name': 'Mark Dennis', 'address': '611 Megan Stravenue, New Shelleyside, MT 35710', 'email': 'bruceestrada@example.net', 'phone_number': '(372)971-3725', 'gender': 'Female', 'bi