In [1]:
import time
import re
from confluent_kafka import Producer

# Kafka producer configuration
conf = {
    'bootstrap.servers': 'localhost:9092',  # Replace with your Kafka broker address
    'client.id': 'file-producer'
}
producer = Producer(conf)

# File path (change to your .txt file path)
file_path = 'shakespeare.txt'

# Word occurrence dictionary
word_count = {}

# Delivery report callback function
def delivery_report(err, msg):
    if err is not None:
        print(f"Message delivery failed: {err}")
    else:
        print(f"Message delivered to {msg.topic()} [{msg.partition}] at offset {msg.offset()}")

# Function to send data to Kafka topic
def send_to_kafka(content, cursor_pos):
    try:
        # Send message to Kafka topic 'file-streaming'
        producer.produce('file-streaming', key=str(cursor_pos).encode('utf-8'), value=content.encode('utf-8'), callback=delivery_report)
        producer.flush()  # Ensure message is sent immediately
        print(f"Sent content at position {cursor_pos}: {content.strip()}")
    except Exception as e:
        print(f"Error while sending message: {e}")

    # Track word occurrences
    words = re.findall(r'\w+', content.lower())  # Split content into words (case insensitive)
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1

    # Log word count for the content
    print(f"Updated word count (last 5 words): {dict(list(word_count.items())[-5:])}")  # Show last 5 words added

# Function to process and send content from the .txt file
def process_file():
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            print(f"Processing file: {file_path}")
            
            # Read file line by line
            cursor_pos = 0
            for line in file:
                cursor_pos += len(line)  # Increment cursor position by the length of the line
                print(f"Processing line at position {cursor_pos}, content: {line.strip()}")
                send_to_kafka(line, cursor_pos)
                time.sleep(0.5)  # Simulate real-time processing
            print(f"Completed processing of file: {file_path}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please check the file path.")
    except Exception as e:
        print(f"Error: {e}")

# Run the file processing once
if __name__ == "__main__":
    print("Starting to process the file...")
    process_file()


Starting to process the file...
Processing file: shakespeare.txt
Processing line at position 45, content: From fairest creatures we desire increase,
Message delivered to file-streaming [<built-in method partition of cimpl.Message object at 0x000001CD84817A40>] at offset 0
Sent content at position 45: From fairest creatures we desire increase,
Updated word count (last 5 words): {'fairest': 1, 'creatures': 1, 'we': 1, 'desire': 1, 'increase': 1}
Processing line at position 91, content: That thereby beauty's rose might never die,
Message delivered to file-streaming [<built-in method partition of cimpl.Message object at 0x000001CD84817A40>] at offset 1
Sent content at position 91: That thereby beauty's rose might never die,
Updated word count (last 5 words): {'s': 1, 'rose': 1, 'might': 1, 'never': 1, 'die': 1}
Processing line at position 134, content: But as the riper should by time decease,
Message delivered to file-streaming [<built-in method partition of cimpl.Message object at 0x00000