# 1. Producing the data
In this task, we will implement one Apache Kafka producer to simulate real-time data streaming. Spark is not allowed in this part since it’s simulating a streaming data source.

1.1 Your program should send one batch of click_stream data every 5 seconds. One batch consists of a random 500-1000 rows from the clickstream_rt dataset. The CSV shouldn’t be loaded into memory all at once, to conserve memory (i.e. read rows only as needed).  
1.2 For each row, add an integer column named ‘ts’, a Unix timestamp in seconds since the epoch (UTC timezone). Spread your batch out evenly over 5 seconds.  
For example, if you send a batch of 600 records at 2023-09-01 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1693526400) :  
Record 1-120: ts = 1693526400  
Record 121-240: ts = 1693526401  
Record 241-360: ts = 1693526402  
….  
1.3 Send your batch to a Kafka topic with an appropriate name.  

All the data except for the ‘ts’ column should be sent in the original String type, without changing to any other types.  


In [None]:
from time import sleep
from kafka3 import KafkaProducer
import random
import datetime as dt
import csv

# Configuration
# Host address of the Kafka broker. connect_kafka_producer() combines it with
# port 9092 to form the bootstrap server address.
hostip = "118.138.83.246"  # Change me



def read_csv_batch(file_name, batch_size):
    """Lazily yield the rows of a CSV file in lists of up to ``batch_size``.

    The file is streamed through ``csv.DictReader``, so only the batch that
    is currently being built is held by this generator. Each row is a dict
    keyed by the header line, with every value left as a string.

    Args:
        file_name: Path of the CSV file; its first line is used as the header.
        batch_size: Number of rows per yielded list. Values below 1 behave
            like 1, because a batch is emitted as soon as it holds a row.

    Yields:
        Lists of row dicts. Every list has ``batch_size`` rows except
        possibly the last one, which holds whatever rows remain.
    """
    # newline='' is what the csv module requires of its input file: without
    # it, line breaks embedded in quoted fields are rewritten by the text
    # layer and the field no longer matches the original data.
    with open(file_name, 'rt', newline='') as f:
        reader = csv.DictReader(f)
        batch = []
        for row in reader:
            batch.append(row)

            # Hand the batch over once it is full, then start a new list so
            # the caller's batch is never mutated afterwards.
            if len(batch) >= batch_size:
                yield batch
                batch = []

        # Trailing rows that did not fill a whole batch.
        if batch:
            yield batch

def publish_message(producer_instance, topic_name, key, value):
    """Send a single record to a Kafka topic and flush the producer.

    Failures are reported on stdout instead of being raised, so a caller
    publishing many records keeps going when one of them fails.

    Args:
        producer_instance: Object exposing ``send(topic, key=..., value=...)``
            and ``flush()``, e.g. the producer returned by
            ``connect_kafka_producer()``.
        topic_name: Name of the topic to publish to.
        key: Message key as a string; it is encoded as UTF-8.
        value: Message payload; it is converted with ``str()`` and encoded
            as UTF-8.

    Returns:
        None.
    """
    try:
        key_bytes = bytes(key, encoding='utf-8')
        value_bytes = bytes(str(value), encoding='utf-8')
        producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
        # One flush per record is enough; a second flush right after the
        # first has nothing left to push.
        producer_instance.flush()
    except Exception as ex:
        # Best effort by design: report the problem and let the caller
        # continue with the next record.
        print('Exception in publishing batch.')
        print(str(ex))
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Returns:
        The connected producer, or ``None`` when creating it raised an
        exception (the error is printed to stdout).
    """
    try:
        return KafkaProducer(bootstrap_servers=[f'{hostip}:9092'],
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Returning from the handler rather than from a `finally` clause
        # keeps KeyboardInterrupt/SystemExit propagating to the caller.
        return None
    
    
if __name__ == '__main__':
    # Producer entry point: every 5 seconds pick one random chunk of
    # 500-1000 CSV rows, stamp each row with a 'ts' spread over five
    # consecutive seconds, and publish the rows one by one.

    # Local import: pacing needs a monotonic clock and the file's import
    # block only pulls `sleep` from the time module.
    from time import monotonic

    # setting topic name
    # & reading file
    topic = 'clickstream_realtime'
    csv_file = 'click_stream_rt.csv'

    print('Publishing records..')

    # connect to kafka producer
    producer = connect_kafka_producer()
    if producer is None:
        # Without a producer every publish would fail and the loop below
        # would only print errors forever, so stop here.
        raise SystemExit('Could not connect to Kafka; nothing was published.')

    while True:
        started = monotonic()

        # random size of batch: 500 to 1000
        batch_size = random.randint(500, 1000)

        # Pick one chunk uniformly at random while streaming the file
        # (reservoir sampling with a reservoir of one): the k-th chunk
        # replaces the current pick with probability 1/k. Only the current
        # chunk and the current pick are alive at any time, so the CSV is
        # never fully loaded into memory.
        batch = None
        chunks = read_csv_batch(csv_file, batch_size)
        for seen, chunk in enumerate(chunks, start=1):
            if random.randrange(seen) == 0:
                batch = chunk
        if batch is None:
            raise ValueError(f'{csv_file} contains no data rows.')

        # Unix timestamp (seconds since the epoch, UTC) of the first bucket,
        # taken after the read so it reflects when sending starts.
        timestamp = int(dt.datetime.now(dt.timezone.utc).timestamp())

        total_records = len(batch)
        record_per_second = total_records // 5

        # loop through each record in the batch
        for i, record in enumerate(batch):
            # Bucket k covers indices [k*rps, (k+1)*rps); rows left over by
            # the integer division join the last bucket, which is exactly
            # what the summary printed below reports.
            if record_per_second:
                offset = min(i // record_per_second, 4)
            else:
                offset = 4

            # value = data ie each row info; original columns stay strings,
            # only 'ts' is an integer.
            data = {'session_id': record['session_id'],
                    'event_name': record['event_name'],
                    'event_id': record['event_id'],
                    'customer_id': record['customer_id'],
                    'event_metadata': record['event_metadata'],
                    'ts': timestamp + offset}
            publish_message(producer, topic, 'parsed', data)

        print(f'Batch published successfully. Number of records: {total_records}')
        print(f'- Record 1-{record_per_second}: ts = {timestamp}')
        print(f'- Record {record_per_second+1}-{2*record_per_second}: ts = {timestamp+1}')
        print(f'- Record {2*record_per_second+1}-{3*record_per_second}: ts = {timestamp+2}')
        print(f'- Record {3*record_per_second+1}-{4*record_per_second}: ts = {timestamp+3}')
        print(f'- Record {4*record_per_second+1}-{total_records}: ts = {timestamp+4}')

        # Sleep only for what is left of the 5-second period so that reading
        # and publishing time does not stretch the interval between batches.
        sleep(max(0.0, 5 - (monotonic() - started)))

Publishing records..
Batch published successfully. Number of records: 991
- Record 1-198: ts = 1697770945
- Record 199-396: ts = 1697770946
- Record 397-594: ts = 1697770947
- Record 595-792: ts = 1697770948
- Record 793-991: ts = 1697770949
Batch published successfully. Number of records: 678
- Record 1-135: ts = 1697770952
- Record 136-270: ts = 1697770953
- Record 271-405: ts = 1697770954
- Record 406-540: ts = 1697770955
- Record 541-678: ts = 1697770956
Batch published successfully. Number of records: 793
- Record 1-158: ts = 1697770959
- Record 159-316: ts = 1697770960
- Record 317-474: ts = 1697770961
- Record 475-632: ts = 1697770962
- Record 633-793: ts = 1697770963
Batch published successfully. Number of records: 935
- Record 1-187: ts = 1697770966
- Record 188-374: ts = 1697770967
- Record 375-561: ts = 1697770968
- Record 562-748: ts = 1697770969
- Record 749-935: ts = 1697770970
Batch published successfully. Number of records: 742
- Record 1-148: ts = 1697770972
- Record 1

Batch published successfully. Number of records: 532
- Record 1-106: ts = 1697771182
- Record 107-212: ts = 1697771183
- Record 213-318: ts = 1697771184
- Record 319-424: ts = 1697771185
- Record 425-532: ts = 1697771186
Batch published successfully. Number of records: 808
- Record 1-161: ts = 1697771188
- Record 162-322: ts = 1697771189
- Record 323-483: ts = 1697771190
- Record 484-644: ts = 1697771191
- Record 645-808: ts = 1697771192
Batch published successfully. Number of records: 755
- Record 1-151: ts = 1697771195
- Record 152-302: ts = 1697771196
- Record 303-453: ts = 1697771197
- Record 454-604: ts = 1697771198
- Record 605-755: ts = 1697771199
Batch published successfully. Number of records: 749
- Record 1-149: ts = 1697771201
- Record 150-298: ts = 1697771202
- Record 299-447: ts = 1697771203
- Record 448-596: ts = 1697771204
- Record 597-749: ts = 1697771205
Batch published successfully. Number of records: 695
- Record 1-139: ts = 1697771207
- Record 140-278: ts = 16977712

<BrokerConnection node_id=1 host=118.138.83.246:9092 <connected> [IPv4 ('118.138.83.246', 9092)]> timed out after 30000 ms. Closing connection.
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing metadata
Node 1 connection failed -- refreshing m

Batch published successfully. Number of records: 974
- Record 1-194: ts = 1697771417
- Record 195-388: ts = 1697771418
- Record 389-582: ts = 1697771419
- Record 583-776: ts = 1697771420
- Record 777-974: ts = 1697771421
Batch published successfully. Number of records: 545
- Record 1-109: ts = 1697773282
- Record 110-218: ts = 1697773283
- Record 219-327: ts = 1697773284
- Record 328-436: ts = 1697773285
- Record 437-545: ts = 1697773286
Batch published successfully. Number of records: 675
- Record 1-135: ts = 1697773329
- Record 136-270: ts = 1697773330
- Record 271-405: ts = 1697773331
- Record 406-540: ts = 1697773332
- Record 541-675: ts = 1697773333
Batch published successfully. Number of records: 581
- Record 1-116: ts = 1697773335
- Record 117-232: ts = 1697773336
- Record 233-348: ts = 1697773337
- Record 349-464: ts = 1697773338
- Record 465-581: ts = 1697773339
Batch published successfully. Number of records: 810
- Record 1-162: ts = 1697773341
- Record 163-324: ts = 16977733

Batch published successfully. Number of records: 891
- Record 1-178: ts = 1697773620
- Record 179-356: ts = 1697773621
- Record 357-534: ts = 1697773622
- Record 535-712: ts = 1697773623
- Record 713-891: ts = 1697773624
Batch published successfully. Number of records: 802
- Record 1-160: ts = 1697773627
- Record 161-320: ts = 1697773628
- Record 321-480: ts = 1697773629
- Record 481-640: ts = 1697773630
- Record 641-802: ts = 1697773631
