# 1. Producing the data
## Implement an Apache Kafka producer that simulates real-time data transfer from one repository to another.


## Configuration

- When reading the CSV with pandas, specify `dtype=str` (or convert the values back to strings before sending); otherwise pandas will try to infer a type for each column.

## Kafka Producer Methods

In [None]:
# import statements
from time import sleep
from json import dumps
from kafka3 import KafkaProducer
import random
import datetime as dt
import csv
import pandas as pd
import datetime
import pprint as pp
from datetime import timedelta
import json
from json import dumps

# configuration
# IP address of the Kafka broker host; connect_kafka_producer() appends port 9092.
# Exactly one of the assignments below should be active.
# home ip
hostip = "192.168.8.133"
# uni ip
# hostip = "118.138.78.178"

def read_file(file, year=2011):
    """
    Read a CSV file into a pandas DataFrame restricted to one calendar year.

    Every column is loaded as a string so that pandas does not re-type the
    values; only the 'Date' column is then parsed to datetimes so the rows
    can be filtered and sorted chronologically.

    Parameters
    ----------
    file : str, path or file-like object
        Anything accepted by ``pandas.read_csv``. Must contain a 'Date' column.
    year : int, default 2011
        Calendar year of the rows to keep.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'Date' falls in ``year``, sorted by 'Date' ascending.

    Raises
    ------
    KeyError
        If the file has no 'Date' column.
    """
    df = pd.read_csv(file, encoding='utf-8', dtype=str)
    # Parse the dates. The former infer_datetime_format=True argument is
    # deprecated since pandas 2.0 (a consistent format is inferred by
    # default), so it is no longer passed.
    df['Date'] = pd.to_datetime(df['Date'])
    # Keep only the requested year, oldest rows first.
    in_year = df['Date'].dt.year == year
    return df[in_year].sort_values(['Date'], ascending=[True])

def publish_message(producer_instance, topic_name, data):
    """
    Hand ``data`` to ``producer_instance.send`` for ``topic_name`` and print
    the outcome.

    Any Exception raised while sending (or while printing the confirmation)
    is caught and its text is printed instead of being propagated, so this
    function never raises for ordinary errors and always returns None.
    """
    try:
        producer_instance.send(topic_name, data)
        print(f'Message published successfully. Data: {data!s}')
    except Exception as err:
        # Best-effort publishing: report the problem and carry on.
        print('Exception in publishing message.', str(err), sep='\n')
        
def connect_kafka_producer():
    """
    Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Message values are serialised with ``json.dumps`` and encoded to ASCII
    bytes before being sent.

    Returns
    -------
    KafkaProducer or None
        The producer, or None if constructing it raised an Exception (the
        error text is printed in that case).
    """
    try:
        return KafkaProducer(bootstrap_servers=[f'{hostip}:9092'],
                             # dumps turns the payload into a JSON string;
                             # its default ensure_ascii=True escapes
                             # non-ASCII characters, so encoding the result
                             # as ascii is safe
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Returned explicitly rather than from a ``finally`` block: a return
        # inside ``finally`` would also swallow KeyboardInterrupt/SystemExit.
        return None

if __name__ == '__main__':
    # Replays the CSV one date at a time: every 5 seconds all rows for the
    # current date are published as one message, then the date advances by a
    # week. After the last date the replay restarts from the first one.
    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # The connection error was already printed; without a producer every
        # publish attempt in the loop below would fail forever.
        raise SystemExit('Could not connect to Kafka; aborting.')
    topic = 'assignment2b'
    file = read_file('data/produce_data.csv')
    if file.empty:
        # min()/max() on an empty column would fail with an unhelpful error.
        raise SystemExit('No records to publish in data/produce_data.csv.')

    # The first and last dates never change, so compute them once.
    first_date = file['Date'].min()
    end_date = file['Date'].max()
    delta = timedelta(weeks=1)
    start_date = first_date

    try:
        while True:
            # timestamp (whole seconds) attached to every record of the batch
            ts = int(dt.datetime.now().timestamp())
            # select the rows for the current date and stringify every value
            # so the batch is JSON-serialisable
            entries = (file.loc[file['Date'] == start_date, :]
                       .astype(str)
                       .copy())
            entries['ts'] = ts
            entries = entries.to_dict('records')
            # go to next week
            start_date += delta
            # publish message
            publish_message(producer, topic, entries)
            # reset to start from the beginning
            if start_date > end_date:
                start_date = first_date
            # sleep
            sleep(5)
    except KeyboardInterrupt:
        # Interrupting is the only way out of the loop; exit quietly.
        print('Stopping producer..')
    finally:
        # Deliver anything still buffered and release the connection.
        producer.flush()
        producer.close()

Publishing records..
Message published successfully. Data: [{'Store': '1', 'Date': '2011-01-07', 'Temperature': '48.27', 'Fuel_Price': '2.976', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '211.40474', 'Unemployment': '7.742', 'IsHoliday': 'false', 'last_weekly_sales': '1367320.0062122345', 'ts': 1675496520}, {'Store': '32', 'Date': '2011-01-07', 'Temperature': '23.78', 'Fuel_Price': '2.882', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '191.36838', 'Unemployment': '8.818', 'IsHoliday': 'false', 'last_weekly_sales': '955463.8401894569', 'ts': 1675496520}, {'Store': '14', 'Date': '2011-01-07', 'Temperature': '34.32', 'Fuel_Price': '3.193', 'MarkDown1': 'nan', 'MarkDown2': 'nan', 'MarkDown3': 'nan', 'MarkDown4': 'nan', 'MarkDown5': 'nan', 'CPI': '182.59831', 'Unemployment': '8.549', 'IsHoliday': 'false', 'last_weekly_sales': '1623716.4598999023', 'ts': 1675496520}, 