# Datafaucet

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

In [1]:
from kafka import KafkaConsumer, TopicPartition
from json import loads, dumps
from time import sleep

In [2]:
# Consumer for the 'dfc' topic on the local broker. It runs without a consumer
# group and with auto-commit disabled; message payloads are UTF-8 encoded JSON
# and are decoded into Python objects on receipt.
consumer_settings = dict(
    bootstrap_servers=['kafka:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=False,
    group_id=None,
    value_deserializer=lambda payload: loads(payload.decode('utf-8')),
)
consumer = KafkaConsumer('dfc', **consumer_settings)

In [3]:
# Build one TopicPartition handle per partition id reported for the 'dfc' topic.
p = consumer.partitions_for_topic('dfc')
partitions = [
    TopicPartition(topic='dfc', partition=partition_id)
    for partition_id in p
]
partitions

[TopicPartition(topic='dfc', partition=0)]

In [4]:
# Beginning offset reported by the broker for each partition of the topic.
earliest_offsets = consumer.beginning_offsets(partitions)
earliest_offsets

{TopicPartition(topic='dfc', partition=0): 0}

In [5]:
# Block until the partition that the following cells read (partitions[0]) has a
# non-zero end offset, checking once per second.
#
# A single end_offsets() snapshot is taken per iteration, so the value printed
# while waiting is exactly the one that was tested (no second broker round-trip
# whose result could differ). The snapshot is indexed by partitions[0] rather
# than by "first value in dict order", matching the partition used by the
# seek/poll cells below.
while True:
    latest_offsets = consumer.end_offsets(partitions)
    if latest_offsets[partitions[0]]:
        break
    print(latest_offsets)
    sleep(1)

# Display the snapshot that ended the wait.
latest_offsets

{TopicPartition(topic='dfc', partition=0): 17}

In [6]:
# Rewind the first partition so the next poll starts from its beginning.
first_partition = partitions[0]
consumer.seek_to_beginning(first_partition)

In [7]:
# Fetch at most one record (waiting up to one second) and pretty-print it.
message = consumer.poll(timeout_ms=1000, max_records=1)
cnt = len(message)
if not cnt:
    print('No data in the queue')
else:
    # poll() returns a mapping of partition -> list of records; take the first
    # record of the first partition present.
    record = list(message.values())[0][0]
    print(
        f'[consumer]: topic={record.topic}, partition={record.partition}, '
        f'offset={record.offset}, timestamp={record.timestamp}'
    )
    print('[datafaucet log data]:')
    print(dumps(record.value, indent=2))

[consumer]: topic=dfc, partition=0, offset=0, timestamp=1576632569243
[datafaucet log data]:
{
  "@timestamp": "2019-12-18T01:29:28.899548",
  "severity": "INFO",
  "sid": "0xd5220070213511ea",
  "repohash": 0,
  "reponame": "",
  "username": "jovyan",
  "filepath": "logging.ipynb",
  "funcname": "notebook:cell",
  "message": "info",
  "data": null
}


In [8]:
# Reposition the first partition at offset 7 and display the resulting position.
consumer.beginning_offsets(partitions)
first_partition = partitions[0]
consumer.seek(first_partition, 7)
consumer.position(first_partition)

7

In [9]:
# Fetch at most one record (waiting up to one second), pretty-print it and keep
# its decoded value in `d` for the assertions that follow.
message = consumer.poll(timeout_ms=1000, max_records=1)

cnt = len(message)
# Placeholder used when nothing was received.
d = dict(severity=None, message=None, data=None)

if not cnt:
    print('No data in the queue')
else:
    # poll() returns a mapping of partition -> list of records; take the first
    # record of the first partition present.
    record = list(message.values())[0][0]
    print(
        f'[consumer]: topic={record.topic}, partition={record.partition}, '
        f'offset={record.offset}, timestamp={record.timestamp}'
    )
    print('[datafaucet log data]:')
    print(dumps(record.value, indent=2))
    d = record.value

[consumer]: topic=dfc, partition=0, offset=7, timestamp=1576632572193
[datafaucet log data]:
{
  "@timestamp": "2019-12-18T01:29:32.192604",
  "severity": "WARNING",
  "sid": "0xd5220070213511ea",
  "repohash": 0,
  "reponame": "",
  "username": "jovyan",
  "filepath": "logging.ipynb",
  "funcname": "notebook:cell",
  "message": "custom data + message",
  "data": {
    "test_value": 42
  }
}


In [10]:
# The record held in `d` must carry the expected severity, message and payload.
expected_fields = {
    'severity': 'WARNING',
    'message': 'custom data + message',
    'data': {'test_value': 42},
}
for field, expected_value in expected_fields.items():
    assert d[field] == expected_value