In [1]:
from kafka import KafkaConsumer

# Connection settings for the demo topic; later cells reuse these names.
topic_name = 'testnew'
group_id = 'test'
bootstrap_servers = ['localhost:9092']

# How many records to print before stopping.
max_messages = 4

# Start from the earliest offset and never commit, so re-running the cell
# replays the same records for this consumer group.
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    group_id=group_id,
    auto_offset_reset='earliest',
    enable_auto_commit=False)

try:
    # NOTE: no consumer_timeout_ms is configured, so iteration waits for
    # records; interrupt the cell if the topic holds fewer than max_messages.
    for i, message in enumerate(consumer):
        print(message.value)
        if i == max_messages - 1:
            # Leave the loop explicitly instead of relying on iteration over
            # an already-closed consumer to terminate it.
            break
finally:
    # Always release the connection, including when the cell is interrupted.
    consumer.close()

b'{"id": "1aa07a59-ce40-4248-af6a-9273d71693f1", "name": "Isabella Mitchell", "email": "brianbowman@example.org", "date_time": "2024-05-22T07:20:12.383027", "country": "Turkmenistan", "company": "Brown, Hernandez and King", "job": "English as a second language teacher", "phone": "242-707-1499x248", "sentence": "Thousand million different author say term.", "number": 93}'
b'{"id": "036efd9a-ac47-490d-95f0-4c52ab8d9fee", "name": "Denise Williams", "email": "christina48@example.com", "date_time": "2024-02-10T15:11:55.355994", "country": "Namibia", "company": "Freeman-Harris", "job": "Television camera operator", "phone": "696.429.5574", "sentence": "Avoid lot travel method race require whole.", "number": 41}'
b'{"id": "5192d616-bc41-4726-b0ef-f64e0fe53fd6", "name": "Andrea White", "email": "jacksonjason@example.org", "date_time": "2024-07-01T13:08:46.923739", "country": "Maldives", "company": "Wilkins Group", "job": "Environmental consultant", "phone": "312.769.8460", "sentence": "Leg soo

In [7]:
import json
# Decode the raw bytes payload of `message` into text (bytes.decode defaults to UTF-8).
# NOTE(review): `message` is not defined in this cell; presumably it is the loop
# variable left over from a consumer loop in another cell — confirm, because this
# cell raises NameError on a fresh kernel unless that cell ran first.
a = message.value.decode()

# Parse the decoded text as JSON; as the cell's last expression the parsed
# object is what gets displayed.
json.loads(a)

{'key': '4043ab72-91cd-47dd-9d1b-8c56a5df8e1b',
 'value': {'id': '4043ab72-91cd-47dd-9d1b-8c56a5df8e1b',
  'name': 'Charles Kemp',
  'email': 'abryant@example.net',
  'date': '2024-06-06T04:55:32.828762',
  'country': 'Marshall Islands',
  'company': 'Hernandez, Guerrero and Stevens',
  'job': 'Scientist, audiological',
  'phone': '(681)644-4927',
  'sentence': 'Yourself fear address recent adult.',
  'number': 25}}

In [17]:
from pyspark.sql.types import *
import pandas as pd

class DataUtil:
    """Helpers that sample one Kafka record and infer a column schema from it."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _get_one_kafka_record(topic_name, bootstrap_servers, group_id=None, consumer_timeout_ms=10000):
        """Read a single record from a Kafka topic and return its value as text.

        Args:
            topic_name: Topic to read from.
            bootstrap_servers: Broker address list passed to ``KafkaConsumer``.
            group_id: Consumer group; falls back to ``'read_one_record'`` when falsy.
            consumer_timeout_ms: How long iteration may wait for a record before
                giving up. Pass ``None`` to wait indefinitely (the consumer's
                own default).

        Returns:
            The UTF-8 decoded value of the first record that has a value, or
            ``None`` when no such record was obtained.
        """
        if not group_id:
            group_id = 'read_one_record'

        consumer_config = dict(
            bootstrap_servers=bootstrap_servers,
            group_id=group_id,
            auto_offset_reset='earliest',
            enable_auto_commit=False)
        if consumer_timeout_ms is not None:
            # Without a timeout the iterator below never finishes on an empty
            # topic, so the "Not get" path could not be reached.
            consumer_config['consumer_timeout_ms'] = consumer_timeout_ms

        consumer = KafkaConsumer(topic_name, **consumer_config)
        try:
            for i, c in enumerate(consumer):
                # Skip records that carry no payload (value is None) instead of
                # failing on .decode().
                if c is not None and c.value is not None:
                    # this is real string in one record
                    return c.value.decode('utf-8')
                if i == 10:
                    # Give up after a handful of payload-less records.
                    break
            print("Not get")
            return None
        finally:
            consumer.close()

    @staticmethod
    def _infer_kafka_data_schema(input_topic, bootstrap_servers, group_id=None, return_engine='flink'):
        """Infer a column schema from the ``'value'`` object of one Kafka record.

        One record is fetched with ``_get_one_kafka_record``, parsed as JSON, and
        its ``'value'`` entry is flattened with ``pandas.json_normalize``; the
        resulting column dtypes are mapped to engine-specific types.

        Args:
            input_topic: Topic to sample.
            bootstrap_servers: Broker address list.
            group_id: Optional consumer group forwarded to the record reader.
            return_engine: ``'flink'`` returns a ``{column: SQL type name}`` dict;
                any other value returns a Spark ``StructType`` of nullable fields.

        Returns:
            The schema in the requested form, or ``None`` when no record was read.

        Raises:
            json.JSONDecodeError: If the record is not valid JSON.
            KeyError: If the parsed record has no ``'value'`` key.
        """
        # todo: for spark and pyflink schema is different, change it.
        kafka_record = DataUtil._get_one_kafka_record(input_topic, bootstrap_servers, group_id=group_id)
        if not kafka_record:
            print("Couldn't get one record from kafka topic: {}".format(input_topic))
            return None

        # based on record to get value, and it's schema
        record_json = json.loads(kafka_record)
        value_json = record_json['value']

        df = pd.json_normalize(value_json)

        # NOTE(review): int64 columns are mapped to 32-bit INT / IntegerType in
        # both branches; values beyond 32 bits would need BIGINT / LongType.
        # Left unchanged to keep existing results stable — confirm with consumers.
        if return_engine == 'flink':
            schema = {}
            for col, dtype in zip(df.columns, df.dtypes):
                if dtype == 'int64':
                    schema[col] = "INT"
                elif dtype == 'float64':
                    schema[col] = "DOUBLE"
                elif dtype == 'bool':
                    schema[col] = "BOOLEAN"
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    schema[col] = "TIMESTAMP"
                else:
                    schema[col] = "STRING"
            return schema

        # convert to structure type for spark
        field_list = []
        for col, dtype in zip(df.columns, df.dtypes):
            if dtype == 'int64':
                spark_type = IntegerType()
            elif dtype == 'float64':
                # float64 is double precision: DoubleType matches the Flink
                # "DOUBLE" mapping above, whereas FloatType is only 32-bit.
                spark_type = DoubleType()
            elif dtype == 'bool':
                spark_type = BooleanType()
            else:
                spark_type = StringType()
            # Every inferred field is declared nullable.
            field_list.append(StructField(col, spark_type, True))
        return StructType(field_list)

# Infer the Flink column schema for the topic defined earlier in the notebook.
# `bootstrap_servers` is reused as-is from that earlier cell.
input_topic = topic_name

DataUtil._infer_kafka_data_schema(input_topic, bootstrap_servers, return_engine='flink')


{'id': 'STRING',
 'name': 'STRING',
 'email': 'STRING',
 'date': 'STRING',
 'country': 'STRING',
 'company': 'STRING',
 'job': 'STRING',
 'phone': 'STRING',
 'sentence': 'STRING',
 'number': 'INT',
 'timestamp': 'DOUBLE'}