In [1]:
import logging
from sys import stdout

# Console logging for the producer: timestamp, logger name, level, message.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler(stdout)
console_handler.setFormatter(formatter)
# Name the handler so an earlier copy of it can be recognised below.
console_handler.set_name('opensky.producer.console')

logger = logging.getLogger('opensky.producer')
# logging.getLogger returns the same logger object on every call, so running
# this cell again would stack a second console handler and print every record
# twice. Drop the copy attached by a previous run before attaching the new one.
for attached_handler in list(logger.handlers):
    if attached_handler.get_name() == console_handler.get_name():
        logger.removeHandler(attached_handler)
logger.addHandler(console_handler)
logger.setLevel('INFO')

In [2]:
from opensky_api import OpenSkyApi
from kafka import KafkaProducer
import json
from time import sleep
import pyarrow as pa
import os
from datetime import datetime
import shutil

# Kafka topics and brokers
topic_real_time_states = 'real-time-states'
topic_sparse_states = 'sparse_states'
brokers = ['localhost:9092']


def size_mb(x):
    """Convert a size given in bytes to megabytes (1 MB = 1024 * 1024 bytes)."""
    return x / 1024 / 1024

# OpenSky client shared by the whole notebook.
# Credentials can be supplied through the OPENSKY_USERNAME / OPENSKY_PASSWORD
# environment variables so they do not have to live in the notebook; the
# previous values remain as fallbacks so existing runs behave the same.
# NOTE(review): the fallback password is committed in source - rotate it and
# remove the default once the environment variables are set everywhere.
API = OpenSkyApi(
    os.environ.get('OPENSKY_USERNAME', 'livgeni'),
    os.environ.get('OPENSKY_PASSWORD', '1abc23'),
)

from requests.exceptions import ReadTimeout

def open_sky_generator(rate_s: int):
    """Poll the OpenSky API forever and yield every successfully fetched result.

    A ``ReadTimeout`` is logged as a warning and any other exception as an
    error; in both cases nothing is yielded for that round. The generator
    sleeps ``rate_s`` seconds after every attempt, successful or not.

    Args:
        rate_s: Pause between two consecutive API calls, in seconds.

    Yields:
        Whatever ``API.get_states()`` returned for the round.
    """
    while True:
        fetched = False
        snapshot = None
        try:
            snapshot = API.get_states()
            fetched = True
        except ReadTimeout as timeout_error:
            logger.warning(timeout_error)
        except Exception as error:
            logger.error(error)
        # Yield outside the try so errors raised by the consumer are not
        # swallowed by the handlers above.
        if fetched:
            yield snapshot
        sleep(rate_s)


# State-vector fields that are dropped before the data is archived/published.
RED_FIELDS = ['origin_country', 'sensors']
YELLOW_FIELDS = ['time_position', 'heading', 'vertical_rate', 'spi']
IGNORE_FIELDS = RED_FIELDS + YELLOW_FIELDS


def opensky_to_dict(opensky_states):
    """Convert an OpenSky states object into a list of plain dictionaries.

    One dictionary is produced per state vector. Each starts with a ``time``
    entry taken from ``opensky_states.time`` followed by every field listed in
    the vector's ``keys`` that is not in ``IGNORE_FIELDS``. Booleans are turned
    into 0/1 and strings are stripped of surrounding whitespace.

    Args:
        opensky_states: Object exposing ``time`` and ``states``; every item of
            ``states`` exposes ``keys`` and one attribute per key.

    Returns:
        The list of dictionaries. If a conversion error occurs it is logged
        and the vectors converted so far are returned; a ``states`` value of
        ``None`` yields an empty list.
    """
    svdl = []

    try:
        # `or ()` lets a missing/None states attribute behave like "no vectors".
        for sv in opensky_states.states or ():
            svd = {'time': opensky_states.time}
            for key in sv.keys:
                if key in IGNORE_FIELDS:
                    continue
                val = getattr(sv, key)
                if isinstance(val, bool):
                    # Adapt to work with Spark Stream that has 0/1 for bool
                    svd[key] = int(val)
                elif isinstance(val, str):
                    svd[key] = val.strip()
                else:
                    svd[key] = val
            svdl.append(svd)
    except Exception as e:
        # The module-level logger is called `logger`; the former `log.error`
        # raised NameError here and hid the original problem.
        logger.error(e)
    return svdl


        
class OpenskyArchiver:
    """Buffer state-vector JSON in a local file and move it to HDFS when large.

    Every call to :meth:`archive_data` merges the new records into the JSON
    array stored in the current local file. Once that file reaches
    ``local_file_size_threshold`` megabytes it is uploaded to the HDFS archive
    folder, deleted locally, and a new timestamped local file name is chosen.

    NOTE(review): the whole local file is re-read and re-written on every
    call, so the cost of a call grows with the size of the buffered file.
    """

    _BYTES_PER_MB = 1024 * 1024

    def __init__(self):
        """Connect to HDFS and prepare the local and remote folders.

        The local staging folder is deleted (with anything still in it) and
        recreated empty; the HDFS archive folder is created if it is missing.
        """
        # hdfs config
        self.hdfs_config = dict(
            hdfs_host='localhost',
            hdfs_port=8020,
            hdfs_user='hdfs',
            hdfs_driver='libhdfs',
            hdfs_archive_path='/FinalProject/Archive',
        )
        # local file configs
        self.local_files_folder = os.path.abspath('/home/naya/tutorial/open-sky/tmp_storage')
        self.local_files_prefix = 'opensky_state_vectors_json'
        self.local_file_size_threshold = 100  # MB
        self.local_file_path = self.gen_local_file_path()

        # init local and hdfs folders
        # NOTE(review): pa.hdfs.connect is pyarrow's legacy HDFS entry point -
        # confirm it is still available in the installed pyarrow version.
        fs = pa.hdfs.connect(
            host=self.hdfs_config['hdfs_host'],
            port=self.hdfs_config['hdfs_port'],
            user=self.hdfs_config['hdfs_user'],
            kerb_ticket=None,
            driver=self.hdfs_config['hdfs_driver'],
            extra_conf=None)

        # Always start from an empty local folder: an existing one is removed
        # together with any files that were never uploaded.
        if os.path.exists(self.local_files_folder):
            shutil.rmtree(self.local_files_folder)
        os.makedirs(self.local_files_folder)

        # create hdfs folder if not exist
        hdfs_archive_path = self.hdfs_config['hdfs_archive_path']
        if not fs.exists(hdfs_archive_path):
            fs.mkdir(hdfs_archive_path, create_parents=True)

        self.fs = fs

    def gen_time_suffix(self):
        """Return the current local time formatted as ``YYYY-MM-DD_HH_MM_SS``."""
        return datetime.now().strftime("%Y-%m-%d_%H_%M_%S")

    def gen_local_file_path(self):
        """Return a new timestamped ``.json`` path inside the local folder."""
        file_name = f'{self.local_files_prefix}_{self.gen_time_suffix()}.json'
        return os.path.join(self.local_files_folder, file_name)

    def archive_data(self, jsn_str):
        """Append the records in ``jsn_str`` to the current local JSON file.

        The local file holds a single JSON array. Existing content (if any) is
        loaded, extended with the parsed ``jsn_str`` and written back. When
        the file size reaches ``local_file_size_threshold`` MB the file is
        uploaded to HDFS and a new local file is started.

        Args:
            jsn_str: JSON text, expected to be an array of records.

        Raises:
            ValueError: If ``jsn_str`` is not valid JSON. The local file is
                left untouched in that case.
        """
        # Parse before opening the file for writing: opening with 'w'
        # truncates it, so a parse failure after that point would wipe the
        # records buffered so far.
        new_jsn = json.loads(jsn_str)

        path = self.local_file_path
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            with open(path, 'r') as local_file:
                file_jsn = json.load(local_file)
        else:
            file_jsn = []
        file_jsn.extend(new_jsn)

        with open(path, 'w') as local_file:
            json.dump(file_jsn, local_file)

        # test if target file size reached
        if os.path.getsize(path) / self._BYTES_PER_MB >= self.local_file_size_threshold:
            self._upload_and_rotate()

    def _upload_and_rotate(self):
        """Upload the current local file to HDFS, delete it, pick a new name."""
        logger.debug('Uploading to HDFS')
        target_file_name = os.path.split(self.local_file_path)[1]
        target_full_path = f'{self.hdfs_config["hdfs_archive_path"]}/{target_file_name}'
        with open(self.local_file_path, 'rb') as source:
            self.fs.upload(target_full_path, source)
        logger.debug(f'Uploaded to : {target_full_path}')
        # delete the local file
        logger.debug(f'deleting local file {self.local_file_path}')
        os.remove(self.local_file_path)
        # new name for new file
        self.local_file_path = self.gen_local_file_path()
        logger.debug(f'generated new local_file_name :{os.path.split(self.local_file_path)[1]}')

## Save to Archiving Folder in HDFS

1. Append each batch to a local file until the file reaches 100 MB.
2. Once that threshold is reached, upload the file to HDFS using pyarrow and start a new local file.

In [None]:
producer = KafkaProducer(bootstrap_servers=brokers, max_request_size=4299162)

archiver = OpenskyArchiver()
rt_rate_s = 10             # seconds between two real-time snapshots
sparse_counter = 0         # snapshots seen since the last sparse publication
alerts_rate_s = 3 * 60     # seconds between two sparse publications


def _publish(topic, payload, states_time, message_len):
    """Send ``payload`` to ``topic`` and log the outcome when it is known.

    ``producer.send`` is asynchronous: the future it returns has no result
    yet when the call comes back, so reading its ``exception`` attribute
    straight away would miss failures. The outcome is therefore reported
    through callbacks registered on the future.
    """
    def _on_success(_record_metadata):
        logger.debug(f"sent time : {datetime.fromtimestamp(states_time)} ; len of sent message : {message_len}")

    def _on_error(exc):
        logger.error(f'producer send ecxeption: {exc}')

    send_result = producer.send(topic, value=payload)
    send_result.add_callback(_on_success)
    send_result.add_errback(_on_error)


try:
    for states in open_sky_generator(rt_rate_s):
        if states is None:
            logger.warning('Received empty states vector')
            continue

        message_json = json.dumps(opensky_to_dict(states))
        # First archive the message
        archiver.archive_data(message_json)
        # Then send to kafka; encode once and reuse the bytes for both topics
        payload = message_json.encode('utf-8')
        try:
            _publish(topic_real_time_states, payload, states.time, len(message_json))

            # Count this snapshot before checking, so the sparse topic gets
            # one message every alerts_rate_s / rt_rate_s snapshots (counting
            # after the check made the period one snapshot too long).
            sparse_counter += 1
            if (sparse_counter * rt_rate_s) >= alerts_rate_s:
                sparse_counter = 0
                _publish(topic_sparse_states, payload, states.time, len(message_json))
        except Exception as ex:
            logger.error(ex)
finally:
    # Reached when the loop is interrupted or archiving fails: close the
    # producer so it can release its connections.
    producer.close()

    