In [1]:
import logging
from sys import stdout

# Console logging for the notebook: a single stdout handler on the consumer logger.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler(stdout)
console_handler.setFormatter(formatter)

logger = logging.getLogger('opensky.spark_consumer')
# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise stack one more handler and print every record twice.
# Remove stdout handlers left behind by a previous run before attaching the new one.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.StreamHandler) and stale_handler.stream is stdout:
        logger.removeHandler(stale_handler)
logger.addHandler(console_handler)
logger.setLevel('DEBUG')


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
import pandas as pd

# --- Kafka: broker address and topic names ---
broker = "localhost:9092"
topic_real_time_states = "real-time-states"
topic_raw_json = "raw_json"
topic_flat_json = "flat_json"
# topic_test = "topic_test"

# --- Cluster endpoints ---
host_name = "cnt7-naya-cdh6"   # host handed to the Impala connection
hive_host = "localhost"
hdfs_host, hdfs_port = "localhost", 8020   # HDFS namenode address used by the writers


In [3]:
import os

# Submit arguments for pyspark: request the Kafka connector package for Spark SQL.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.1 pyspark-shell',
)

In [4]:
# Reuse the active session if there is one, otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName("StructuredRealTimeState")
    .getOrCreate()
)

In [5]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
import json

# Schema of one Kafka message value: a JSON array of state-vector records.
# Every field is nullable (the StructField default, same as StructType.add).
schema = T.ArrayType(
    T.StructType([
        T.StructField(field_name, field_type)
        for field_name, field_type in (
            ("time", T.TimestampType()),
            ("icao24", T.StringType()),
            ("callsign", T.StringType()),
            ("last_contact", T.TimestampType()),
            ("longitude", T.FloatType()),
            ("latitude", T.FloatType()),
            ("baro_altitude", T.FloatType()),
            ("on_ground", T.IntegerType()),
            ("velocity", T.FloatType()),
            ("geo_altitude", T.FloatType()),
            ("squawk", T.StringType()),
            ("position_source", T.IntegerType()),
        )
    ])
)

In [6]:
# df = spark \
#     .read \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", broker) \
#     .option("subscribe", topic_real_time_states) \
#     .option("startingOffsets", "earliest")\
#     .load()\
#     .select(F.from_json(F.col("value").cast("string"), schema).alias("value"))\
#     .select((F.explode("value").alias("value")))\
#     .select("value.*")
    
# #.selectExpr("CAST(value AS STRING)") 



In [7]:
import pyarrow as pa
import re
from datetime import datetime, timedelta
import urllib
from impala.dbapi import connect

def drop_old_partitions(impala_conn, table_name: str, table_src_path: str,
                        partition_name: str, earliest_time_to_keep: datetime):
    """
    Drop every partition of <table_name> that is older than <earliest_time_to_keep>.

    Each directory under <table_src_path> named ``<something>=<timestamp>`` is treated as a
    partition. An expired partition is first dropped through the impala client
    (impala.dbapi) and then its HDFS directory is deleted.

    Args:
        impala_conn: open connection exposing ``cursor()`` (in this notebook the object
            returned by ``impala.dbapi.connect``). It is not closed here.
        table_name: table whose partitions are dropped.
        table_src_path: HDFS directory that holds the partition directories.
        partition_name: partition column name used in the ``alter table`` statement.
        earliest_time_to_keep: partitions strictly older than this datetime are removed.

    Returns:
        None. A failure while dropping/deleting is logged and stops the clean-up of the
        remaining partitions; it is not re-raised.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
    from urllib.parse import unquote

    logger.debug(f'dropping old partitions for table {table_name}, all partitions oldert than {earliest_time_to_keep}')

    fs = pa.hdfs.connect(
            host=hdfs_host,
            port=hdfs_port,
            user='hdfs',
            kerb_ticket=None,
            driver='libhdfs',
            extra_conf=None)

    # group(1) is the whole partition path, group(2) the raw partition value after '='.
    partition_pattern = re.compile(r'(.+\=(.+))')

    # Map decoded partition value -> partition path. Directory names are URL-encoded
    # (e.g. %3A for ':'), so decode them; entries without '=' are not partitions.
    partitions_dict = {}
    for partition_path in fs.ls(table_src_path):
        found = partition_pattern.search(partition_path)
        if found is None:
            continue
        partitions_dict[unquote(found.group(2))] = found.group(1)

    partitions_to_delete = []
    for part_value, part_path in partitions_dict.items():
        try:
            part_time = datetime.strptime(part_value, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # A partition value that is not a timestamp must not abort the whole
            # clean-up (and with it the calling batch); leave that directory alone.
            logger.debug(f'skipping partition with non-timestamp value : {part_path}')
            continue
        if part_time < earliest_time_to_keep:
            partitions_to_delete.append(part_value)

    if not partitions_to_delete:
        return

    crsr = impala_conn.cursor()
    try:
        for part_key in partitions_to_delete:
            # NOTE: the statement is assembled from internal names and HDFS directory
            # names; do not pass untrusted values into this function.
            crsr.execute(f'alter table {table_name} drop if exists partition ({partition_name}="{part_key}");')
            fs.delete(partitions_dict[part_key], recursive=True)
            logger.debug(f'deleted : {partitions_dict[part_key]}')
    except Exception:
        # The original called logger.Error, which does not exist and raised
        # AttributeError from inside the handler, hiding the real failure.
        logger.exception(f'failed while dropping old partitions of {table_name}')
    finally:
        crsr.close()
        
# earliest_time_to_keep = datetime.now() - timedelta(minutes=450)
# table_src_path = '/user/naya/FinalProject/last_hour'
# drop_old_partitions('opensky_network', "states_last_hour", table_src_path, "date_minute", earliest_time_to_keep)

In [8]:
import pyarrow as pa
# from pyhive import hive
from datetime import datetime, timedelta
from impala.dbapi import connect
from os import path
# import ibis.impala.client
# import ibis.impala.api
# import ibis.impala.ddl

# # # Impala connection details:impala_host = 'localhost'
# hdfs_host = 'localhost'
# hdfs_port = 9870

# impala_port = 21050
# opensky_network_db = 'playground'
# username = 'naya'
# password = 'naya'
# host = 'localhost'

# hdfs = ibis.hdfs_connect(
#   host = host, 
#   port = hdfs_port, 
#   protocol = 'webhdfs', 
#   use_https = 'default', 
#   auth_mechanism = 'NOSASL')

# impala_client = ibis.impala.connect(
#   host = host, 
#   port = impala_port, 
#   database = opensky_network_db, 
#   user = username, 
#   password = password, 
#   pool_size = 8, 
#   hdfs_client = hdfs)

# Table name referenced only by the commented-out ibis experiments in this cell.
last_hour_table_name = 'last_hour'
# Database the Impala connection in write_to_impala is opened against.
target_database = 'opensky_network'

# root_data_path = '/user/naya/FinalProject/'
# last_hour_path = path.join(root_data_path, 'last_hour')
# last_day_path = path.join(root_data_path, 'last_day')
# last_week_path = path.join(root_data_path,'last_week')

def write_to_impala(df: DataFrame, epoch_id):
    """
    foreachBatch sink: store one micro-batch on HDFS and keep the Impala tables in sync.

    The batch is appended as parquet to the `last_hour` directory (partitioned by
    `date_minute`) and to the `last_day` directory (partitioned by `date_hour`).
    Afterwards partitions older than 1 hour / 24 hours are dropped and both tables
    (`states_last_hour`, `states_last_day`) get `recover partitions` + `refresh`.

    Args:
        df: the micro-batch DataFrame; must contain a `time` column.
        epoch_id: micro-batch id supplied by Spark, used for logging only.

    An empty batch is logged and otherwise ignored.
    """
    root_data_path = '/user/naya/FinalProject/'
    last_hour_path = path.join(root_data_path, 'last_hour')
    last_day_path = path.join(root_data_path, 'last_day')
    # Same namenode settings drop_old_partitions connects with; the data paths are
    # absolute, so no extra '/' is inserted between authority and path.
    hdfs_root = f'hdfs://{hdfs_host}:{hdfs_port}'

    # Cache before the first action: the batch is consumed by one count and two writes,
    # and without the cache each of them would recompute it from the source.
    df.persist()
    try:
        batch_size = df.count()
        logger.info(f'write_to_hive: epoch_id: {epoch_id} len: {batch_size}')
        if batch_size == 0:
            return

        # write to each table - minutes, hours
        logger.debug(f'trying to write to : {last_hour_path}')
        df.withColumn('date_minute', F.date_trunc('minute', df.time)).write\
                    .mode("append")\
                    .partitionBy('date_minute')\
                    .parquet(f'{hdfs_root}{last_hour_path}')
        logger.debug(f'Trying to write to : {last_day_path}')
        df.withColumn('date_hour', F.date_trunc('hour', df.time)).write\
                    .mode("append")\
                    .partitionBy('date_hour')\
                    .parquet(f'{hdfs_root}{last_day_path}')
    finally:
        # Release the cached batch; otherwise every micro-batch stays cached.
        df.unpersist()

    # Credentials can be overridden through the environment; the defaults keep the
    # notebook's previous behaviour.
    impala_conn = connect(host=host_name, database = target_database,
                          user = os.environ.get('IMPALA_USER', 'naya'),
                          password = os.environ.get('IMPALA_PASSWORD', 'naya'),
                          auth_mechanism = 'NOSASL')
    try:
        drop_old_partitions(impala_conn, 'states_last_hour', last_hour_path,
                                 'date_minute', datetime.now() - timedelta(hours=1))
        drop_old_partitions(impala_conn, 'states_last_day', last_day_path,
                                 'date_hour', datetime.now() - timedelta(hours=24))

        impala_crsr = impala_conn.cursor()
        try:
            # Make Impala pick up the partition directories Spark has just written.
            for table_name in ['states_last_hour', 'states_last_day']:
                impala_crsr.execute(f'alter table {table_name} recover partitions;')
                impala_crsr.execute(f'refresh {table_name};')
        except Exception as ex:
            logger.error(ex)
        finally:
            impala_crsr.close()
    finally:
        # One connection is opened per micro-batch, so it has to be closed per batch too.
        impala_conn.close()
            
#         crsr.execute(f'show partitions {table_name};')
#         logger.debug([d for d, *rest in crsr.fetchall()])
    
#     impala_client.table(last_hour_table_name).drop_partition('date_minute=2019-12-28 12:13:00')
#     impala_client.table(last_hour_table_name).alter()
#     impala_client.table(last_hour_table_name).refresh()




In [9]:
from pyspark.sql import functions as F

# Streaming source: the real-time states topic, starting from the latest offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("subscribe", topic_real_time_states)
    .option("startingOffsets", "latest")
    .load()
)

# Parse each message value with `schema` (an array of structs), emit one row per
# array element, then expand the struct fields into top-level columns.
state_vectors_df = (
    df
    .select(F.from_json(F.col("value").cast("string"), schema).alias("value"))
    .select(F.explode("value").alias("value"))
    .select("value.*")
)

# agg_count = state_vectors_df.withWatermark("time", "1 minute").groupBy("time").count()
# query = agg_count\
#             .writeStream\
#             .outputMode("append")\
#             .format("console") \
#             .start()

# Parquet sink: append the parsed state vectors, partitioned by the hour of `time`.
# TBD - handle target path creation
parquet_path = 'hdfs://cnt7-naya-cdh6.org:8020/FinalProject/hourly'
parquet_checkpoint_path = "/home/naya/parquet_checkpoint"
parquet_write = (
    state_vectors_df
    .withColumn('date_hour', F.date_trunc('hour', state_vectors_df['time']))
    .writeStream
    .outputMode("append")
    .format("parquet")
    .partitionBy('date_hour')
    .option("checkpointLocation", parquet_checkpoint_path)
    .option("path", parquet_path)
    .start()
)



# query.awaitTermination()


In [10]:
# Hand every micro-batch of parsed state vectors to write_to_impala.
hive_write = (
    state_vectors_df.writeStream
    .foreachBatch(write_to_impala)
    .start()
)

2019-12-30 22:31:57,041 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 0 len: 0
2019-12-30 22:32:06,310 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 1 len: 7997
2019-12-30 22:32:06,673 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:32:07,937 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:32:08,215 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:32:08.215108
2019-12-30 22:32:09,590 - opensky.spark_consumer - DEBUG - deleted : /user/naya/FinalProject/last_hour/date_minute=2019-12-30 21%3A08%3A00
2019-12-30 22:32:09,625 - opensky.spark_consumer - DEBUG - deleted : /user/naya/FinalProject/last_hour/date_minute=2019-12-30 21%3A09%3A00
2019-12-30 22:32:09,657 - opensky.spark_consumer - DEBUG - deleted : /user/naya/FinalProject/last_hour/date_minute=2019-12-30 21%3A10%3A00
2019

2019-12-30 22:33:51,283 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 8 len: 8006
2019-12-30 22:33:51,751 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:33:52,333 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:33:52,493 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:33:52.493205
2019-12-30 22:33:52,616 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:33:52.616603
2019-12-30 22:34:06,863 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 9 len: 8045
2019-12-30 22:34:07,364 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:34:08,373 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:34:09,009 - o

2019-12-30 22:37:01,815 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:37:01.815239
2019-12-30 22:37:17,626 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 20 len: 8054
2019-12-30 22:37:18,146 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:37:18,743 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:37:18,873 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:37:18.873676
2019-12-30 22:37:18,995 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:37:18.995288
2019-12-30 22:37:36,918 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 21 len: 8040
2019-12-30 22:37:37,448 - opensky.spark_consumer - DEBUG - trying to write to : /user/na

2019-12-30 22:40:55,383 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:40:55.383544
2019-12-30 22:41:14,240 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 32 len: 7985
2019-12-30 22:41:14,766 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:41:15,362 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:41:15,512 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:41:15.512021
2019-12-30 22:41:15,566 - opensky.spark_consumer - DEBUG - deleted : /user/naya/FinalProject/last_hour/date_minute=2019-12-30 21%3A41%3A00
2019-12-30 22:41:15,671 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:41:15.670801
2019-12-30 22:41:35,954 - opensky.spark_consu

2019-12-30 22:45:21,156 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:45:21.156770
2019-12-30 22:45:21,205 - opensky.spark_consumer - DEBUG - deleted : /user/naya/FinalProject/last_hour/date_minute=2019-12-30 21%3A45%3A00
2019-12-30 22:45:21,309 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:45:21.308900
2019-12-30 22:45:42,624 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 44 len: 8062
2019-12-30 22:45:43,144 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:45:43,723 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:45:43,853 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:45:43.853604
2019-12-30 22:45:43,972 - opensky.spark_cons

2019-12-30 22:49:29,351 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 55 len: 8047
2019-12-30 22:49:29,859 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:49:30,426 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:49:30,559 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_hour, all partitions oldert than 2019-12-30 21:49:30.559308
2019-12-30 22:49:30,684 - opensky.spark_consumer - DEBUG - dropping old partitions for table states_last_day, all partitions oldert than 2019-12-29 22:49:30.684470
2019-12-30 22:49:43,424 - opensky.spark_consumer - INFO - write_to_hive: epoch_id: 56 len: 8088
2019-12-30 22:49:43,940 - opensky.spark_consumer - DEBUG - trying to write to : /user/naya/FinalProject/last_hour
2019-12-30 22:49:44,553 - opensky.spark_consumer - DEBUG - Trying to write to : /user/naya/FinalProject/last_day
2019-12-30 22:49:44,707 -

In [42]:
# query.isActive
# parquet_write.recentProgress
# query.lastProgress

In [35]:
# Report whether the parquet streaming query is still running.
print(parquet_write.isActive)
# print(query.isActive)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41285)
Traceback (most recent call last):
  File "/home/naya/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/naya/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41285)

In [12]:
# `query` (the console aggregation stream) only exists when the commented-out block
# in the streaming cell is enabled; guard the call so a clean Run All does not raise
# NameError.
if 'query' in globals():
    query.stop()

In [46]:
# Stop the parquet streaming query.
parquet_write.stop()

In [34]:
# Show whether the foreachBatch (write_to_impala) query is still running.
hive_write.isActive

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41285)
Traceback (most recent call last):
  File "/home/naya/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/naya/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41285)

In [22]:
# Stop the foreachBatch (write_to_impala) query.
hive_write.stop()

SyntaxError: keyword can't be an expression (<ipython-input-16-7ba5617340db>, line 1)

select date_trunc('hour', time), cast(last_contact as timestamp) from states limit 20;


-- External table over the parquet directory the `parquet_write` stream writes to.
-- Columns follow the fields of the Spark `schema`; `date_hour` is the column the
-- stream partitions by.
CREATE EXTERNAL TABLE playground.states_hourly
(time TIMESTAMP, icao24 STRING, callsign STRING, last_contact TIMESTAMP,
longitude FLOAT, latitude FLOAT, baro_altitude FLOAT, on_ground INT,   
velocity FLOAT, geo_altitude FLOAT, squawk STRING, position_source INT)
PARTITIONED BY (date_hour string)
STORED AS PARQUET 
LOCATION 'hdfs://cnt7-naya-cdh6.org:8020/FinalProject/hourly';


SHOW CREATE TABLE states;