In [None]:
# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StructType, StructField, TimestampType, StringType, LongType
import pyspark

In [None]:
# Utility functions for connecting and mounting AWS S3 Bucket.
# NOTE(review): `get_aws_credentials`, called in a later cell, is not defined in this
# notebook and is presumably provided by this helper notebook - confirm.
%run "/Users/neeraj.adsul@gmail.com/aws_connector"

In [None]:
# Data cleaning functions for transforming the pinterest dataframes.
# NOTE(review): `clean_pin_dataframe`, `clean_geo_dataframe` and `clean_user_dataframe`,
# called in a later cell, are presumably provided by this helper notebook - confirm.
%run "/Users/neeraj.adsul@gmail.com/pinterest_data_cleaning"

In [None]:
# Load the AWS credentials from `authentication_credentials.csv`.
# ACCESS_KEY and SECRET_KEY are passed to the Kinesis reader in `build_topical_stream`;
# ENCODED_SECRET_KEY is not used by any cell visible in this notebook.
ACCESS_KEY, ENCODED_SECRET_KEY, SECRET_KEY = get_aws_credentials(credentials='authentication_credentials.csv')

## Schema Definitions for Kinesis Streaming Source Transformation 

In [None]:
# Schema for topic `pin`: a nullable long `index` key followed by nullable string columns.
pin_schema = StructType(
    [StructField("index", LongType(), True)]
    + [
        StructField(column, StringType(), True)
        for column in (
            "unique_id",
            "title",
            "description",
            "poster_name",
            "follower_count",
            "tag_list",
            "is_image_or_video",
            "image_src",
            "downloaded",
            "save_location",
            "category",
        )
    ]
)

# Schema for topic `geo`: a nullable long `ind` key followed by nullable string columns.
geo_schema = StructType(
    [StructField("ind", LongType(), True)]
    + [
        StructField(column, StringType(), True)
        for column in ("timestamp", "latitude", "longitude", "country")
    ]
)

# Schema for topic `user`: a nullable long `ind` key followed by nullable string columns.
user_schema = StructType(
    [StructField("ind", LongType(), True)]
    + [
        StructField(column, StringType(), True)
        for column in ("first_name", "last_name", "age", "date_joined")
    ]
)


In [None]:
def build_topical_stream(topic: str, schema: StructType, *,
                         region: str = 'us-east-1',
                         initial_position: str = 'earliest'):
    """Create a streaming dataframe for the given topic.

    Reads the Kinesis stream named `streaming-0a1d8948160f-<topic>`, casts each
    record's `data` payload to a string, parses it as JSON with `schema`, and
    expands the parsed struct into top-level columns.

    topic: topic name, `pin` or `geo` or `user`
    schema: schema specification for the topic to correctly transform streamed
        data into a Spark dataframe.
    region: AWS region of the Kinesis stream (keyword-only, defaults to
        `us-east-1`).
    initial_position: value for the Kinesis source's `initialPosition` option
        (keyword-only, defaults to `earliest`).

    returns:
        Spark streaming dataframe with one column per field of `schema`

    Relies on the notebook globals `spark`, `ACCESS_KEY` and `SECRET_KEY`.
    """
    stream_name = f'streaming-0a1d8948160f-{topic}'

    # Raw Kinesis records; the message body arrives in the `data` column.
    raw_records = (
        spark.readStream
        .format('kinesis')
        .option('streamName', stream_name)
        .option('initialPosition', initial_position)
        .option('region', region)
        .option('awsAccessKey', ACCESS_KEY)
        .option('awsSecretKey', SECRET_KEY)
        .load()
    )

    # Decode the payload, parse it against the schema, then flatten the struct.
    return (
        raw_records
        .selectExpr('cast (data as STRING) jsonData')
        .select(from_json('jsonData', schema=schema).alias('df'))
        .select('df.*')
    )

def write_stream_delta_table(topic):
    """Placeholder for writing a topic's stream to a Delta table; currently a no-op.

    topic: topic name (ignored by this stub).

    returns:
        None
    """
    # TODO: not implemented yet - the notebook writes the streams inline instead.
    return None

### Build and clean the streaming data pipeline for the three topics

In [None]:
# One raw streaming dataframe per topic, each parsed with its matching schema.
df_pin = build_topical_stream(topic='pin', schema=pin_schema)
df_geo = build_topical_stream(topic='geo', schema=geo_schema)
df_user = build_topical_stream(topic='user', schema=user_schema)

In [None]:
# Apply the topic-specific cleaning function to each raw stream.
df_pin_clean, df_geo_clean, df_user_clean = (
    clean_pin_dataframe(df_pin),
    clean_geo_dataframe(df_geo),
    clean_user_dataframe(df_user),
)

### Write the clean stream to Delta tables

In [None]:
# Start one append-mode streaming write per topic into its own Delta table.
clean_streams = {'pin': df_pin_clean, 'geo': df_geo_clean, 'user': df_user_clean}
for topic, df in clean_streams.items():
    table_name = f'0a1d8948160f_{topic}_table'
    # Every streaming query needs its own checkpoint directory: sharing one
    # location makes the queries overwrite each other's offset/commit logs.
    # Checkpoints written earlier to the shared parent directory are not reused.
    checkpoint_location = f'/tmp/kinesis/0a1d8948160f/_checkpoints/{topic}/'
    df.writeStream \
        .format('delta') \
        .outputMode('append') \
        .option('checkpointLocation', checkpoint_location) \
        .option('region', 'us-east-1') \
        .table(table_name)