In [0]:
import urllib
import urllib.parse

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Location of the Delta table that stores the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the credentials table into a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both credential columns with a single Spark job instead of one
# collect() per column, and fail with a clear message when the table is
# empty rather than with an opaque IndexError.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    raise ValueError(f"No AWS credentials found in Delta table at {delta_table_path}")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key; safe="" means "/" is percent-encoded as well
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")


In [0]:
%sql
-- Disable format checks during the reading of Delta tables
-- NOTE(review): presumably required by the reads in this notebook; confirm it is still needed.
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [0]:
from pyspark.sql.functions import explode, map_keys, col

def get_kenesis_stream(stream_name, region='us-east-1', initial_position='earliest'):
    '''Opens a Kinesis stream as a streaming DataFrame of string payloads.

    Args:
        stream_name: Name of the Kinesis stream, passed as the 'streamName' option.
        region: AWS region passed as the 'region' option. Defaults to
            'us-east-1', the value that used to be hard-coded.
        initial_position: Value passed as the 'initialPosition' option.
            Defaults to 'earliest', the value that used to be hard-coded.

    Returns:
        The loaded stream reduced to a single `data` column cast to STRING.

    Relies on the notebook-level `spark` session and on the ACCESS_KEY and
    SECRET_KEY variables defined earlier in the notebook.
    '''
    raw_stream = (
        spark.readStream
        .format('kinesis')
        .option('streamName', stream_name)
        .option('initialPosition', initial_position)
        .option('region', region)
        .option('awsAccessKey', ACCESS_KEY)
        .option('awsSecretKey', SECRET_KEY)
        .load()
    )

    # Keep only the record payload, as text, so it can be parsed as JSON later
    return raw_stream.selectExpr("CAST(data as STRING)")

# Open one streaming DataFrame per Kinesis stream (pin, geo and user records)
pin_df, geo_df, user_df = map(
    get_kenesis_stream,
    (
        'streaming-0ecac53030fd-pin',
        'streaming-0ecac53030fd-geo',
        'streaming-0ecac53030fd-user',
    ),
)


In [0]:
# Preview the raw pin stream: a single `data` column holding the JSON payload as text
display(pin_df)

data
"{""index"":7528,""unique_id"":""fbe53c66-3442-4773-b19e-d3ec6f54dddf"",""title"":""No Title Data Available"",""description"":""No description available Story format"",""poster_name"":""User Info Error"",""follower_count"":""User Info Error"",""tag_list"":""N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"",""is_image_or_video"":""multi-video(story page format)"",""image_src"":""Image src error."",""downloaded"":0,""save_location"":""Local save in /data/mens-fashion"",""category"":""mens-fashion""}"
"{""index"":2863,""unique_id"":""9bf39437-42a6-4f02-99a0-9a0383d8cd70"",""title"":""25 Super Fun Summer Crafts for Kids - Of Life and Lisa"",""description"":""Keep the kids busy this summer with these easy diy crafts and projects. Creative and…"",""poster_name"":""Of Life & Lisa | Lifestyle Blog"",""follower_count"":""124k"",""tag_list"":""Summer Crafts For Kids,Fun Crafts For Kids,Summer Kids,Toddler Crafts,Crafts To Do,Diy For Kids,Summer Snow,Diys For Summer,Craft Ideas For Girls"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/b3/bc/e2/b3bce2964e8c8975387b39660eed5f16.jpg"",""downloaded"":1,""save_location"":""Local save in /data/diy-and-crafts"",""category"":""diy-and-crafts""}"
"{""index"":5730,""unique_id"":""1e1f0c8b-9fcf-460b-9154-c775827206eb"",""title"":""Island Oasis Coupon Organizer"",""description"":""Description Coupon Organizer in a fun colorful fabric -island oasis, Great Size for the \""basic\"" couponer - holds up to 500 coupons with ease, and is made long enough so that you… "",""poster_name"":""Consuelo Aguirre"",""follower_count"":""0"",""tag_list"":""Grocery Items,Grocery Coupons,Care Organization,Coupon Organization,Extreme Couponing,Couponing 101,Life Binder,Save My Money,Love Coupons"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/65/bb/ea/65bbeaf458907bb079317d8303c4fa0e.jpg"",""downloaded"":1,""save_location"":""Local save in /data/finance"",""category"":""finance""}"
"{""index"":8304,""unique_id"":""5b6d0913-25e4-43ab-839d-85d5516f78a4"",""title"":""The #1 Reason You’re Not His Priority Anymore - Matthew Coast"",""description"":""#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself"",""poster_name"":""Commitment Connection"",""follower_count"":""51k"",""tag_list"":""Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png"",""downloaded"":1,""save_location"":""Local save in /data/quotes"",""category"":""quotes""}"
"{""index"":8731,""unique_id"":""ea760f71-febf-4023-b592-d17396659039"",""title"":""20 Koi Fish Tattoos For Lucky Men"",""description"":""Koi fish tattoos are a popular choice for men who want to make a statement, thanks to their rich symbolism and bold design."",""poster_name"":""TheTrendSpotter"",""follower_count"":""211k"",""tag_list"":""Dr Tattoo,Wörter Tattoos,Pisces Tattoos,Tatoo Art,Dream Tattoos,Dope Tattoos,Mini Tattoos,Finger Tattoos,Body Art Tattoos"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/8a/0c/0a/8a0c0a7b6236565c519acd41ad1a52c0.jpg"",""downloaded"":1,""save_location"":""Local save in /data/tattoos"",""category"":""tattoos""}"
"{""index"":1313,""unique_id"":""44662045-e891-4821-8a19-ebe7eedd371a"",""title"":""Liquid Lash Extensions Mascara"",""description"":""Instantly create the look of lash extensions with this award-winning, best-selling mascara that won't clump, flake or smudge. Available in 3 shades!"",""poster_name"":""Thrive Causemetics"",""follower_count"":""43k"",""tag_list"":""N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"",""is_image_or_video"":""video"",""image_src"":""https://i.pinimg.com/videos/thumbnails/originals/69/84/e2/6984e20f3e262098fa9c0614c3453254.0000001.jpg"",""downloaded"":1,""save_location"":""Local save in /data/beauty"",""category"":""beauty""}"
"{""index"":4315,""unique_id"":""21b59ba9-829d-4c33-8c27-4cd4c56d26b8"",""title"":""Podcasts for Teachers or Parents of Teenagers"",""description"":""Podcasts for Teachers or Parents of Teenagers: Teaching teens middle school and high school can feel joyful and rewarding most days, but can also frustrate you with one challeng… "",""poster_name"":""Math Giraffe"",""follower_count"":""25k"",""tag_list"":""Middle School Classroom,High School Students,High School Teachers,Middle School Tips,High School Counseling,Ela Classroom,High School Science,Future Classroom,Google Classroom"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/50/19/31/501931a27ee4d076658980851b995b2c.jpg"",""downloaded"":1,""save_location"":""Local save in /data/education"",""category"":""education""}"
"{""index"":10794,""unique_id"":""c4bd2577-a7bb-4409-bb7a-17d5ed7e1cf1"",""title"":""TireBuyer"",""description"":""Nissan GT-R. Sick."",""poster_name"":""Ray Uyemura"",""follower_count"":""437"",""tag_list"":""Lowrider,Old Vintage Cars,Antique Cars,Austin Martin,Nissan Gtr Black,Jaguar,1959 Cadillac,Cadillac Ct6,Old School Cars"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/0d/29/9f/0d299f3df020395aa7ce8387f40fbeed.jpg"",""downloaded"":1,""save_location"":""Local save in /data/vehicles"",""category"":""vehicles""}"
"{""index"":5494,""unique_id"":""8fb2af68-543b-4639-8119-de33d28706ed"",""title"":""Dave Ramsey's 7 Baby Steps: What Are They And Will They Work For You"",""description"":""If you love budgeting, make sure to give Dave Ramsey's 7 Baby Steps a try. Follow these steps to begin your debt snowball, build an emergency fund, invest and reach riches. I ca… "",""poster_name"":""Living Low Key | Save Money, Make Money, & Frugal Living"",""follower_count"":""26k"",""tag_list"":""Financial Peace,Financial Tips,Saving Money Quotes,Total Money Makeover,Budgeting Finances,Money Management,Wealth Management,Personal Finance,Making Ideas"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/1e/9d/90/1e9d906e4e150e3b95187f3b76ea7c71.png"",""downloaded"":1,""save_location"":""Local save in /data/finance"",""category"":""finance""}"
"{""index"":5069,""unique_id"":""b75b6f87-deb3-444f-b29e-ce9161b2df49"",""title"":""The Vault: Curated & Refined Wedding Inspiration"",""description"":""Sacramento California Wedding 2 Chic Events & Design Jodi Yorston Photography Wilson Vineyards Barn Miosa Couture Yellow Barn Vineyard Outdoor Candles DIY"",""poster_name"":""Style Me Pretty"",""follower_count"":""6M"",""tag_list"":""60th Anniversary Parties,Anniversary Decorations,Golden Anniversary,25th Wedding Anniversary,Anniversary Pictures,Anniversary Ideas,Birthday Decorations,Event Planning Design,Event Design"",""is_image_or_video"":""image"",""image_src"":""https://i.pinimg.com/originals/7e/45/90/7e45905fefa36347e83333fd6d091140.jpg"",""downloaded"":1,""save_location"":""Local save in /data/event-planning"",""category"":""event-planning""}"


In [0]:
# Preview the raw geo stream: a single `data` column holding the JSON payload as text
display(geo_df)

data
"{""ind"":7528,""timestamp"":""2020-08-28 03:52:47"",""latitude"":-89.9787,""longitude"":-173.293,""country"":""Albania""}"
"{""ind"":2863,""timestamp"":""2020-04-27 13:34:16"",""latitude"":-5.34445,""longitude"":-177.924,""country"":""Armenia""}"
"{""ind"":5730,""timestamp"":""2021-04-19 17:37:03"",""latitude"":-77.015,""longitude"":-101.437,""country"":""Colombia""}"
"{""ind"":8304,""timestamp"":""2019-09-13 04:50:29"",""latitude"":-28.8852,""longitude"":-164.87,""country"":""French Guiana""}"
"{""ind"":8731,""timestamp"":""2020-07-17 04:39:09"",""latitude"":-83.104,""longitude"":-171.302,""country"":""Aruba""}"
"{""ind"":1313,""timestamp"":""2018-06-26 02:39:25"",""latitude"":77.0447,""longitude"":61.9119,""country"":""Maldives""}"
"{""ind"":4315,""timestamp"":""2019-12-15 03:51:28"",""latitude"":-45.8508,""longitude"":66.1003,""country"":""Cote d'Ivoire""}"
"{""ind"":10794,""timestamp"":""2022-01-01 02:26:50"",""latitude"":-89.5236,""longitude"":-154.567,""country"":""Cocos (Keeling) Islands""}"
"{""ind"":5494,""timestamp"":""2021-07-21 02:02:35"",""latitude"":-82.6768,""longitude"":-129.202,""country"":""Bulgaria""}"
"{""ind"":5069,""timestamp"":""2021-03-20 09:32:44"",""latitude"":-63.0063,""longitude"":-157.474,""country"":""Azerbaijan""}"


In [0]:
# Preview the raw user stream: a single `data` column holding the JSON payload as text
display(user_df)

data
"{""ind"":7528,""first_name"":""Abigail"",""last_name"":""Ali"",""age"":20,""date_joined"":""2015-10-24 11:23:51""}"
"{""ind"":2863,""first_name"":""Dylan"",""last_name"":""Holmes"",""age"":32,""date_joined"":""2016-10-23 14:06:51""}"
"{""ind"":5730,""first_name"":""Rachel"",""last_name"":""Davis"",""age"":36,""date_joined"":""2015-12-08 20:02:43""}"
"{""ind"":8304,""first_name"":""Charles"",""last_name"":""Berry"",""age"":25,""date_joined"":""2015-12-28 04:21:39""}"
"{""ind"":8731,""first_name"":""Andrea"",""last_name"":""Alexander"",""age"":21,""date_joined"":""2015-11-10 09:27:42""}"
"{""ind"":1313,""first_name"":""Brittany"",""last_name"":""Jones"",""age"":32,""date_joined"":""2016-04-02 03:51:23""}"
"{""ind"":4315,""first_name"":""Michelle"",""last_name"":""Prince"",""age"":36,""date_joined"":""2015-12-20 16:38:13""}"
"{""ind"":10794,""first_name"":""Thomas"",""last_name"":""Turner"",""age"":34,""date_joined"":""2016-12-22 00:02:02""}"
"{""ind"":5494,""first_name"":""Anne"",""last_name"":""Allen"",""age"":27,""date_joined"":""2015-12-16 15:20:05""}"
"{""ind"":5069,""first_name"":""Amanda"",""last_name"":""Ball"",""age"":25,""date_joined"":""2016-01-13 17:36:30""}"


### Schema definitions of pin_data

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType
from pyspark.sql.functions import from_json, map_keys, col

# Schema of one pin record, listed as (field name, Spark type) pairs
pin_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("index", IntegerType()),
        ("unique_id", StringType()),
        ("title", StringType()),
        ("description", StringType()),
        ("poster_name", StringType()),
        ("follower_count", StringType()),
        ("tag_list", StringType()),
        ("is_image_or_video", StringType()),
        ("image_src", StringType()),
        ("downloaded", IntegerType()),
        ("save_location", StringType()),
        ("category", StringType()),
    ]
])

# Parse the JSON payload into a struct, then promote every schema field to a
# top-level column of the same name
pin_parsed_df = (
    pin_df
    .select(from_json(col("data"), pin_schema).alias('data'))
    .selectExpr(*[f'data.{name} as {name}' for name in pin_schema.fieldNames()])
)

# Show the resulting DataFrame
# display(pin_parsed_df)

### Schema definitions of geo_data

In [0]:
from pyspark.sql.functions import from_json, map_keys, col

# Schema of one geolocation record, expressed as a StructType
geo_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("ind", IntegerType()),
        ("timestamp", TimestampType()),
        ("latitude", FloatType()),
        ("longitude", FloatType()),
        ("country", StringType()),
    ]
])

# Parse the JSON payload into a struct, then promote every schema field to a
# top-level column of the same name
geo_parsed_df = (
    geo_df
    .select(from_json(col("data"), geo_schema).alias('data'))
    .selectExpr(*[f'data["{name}"] as {name}' for name in geo_schema.fieldNames()])
)

# Preview the parsed geolocation stream
display(geo_parsed_df)

ind,timestamp,latitude,longitude,country
7528,2020-08-28T03:52:47.000+0000,-89.9787,-173.293,Albania
2863,2020-04-27T13:34:16.000+0000,-5.34445,-177.924,Armenia
5730,2021-04-19T17:37:03.000+0000,-77.015,-101.437,Colombia
8304,2019-09-13T04:50:29.000+0000,-28.8852,-164.87,French Guiana
8731,2020-07-17T04:39:09.000+0000,-83.104,-171.302,Aruba
1313,2018-06-26T02:39:25.000+0000,77.0447,61.9119,Maldives
4315,2019-12-15T03:51:28.000+0000,-45.8508,66.1003,Cote d'Ivoire
10794,2022-01-01T02:26:50.000+0000,-89.5236,-154.567,Cocos (Keeling) Islands
5494,2021-07-21T02:02:35.000+0000,-82.6768,-129.202,Bulgaria
5069,2021-03-20T09:32:44.000+0000,-63.0063,-157.474,Azerbaijan


### Schema definitions of user_data

In [0]:
from pyspark.sql.functions import from_json, map_keys, col

# Schema of one user record, expressed as a StructType.
# NOTE(review): the sample payloads carry `age` as a JSON number but it is read
# as a string here - confirm whether IntegerType was intended.
user_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("ind", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("age", StringType()),
        ("date_joined", TimestampType()),
    ]
])

# Parse the JSON payload into a struct, then promote every schema field to a
# top-level column of the same name
user_parsed_df = (
    user_df
    .select(from_json(col("data"), user_schema).alias('data'))
    .selectExpr(*[f'data["{name}"] as {name}' for name in user_schema.fieldNames()])
)

# Preview the parsed user stream
display(user_parsed_df)

ind,first_name,last_name,age,date_joined
7528,Abigail,Ali,20,2015-10-24T11:23:51.000+0000
2863,Dylan,Holmes,32,2016-10-23T14:06:51.000+0000
5730,Rachel,Davis,36,2015-12-08T20:02:43.000+0000
8304,Charles,Berry,25,2015-12-28T04:21:39.000+0000
8731,Andrea,Alexander,21,2015-11-10T09:27:42.000+0000
1313,Brittany,Jones,32,2016-04-02T03:51:23.000+0000
4315,Michelle,Prince,36,2015-12-20T16:38:13.000+0000
10794,Thomas,Turner,34,2016-12-22T00:02:02.000+0000
5494,Anne,Allen,27,2015-12-16T15:20:05.000+0000
5069,Amanda,Ball,25,2016-01-13T17:36:30.000+0000


### Cleaning of pin_df

In [0]:
# pyspark functions
from pyspark.sql.functions import *

# Cleans the DataFrame that contains information about Pinterest posts.
# Creates copy DataFrame for cleaning and drops duplicates
# df_pin_cleaned = pin_parsed_df.dropDuplicates()

def add_nulls_to_dataframe_column(dataframe, column, value_to_replace):
    '''Returns dataframe with entries of column that match the LIKE pattern value_to_replace set to null'''
    current_value = col(column)
    is_placeholder = current_value.like(value_to_replace)
    # Matching entries become null; every other entry keeps its current value
    nulled_value = when(is_placeholder, None).otherwise(current_value)
    return dataframe.withColumn(column, nulled_value)


# Start from the parsed pin stream with duplicate rows dropped, mirroring the
# geo and user cleaning cells. Without this assignment df_pin_cleaned is
# undefined on a fresh kernel and the loop below raises NameError.
df_pin_cleaned = pin_parsed_df.dropDuplicates()

# replace empty entries and entries with no relevant data in each column with Nones
# column names mapped to the SQL LIKE pattern whose matches become null
columns_and_values_for_null = {
    "description": "No description available%",
    "follower_count": "User Info Error",
    "image_src": "Image src error.",
    "poster_name": "User Info Error",
    "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "title": "No Title Data Available"
}
# loop through dictionary, calling function with dictionary values as arguments
for key, value in columns_and_values_for_null.items():
    df_pin_cleaned = add_nulls_to_dataframe_column(df_pin_cleaned, key, value)

# Expand the "k" / "M" suffixes of follower_count so every entry is a plain number
# NOTE(review): a fractional value such as "1.5M" would not expand correctly - confirm the feed never sends one.
df_pin_cleaned = df_pin_cleaned.withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
df_pin_cleaned = df_pin_cleaned.withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
# cast follower_count column to integer type
df_pin_cleaned = df_pin_cleaned.withColumn("follower_count", col("follower_count").cast('int'))
# convert save_location column to include only the save location path
df_pin_cleaned = df_pin_cleaned.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
# rename the index column to ind
df_pin_cleaned = df_pin_cleaned.withColumnRenamed("index", "ind")
# reorder columns; "downloaded" is not listed, so it is dropped here
new_pin_column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category"
]
df_pin_cleaned = df_pin_cleaned.select(new_pin_column_order)
display(df_pin_cleaned)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
1864,6f1951f0-63be-4c4f-8d21-e4995217f69e,120 Christmas Decorations from the Dollar Store,Love Christmas decorations but hate spending a lot bunch of money? Check out some of these budget DIY decorations you can easily make from the dollar store!,42000.0,Caroline|CarolineVencil.com | Saving & Making Money | Pro Blogger,"Diy Snowman Decorations,Christmas Candle Decorations,Diy Christmas Ornaments,Christmas Ideas,Christmas Christmas,Snowman Ornaments,Christmas Diy Gifts,Vase Decorations,Diy Christmas Decorations For Home",image,https://i.pinimg.com/originals/30/85/21/3085215db77e55770202724268465490.jpg,/data/christmas,christmas
8304,5b6d0913-25e4-43ab-839d-85d5516f78a4,The #1 Reason You’re Not His Priority Anymore - Matthew Coast,#lovequotes #matchmaker #matchmadeinheaven #loveyourself #respectyourself,51000.0,Commitment Connection,"Wise Quotes,Quotable Quotes,Words Quotes,Wise Words,Quotes To Live By,Great Quotes,Motivational Quotes,Inspirational Quotes,Funny Quotes",image,https://i.pinimg.com/originals/c6/64/ee/c664ee71524fb5a6e7b7b49233f93b43.png,/data/quotes,quotes
3201,6370f096-f344-49c3-a6b1-3072c537a953,DIY Pom Pom Letters for MOM!,Marla Meridith - DIY Pom Pom Letters for MOM!,671000.0,Marla Meridith,"Kids Crafts,Crafts To Make,Craft Projects,Arts And Crafts,Craft Ideas,Pom Pom Crafts,Yarn Crafts,Flower Crafts,Diy Flower",image,https://i.pinimg.com/originals/db/c6/98/dbc69818c531e28ee514aaaec377257d.jpg,/data/diy-and-crafts,diy-and-crafts
1699,e930ea57-d34a-499f-9811-126d39ed1fee,Easy to Make Mason Jar Christmas Scenes,ow to make easy and inexpensive Christmas decor with these cute mason jar Christmas scenes. Who doesn't love mason jar crafts for Christmas?,142000.0,Twelve On Main,"Christmas Decorations Diy Crafts,Christmas Crafts For Gifts,Diy Decoration,Diy Ornaments,Decor Ideas,Gift Ideas,Decorating Ideas,Diy Christmas Room Decor,Diy Christmas Projects",image,https://i.pinimg.com/originals/0c/31/a1/0c31a189ab7e503c035c8af991d5bd29.jpg,/data/christmas,christmas
6145,82e13a07-db99-43a3-b1c0-89a4b75821da,HOLIDAY MANTLE DECOR - @AMAZON & @TARGET FINDS,"Holiday mantle decor, Christmas decor, metallic mercury glass style Christmas trees, eucalyptus vine, evergreen pine branches, white neutral holiday decor, cozy mantle for the h…",83000.0,Stylin by Aylin,"Winter Home Decor,Christmas Living Room Decor,Living Room Decor Cozy,Christmas Decor,Cozy Fireplace,Rustic Fireplace Decor,Fireplace Decorations,Rustic Room,House Decorations",image,https://i.pinimg.com/originals/9d/82/1a/9d821a80acd8f90c16454e978bd9b115.jpg,/data/home-decor,home-decor
5742,44fc133d-0d79-4f00-a803-d6aa5fc8c31b,How To Find The Best Stocks,How to find Great Stocks #Finance #Investing #DaveRamsey #Debt #PassiveIncome #PersonalFinance #Trading #Stocks,19000.0,Financial Toolkit,"Stock Finance,Finance Tracker,Finance Tips,Trade Finance,Dave Ramsey,Homepage Layout,Analyse Technique,Value Stocks,Bollinger Bands",image,https://i.pinimg.com/originals/0d/d2/8e/0dd28e384ea880145f1445a22e10fac5.png,/data/finance,finance
7832,a549938a-786e-4a94-a473-403e8e3a15ad,You either win or you learn❤️ #inspirationalquote #quote #positive #hope,,3000.0,LISA MASLYK / VOICE OVER TALENT /ACTOR,"Motivacional Quotes,Doodle Quotes,Wisdom Quotes,Words Quotes,Year Quotes,Positive Quotes For Life,Good Life Quotes,Self Love Quotes,Self Healing Quotes",multi-video(story page format),https://i.pinimg.com/videos/thumbnails/originals/2b/20/ac/2b20ace41dfc29087005a3df9fe1dd6d.0000001.jpg,/data/quotes,quotes
7166,4a844b03-e161-47a1-904b-591eb5dc4fb1,The Killers - Mr. Brightside - Women's T-Shirt - Heather Dark Grey / S,"Women's T-shirt. Design inspired by the rock band The Killers' hit ""Mr. Brightside"". One of the greatest song from the album Hot Fuss released in 2004. Soft and light, 100% cott…",27.0,Mala Rock | Rock T-shirts,"Mr Brightside,Rock T Shirts,Greatest Songs,Timeless Classic,Rock Bands,Album,T Shirts For Women,Inspired,Hot",image,https://i.pinimg.com/originals/8c/42/39/8c42391d35fcad51a4a79f7cd81bf26d.jpg,/data/mens-fashion,mens-fashion
7234,c05f564d-2783-4bec-b205-d3f756276296,16 Amazing Casual Outfit Grids For Guys,Your spring summer wardrobe inspiration..,613000.0,Mens Fashion - LIFESTYLE BY PS,"Mode Outfits,Casual Outfits,Men Casual,Fashion Outfits,Fashion Clothes,Hijab Casual,Gentleman Mode,Gentleman Style,Mode Masculine",image,https://i.pinimg.com/originals/0e/cf/eb/0ecfeb441e7a3559aa41f11b94cdd6ef.jpg,/data/mens-fashion,mens-fashion
1599,aaf8f442-fed5-494f-a84b-64c5e8dc9fea,Bb Beauty,27 DIY Beauty Hacks,213.0,Karen Red,"Bb Beauty,Beauty Care,Beauty Makeup,Fashion Beauty,Hair Beauty,Natural Beauty,Beauty Skin,Diy Beauty Hacks,Beauty Hacks For Teens",image,https://i.pinimg.com/originals/2c/6b/2a/2c6b2a79ce32ad125806b9a42cd00555.jpg,/data/beauty,beauty


In [0]:
# Print the column names and types of the cleaned pin DataFrame
df_pin_cleaned.printSchema()

### Cleaning of geo_df

In [0]:
# pyspark functions
from pyspark.sql.functions import *


# Clean the parsed geolocation stream in a single chained expression:
#   1. drop duplicate rows
#   2. fold latitude and longitude into one "coordinates" array column
#   3. run the timestamp column through to_timestamp (geo_schema already
#      declares it as TimestampType)
#   4. put the columns in their final order
df_geo_cleaned = (
    geo_parsed_df
    .dropDuplicates()
    .withColumn("coordinates", array("latitude", "longitude"))
    .drop("latitude", "longitude")
    .withColumn("timestamp", to_timestamp("timestamp"))
    .select("ind", "country", "coordinates", "timestamp")
)

# Show the cleaned geolocation DataFrame
display(df_geo_cleaned)

ind,country,coordinates,timestamp
9936,Australia,"List(-82.1715, -147.691)",2018-06-30T22:07:13.000+0000
159,Andorra,"List(-88.0812, -166.603)",2017-11-20T21:14:56.000+0000
2060,Trinidad and Tobago,"List(52.4584, 68.6527)",2020-01-25T13:54:17.000+0000
7510,Aruba,"List(-79.9828, -172.235)",2021-12-21T01:37:25.000+0000
2074,Central African Republic,"List(-52.3213, -50.11)",2019-11-03T05:41:59.000+0000
4076,Mauritania,"List(-67.2157, 27.8139)",2019-06-07T20:13:50.000+0000
9979,Dominican Republic,"List(14.9967, -120.682)",2018-07-18T19:01:46.000+0000
4137,Australia,"List(-55.3079, 108.918)",2019-06-23T22:16:34.000+0000
1014,Tonga,"List(81.1477, 69.3538)",2022-02-03T22:05:34.000+0000
3729,Turkey,"List(-24.4793, -150.145)",2018-01-17T03:43:38.000+0000


### Cleaning of user_df

In [0]:
# pyspark functions
from pyspark.sql.functions import *


# Clean the parsed user stream in a single chained expression:
#   1. drop duplicate rows
#   2. join first_name and last_name into one "user_name" column
#   3. run the date_joined column through to_timestamp (user_schema already
#      declares it as TimestampType)
#   4. put the columns in their final order
df_user_cleaned = (
    user_parsed_df
    .dropDuplicates()
    .withColumn("user_name", concat("first_name", lit(" "), "last_name"))
    .drop("first_name", "last_name")
    .withColumn("date_joined", to_timestamp("date_joined"))
    .select("ind", "user_name", "age", "date_joined")
)

# Show the cleaned user DataFrame
display(df_user_cleaned)

ind,user_name,age,date_joined
5494,Anne Allen,27,2015-12-16T15:20:05.000+0000
1371,Amanda Brown,20,2015-11-12T03:23:49.000+0000
719,Alicia Avila,20,2016-01-26T01:49:23.000+0000
10552,Michael Hunter,40,2017-05-16T07:09:21.000+0000
1014,Gregory Ramsey,56,2016-10-15T17:21:39.000+0000
8978,Mark Jones,34,2017-08-22T00:01:29.000+0000
3800,Adam Armstrong,20,2015-12-17T08:43:40.000+0000
9979,Kaylee Miller,31,2016-11-09T19:50:51.000+0000
5468,Lisa Gamble,20,2016-07-23T20:51:06.000+0000
8304,Charles Berry,25,2015-12-28T04:21:39.000+0000


In [0]:
# saves df_pin_cleaned to delta tables
df_pin_cleaned.writeStream \
  .format("delta") \
  .outputMode("append") \
  .option("checkpointLocation", "/tmp/kinesis/_checkpoints/") \
  .table("0ecac53030fd_pin_table")


In [0]:
# saves df_geo_cleaned to delta tables
df_geo_cleaned.writeStream \
  .format("delta") \
  .outputMode("append") \
  .option("checkpointLocation", "/tmp/kinesis/_checkpoints/") \
  .table("0ecac53030fd_geo_table")

In [0]:
# saves df_user_cleaned to delta tables
df_user_cleaned.writeStream \
  .format("delta") \
  .outputMode("append") \
  .option("checkpointLocation", "/tmp/kinesis/_checkpoints/") \
  .table("0ecac53030fd_user_table")


In [0]:
# Deletes the checkpoint folder used by the streaming writes above; the second
# argument (True) requests a recursive delete.
# NOTE(review): presumably meant to be run only after the streaming queries
# have been stopped - confirm before running it against live streams.
dbutils.fs.rm("/tmp/kinesis/_checkpoints/", True)