In [13]:
"""
Author: Matt Martin
Date: 2/8/24
Desc: Demo using Spark to generate dummy data and transform it with both the DataFrame API and Spark SQL
"""

## Build (or reuse, if one is already running) the SparkSession used by every later cell
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [22]:
## generate some dummy data and write to parquet
#%pip install Faker

from faker import Faker

# Seed Faker's shared random generator so that Restart & Run All regenerates
# the same rows; unseeded, every run overwrites ./dummy_data/ with new data.
Faker.seed(42)
fake = Faker()

# One dict per fake person; the dict keys are the column names the later
# cells select on (first_name, last_name, street_adrs, zip_cd, state_abrv).
# NOTE(review): range(0, 1001) yields 1001 rows, kept as-is -- confirm whether
# 1000 was the intended count.
data = [
    {
        'first_name': fake.first_name(),
        'last_name': fake.last_name(),
        'street_adrs': fake.street_address(),
        'zip_cd': fake.zipcode(),
        'state_abrv': fake.state_abbr(),
    }
    for _ in range(0, 1001)
]

df = spark.createDataFrame(data)

# 'overwrite' replaces whatever a previous run left in the target directory
df.write.mode('overwrite').parquet('./dummy_data/')

In [2]:
# Task: combine first and last name into one field, and combine the address
# parts into one field separated by dashes.

# Read the parquet dataset back from ./dummy_data and expose it to Spark SQL
# under the temp-view name `people`.
df1 = spark.read.format('parquet').load('./dummy_data')
df1.createOrReplaceTempView('people')


In [4]:
from pyspark.sql import functions as F


In [11]:
# Transform with the DataFrame API: concat() takes column names directly,
# so only the literal separators need to be wrapped in F.lit().
name_expr = F.concat("first_name", F.lit(' '), "last_name")
adrs_expr = F.concat("street_adrs", F.lit('-'), "zip_cd", F.lit('-'), "state_abrv")

dft = df1.select(
    name_expr.alias("full_name"),
    adrs_expr.alias("full_adrs"),
)

In [None]:
## Same transformation expressed in Spark SQL against the `people` temp view.
## This cell only previews the result; nothing is written to disk here.
sql = """
    select concat(first_name, ' ',last_name) as full_name
        ,concat(street_adrs,'-',zip_cd,'-',state_abrv) as full_adrs
    from people
"""

dft = spark.sql(sql)

# Show the first 5 rows without truncating long string values
dft.show(n=5, truncate=False)

In [6]:
# Persist the transformed frame as parquet, replacing any earlier output
dft.write.parquet('./people_tsfm/', mode='overwrite')

In [23]:
# Stop the SparkSession that was created at the top of the notebook
spark.stop()