In [68]:
import os

# Sanity check: both variables should be set in your environment, either through your
# shell config file or the venv activation script. A variable that is missing prints as None.
#   JAVA_HOME  -> the JDK home directory, e.g. .../openjdk.jdk/Contents/Home
#   SPARK_HOME -> the pyspark package directory inside the venv's site-packages
for env_var in ("JAVA_HOME", "SPARK_HOME"):
    print(os.environ.get(env_var))

/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home
/Users/lydialim/pyspark-eda-demo/.venv/lib/python3.10/site-packages/pyspark


In [69]:
from pyspark.sql import SparkSession

# Obtain the Spark session through the builder (getOrCreate returns an
# existing session when there is one, otherwise it creates a new one).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [55]:
# import libs
from pyspark.sql.functions import (
    concat,
    col,
    length,
    lit,
    when,
    from_unixtime,
    from_utc_timestamp,
    to_timestamp,
    date_format,
)



In [70]:
# Session-level configuration: use Singapore time (UTC+8) as the Spark SQL
# session time zone.
spark.conf.set(
    key="spark.sql.session.timeZone",
    value="Asia/Singapore",
)


In [71]:
# Load the sample credit-card transactions fixture, then print its schema
# and preview the first five rows.
df = spark.read.format("parquet").load("data_fixtures/cc_sample_transaction.parquet")
df.printSchema()
df.show(n=5)

root
 |-- trans_date_trans_time: string (nullable = true)
 |-- cc_num: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- person_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- merch_lat: string (nullable = true)
 |-- merch_long: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- merch_zipcode: string (nullable = true)
 |-- merch_last_update_time: string (nullable = true)
 |-- merch_eff_time: string (nullable = true)
 |-- cc_bic: string (nullable = true)

+---------------------+-----------

In [72]:
def to_utc8_hm(c):
    """
    Normalise a mixed-format time column to a UTC+8 display string.

    Accepts Column c whose values may be:
      • purely numeric, ≥15 digits → µs since epoch
      • purely numeric, 12–14 digits → ms since epoch
      • anything else → literal 'yyyy-MM-dd HH:mm:ss', read as UTC wall time
    Returns a Column of strings
      'yyyy-MM-dd HH:mm:ss.SSSSSS +08:00'

    Notes:
      • Epoch inputs keep their sub-second part (ms / µs) in the fraction.
      • Epoch inputs are turned into real instants and rendered by
        date_format in spark.sql.session.timeZone, so the session time zone
        must be a UTC+8 zone (this notebook sets "Asia/Singapore") for the
        "+08:00" suffix to be truthful.
      • Values matching none of the formats (including digit runs shorter
        than 12 characters) go through the literal parse; they are expected
        to come out as null — TODO confirm under the active
        timeParserPolicy / ANSI settings.
    """
    c_str = c.cast("string")
    is_digits = c_str.rlike("^[0-9]+$")
    n_digits = length(c_str)

    # Exact integer view of the epoch value. Decimal keeps the division below
    # exact, whereas a double cannot represent every microsecond at present-day
    # epoch magnitudes.
    # NOTE(review): digit runs longer than 20 characters overflow this cast
    # (null with ANSI mode off) — confirm that is acceptable.
    epoch = c_str.cast("decimal(20,0)")

    # 1) build a timestamp per input flavour
    #
    # Casting a numeric value to timestamp reads it as (fractional) seconds
    # since the epoch. The previous from_unixtime() call instead returned a
    # *string* already formatted in the session time zone and truncated to
    # whole seconds; shifting that again with from_utc_timestamp applied the
    # UTC+8 offset twice and always produced a ".000000" fraction.
    ts = (
        when(is_digits & (n_digits >= 15),
             (epoch / 1_000_000).cast("timestamp"))          # µs → seconds
        .when(is_digits & n_digits.between(12, 14),
             (epoch / 1_000).cast("timestamp"))              # ms → seconds
        .otherwise(
             # 2) only the literal needs the explicit shift: it is parsed as
             #    wall-clock text, treated as UTC, and moved to UTC+8
             from_utc_timestamp(
                 to_timestamp(c_str, "yyyy-MM-dd HH:mm:ss"),
                 "GMT+8",
             )
        )
    )

    # 3) format with 6-digit fraction and append "+08:00"
    return concat(
        date_format(ts, "yyyy-MM-dd HH:mm:ss.SSSSSS"),
        lit(" +08:00")
    )




In [73]:

timestamp_cols = ["merch_eff_time", "merch_last_update_time", "trans_date_trans_time"]

# Replace each timestamp column with its to_utc8_hm() rendering, keeping the
# column names and their positions; every other column passes through as is.
# NOTE: this rebinds `df`, so the cell is not idempotent — running it a second
# time feeds the already formatted strings back into to_utc8_hm().
df = df.select(
    *[
        to_utc8_hm(col(name)).alias(name) if name in timestamp_cols else col(name)
        for name in df.columns
    ]
)

df.show(truncate=False)

+---------------------------------+-------------------+----------------------------------------+-------------+------+-------------------------+------+------------------------------+------------------------+-----+-----+-------+------------------+--------+---------------------------------------------+----------+--------------------------------+------------------+------------------+--------+-------------+---------------------------------+---------------------------------+-----------+
|trans_date_trans_time            |cc_num             |merchant                                |category     |amt   |person_name              |gender|street                        |city                    |state|zip  |lat    |long              |city_pop|job                                          |dob       |trans_num                       |merch_lat         |merch_long        |is_fraud|merch_zipcode|merch_last_update_time           |merch_eff_time                   |cc_bic     |
+---------------------------

In [None]:
# Exploratory check of whether the BIC column is usable: collect its distinct
# values to the driver and print them one per line.
unique_values = (
    df.select(col("cc_bic"))
    .distinct()
    .collect()
)
for row in unique_values:
    print(row["cc_bic"])

ADMDUS41
ACEEUS31
DEUTUS33TRF
NA
CITIUS33CHI
AIABUS31

Null
APBCUS61


+-----------+
|     cc_bic|
+-----------+
|   ADMDUS41|
|   ACEEUS31|
|DEUTUS33TRF|
|         NA|
|CITIUS33CHI|
|   AIABUS31|
|           |
|       Null|
|   APBCUS61|
+-----------+

