In [0]:
import pyspark
from pyspark.sql import functions as sf
from pyspark.sql import types as st

In [0]:
# Candidate datetime patterns for timestamp strings found in source data.
# Order matters: parse_timestamp_expr coalesces the attempts in list order,
# so the first pattern that parses a value wins.
possible_source_timestamp_formats = [
    "yyyy-MM-dd HH:mm:ss",        # e.g. 2024-06-01 14:30:00
    "yyyy-MM-dd'T'HH:mm:ss",      # e.g. 2024-06-01T14:30:00
    "yyyy-MM-dd'T'HH:mm:ss.SSS",  # e.g. 2024-06-01T14:30:00.123
    "MM-dd-yyyy HH:mm:ss",        # month first, dash separated
    "MM/dd/yyyy HH:mm:ss",        # month first, slash separated
    "dd-MM-yyyy HH:mm:ss",        # day first; only reached if month-first failed
    "M/d/yyyy h:mm:ss a",         # e.g. 6/1/2024 2:30:00 PM
    "MMMM d, yyyy h:mm:ss a",     # e.g. June 1, 2024 2:30:00 PM
]

def parse_timestamp_expr(col_name: str) -> pyspark.sql.Column:
    """
    Build a column expression that parses a timestamp string column.

    The column is trimmed, then one ``try_to_timestamp`` attempt is created for
    every entry of ``possible_source_timestamp_formats``. The attempts are
    coalesced in list order, so the result is the first non-null parse, or null
    when every attempt is null.

    Args:
        col_name (str): The name of the column containing timestamp strings.

    Returns:
        pyspark.sql.Column: The coalesced parse expression.
    """
    # Trim once and reuse the same expression for every format attempt.
    trimmed_value = sf.trim(sf.col(col_name))

    attempts = [
        sf.try_to_timestamp(trimmed_value, sf.lit(fmt))
        for fmt in possible_source_timestamp_formats
    ]
    return sf.coalesce(*attempts)

## Test
def test_parse_timestamp_expr():
    """
    Manual check for parse_timestamp_expr.

    Builds a one-column DataFrame of sample strings (several timestamp layouts,
    a non-timestamp string, a null and a whitespace-only string), adds the
    parsed column and displays the result for visual inspection.
    """
    sample_values = [
        "2024-06-01 14:30:00",
        "2024-06-01T14:30:00",
        "06-01-2024 02:30:00",
        "06/01/2024 02:30:00",
        "June 1, 2024 2:30:00 PM",
        "InvalidTS",
        None,
        "   ",
    ]
    # createDataFrame expects one tuple per row.
    rows = [(value,) for value in sample_values]

    source_df = spark.createDataFrame(rows, ["SalesTimestamp"])
    parsed_df = source_df.withColumn(
        "ParsedTimestamp", parse_timestamp_expr("SalesTimestamp")
    )
    display(parsed_df)

In [0]:
# Candidate date patterns for date strings found in source data.
# Order matters: parse_date_expr coalesces the attempts in list order, so an
# ambiguous value such as 01-06-2024 is tried as month-first before day-first.
possible_source_date_formats = [
    "yyyy-MM-dd",    # e.g. 2024-06-01
    "MM-dd-yyyy",    # month first, dash separated
    "MM/dd/yyyy",    # month first, slash separated
    "dd-MM-yyyy",    # day first; only reached if month-first failed
    "M/d/yyyy",      # e.g. 6/1/2024
    "MMMM d, yyyy",  # e.g. June 1, 2024
]
def parse_date_expr(col_name: str) -> pyspark.sql.Column:
    """
    Build a column expression that parses a date string column.

    The column is trimmed, then one ``try_to_date`` attempt is created for every
    entry of ``possible_source_date_formats``. The attempts are coalesced in
    list order, so the result is the first non-null parse, or null for rows
    where every attempt is null.

    Note that the result is a parsed date value, not a re-formatted string.

    Args:
        col_name (str): The name of the column containing date strings.

    Returns:
        pyspark.sql.Column: The coalesced parse expression.
    """
    # Trim once and reuse the same expression for every format attempt.
    trimmed_value = sf.trim(sf.col(col_name))

    attempts = [
        sf.try_to_date(trimmed_value, fmt)
        for fmt in possible_source_date_formats
    ]
    return sf.coalesce(*attempts)

## Test
def test_parse_date_expr():
    """
    Manual check for parse_date_expr.

    Builds a one-column DataFrame of sample strings (several date layouts, a
    padded value, a non-date string, a null and an empty string), adds the
    parsed column and displays the result for visual inspection.
    """
    sample_values = [
        "2024-06-01",
        "12-21-2025",
        "06-01-2024",
        "01-06-2024",
        "06/01/2024",
        "invalid",
        " 2024-06-01 ",
        None,
        "",
    ]
    # createDataFrame expects one tuple per row.
    rows = [(value,) for value in sample_values]

    source_df = spark.createDataFrame(rows, ["SalesDate"])
    parsed_df = source_df.withColumn("ParsedDate", parse_date_expr("SalesDate"))
    display(parsed_df)

In [0]:
def cast_int_expr(col_name: str) -> pyspark.sql.Column:
    """
    Build a column expression that cleans a string column and casts it to IntegerType.

    Cleaning happens in two regex passes:
      1. every comma and whitespace character is removed;
      2. a trailing "." followed by zero or more digits is removed, so a value
         such as "100.0" is reduced to "100" before the cast (the fractional
         part is dropped, not rounded).

    Null inputs skip the cleaned expression and are passed through as-is. The
    final conversion uses ``try_cast`` (expected to yield null rather than raise
    for values that still cannot be converted).

    Args:
        col_name (str): The name of the column containing integer-like strings.

    Returns:
        pyspark.sql.Column: The cleaned column cast to IntegerType.
    """
    source = sf.col(col_name)

    # Pass 1: strip thousands separators and all whitespace.
    without_separators = sf.regexp_replace(source, r"[,\s]", "")
    # Pass 2: strip a trailing decimal suffix such as ".0" or ".".
    integral_text = sf.regexp_replace(without_separators, r"\.\d*$", "")

    guarded = sf.when(source.isNotNull(), integral_text).otherwise(source)
    return guarded.try_cast(st.IntegerType())

## Test
def test_cast_int_expr():
    """
    Manual check for cast_int_expr.

    Builds a one-column DataFrame of sample strings (comma separated, with
    embedded spaces, with a decimal suffix, empty and null), adds the cast
    column and displays the result for visual inspection.
    """
    sample_values = ["1,234", " 56 78 ", "100.0", "", None]
    # createDataFrame expects one tuple per row.
    rows = [(value,) for value in sample_values]

    source_df = spark.createDataFrame(rows, ["raw"])
    result_df = source_df.withColumn("clean_int", cast_int_expr("raw"))
    display(result_df)
# test_cast_int_expr()

In [0]:
def cast_double_expr(col_name: str) -> pyspark.sql.Column:
    """
    Build a column expression that cleans a string column and casts it to DoubleType.

    Every comma and whitespace character is removed from the value before the
    cast. Null inputs skip the cleaned expression and are passed through as-is.
    The final conversion uses ``try_cast`` (expected to yield null rather than
    raise for values that still cannot be converted).

    Args:
        col_name (str): The name of the column containing double-like strings.

    Returns:
        pyspark.sql.Column: The cleaned column cast to DoubleType.
    """
    source = sf.col(col_name)

    # Strip thousands separators and all whitespace, including inner spaces.
    without_separators = sf.regexp_replace(source, r"[,\s]", "")

    guarded = sf.when(source.isNotNull(), without_separators).otherwise(source)
    return guarded.try_cast(st.DoubleType())

## Test
def test_cast_double_expr():
    """
    Manual check for cast_double_expr.

    Builds a one-column DataFrame of sample strings (comma separated, with
    embedded spaces around the decimal point, a plain decimal, whitespace-only
    and null), adds the cast column and displays the result for visual
    inspection.
    """
    sample_values = ["1,234.50", " 56 78 . 24 ", "100.20", " ", None]
    # createDataFrame expects one tuple per row.
    rows = [(value,) for value in sample_values]

    source_df = spark.createDataFrame(rows, ["raw"])
    result_df = source_df.withColumn("clean_double", cast_double_expr("raw"))
    display(result_df)