# In [0]:
import os
import gzip
import shutil
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType
def decompress_gzip_file(source_path, output_folder):
    """
    Decompresses a gzipped file to a specified output folder

    The output file keeps the source file's base name with the trailing
    .gz/.gzip extension removed. The extension is matched case-insensitively,
    consistent with the validation check, so "DATA.CSV.GZ" becomes "DATA.CSV".
    A base name that consists only of the extension gets "_decompressed"
    appended instead, so the output name is never empty.

    If decompression fails after the output file has been opened, the
    partially written output file is removed.

    Args:
        source_path (str): Path to the gzipped file in Unity Catalog volume
        output_folder (str): Destination folder for decompressed file
            (created if it does not exist)
    Returns:
        tuple: (success_status, output_path_or_error_message)
            Failures are reported through the tuple; exceptions are caught
            and converted to an error message.
    """
    try:
        # Validate input parameters
        if not source_path or not output_folder:
            return False, "Error: Source path or output folder is empty"
        # Check if source file exists
        if not os.path.exists(source_path):
            return False, f"Error: Source file does not exist: {source_path}"
        # Check if file has .gz extension (case-insensitive)
        if not source_path.lower().endswith(('.gz', '.gzip')):
            return False, f"Error: File is not a gzip file (missing .gz/.gzip extension): {source_path}"
        # Create output directory if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)
        # Generate output filename: strip the extension using the same
        # case-insensitive match as the validation above.
        base_filename = os.path.basename(source_path)
        lowered_name = base_filename.lower()
        output_filename = base_filename + "_decompressed"
        for suffix in ('.gzip', '.gz'):
            # Require at least one character before the suffix so the
            # output name cannot end up empty.
            if lowered_name.endswith(suffix) and len(base_filename) > len(suffix):
                output_filename = base_filename[:-len(suffix)]
                break
        output_path = os.path.join(output_folder, output_filename)
        # Decompress the file using chunked reading for large files
        output_opened = False  # only clean up a file this call actually opened
        error_message = None
        try:
            with gzip.open(source_path, 'rb') as gz_file:
                with open(output_path, 'wb') as output_file:
                    output_opened = True
                    # Use shutil.copyfileobj for efficient chunked copying
                    shutil.copyfileobj(gz_file, output_file, length=1024*1024)  # 1MB chunks
        except gzip.BadGzipFile:
            error_message = f"Error: File is not a valid gzip file: {source_path}"
        except OSError as e:
            error_message = f"Error: Failed to read gzip file - {str(e)}"
        except Exception as e:
            error_message = f"Error during decompression: {str(e)}"
        if error_message is not None:
            if output_opened:
                # Do not leave a truncated file that looks like valid output
                _remove_partial_output(output_path)
            return False, error_message
        # Verify the decompressed file was created and has content
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return True, output_path
        return False, "Error: Decompressed file is empty or was not created"
    except PermissionError:
        return False, f"Error: Permission denied accessing {source_path} or {output_folder}"
    except Exception as e:
        return False, f"Error: Unexpected error - {str(e)}"


def _remove_partial_output(path):
    """Best-effort removal of a partially written output file."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup is optional; the caller already reports the original failure
        pass
# Define the return schema for the UDF
decompress_result_schema = StructType(
    [
        # Whether the decompression succeeded
        StructField(name="success", dataType=BooleanType(), nullable=False),
        # Output path on success, error text otherwise
        StructField(name="message", dataType=StringType(), nullable=False),
    ]
)
# Create the regular Python UDF (invoked once per row); Arrow is not explicitly enabled on this decorator
@udf(returnType=decompress_result_schema)
def decompress_gzip_udf(source_path, output_folder):
    """
    Spark UDF that delegates to decompress_gzip_file for a single row
    Args:
        source_path (str): Path to the gzipped file
        output_folder (str): Destination folder for decompressed file
    Returns:
        struct: (success, message) pair, where message is the output path
            on success or the error text on failure
    """
    ok, detail = decompress_gzip_file(source_path, output_folder)
    return ok, detail
# Alternative version using Arrow-optimized pandas UDF for better performance
from pyspark.sql.functions import pandas_udf
import pandas as pd
@pandas_udf(returnType=decompress_result_schema)
def decompress_gzip_arrow_udf(source_paths: pd.Series, output_folders: pd.Series) -> pd.DataFrame:
    """
    Arrow-optimized pandas UDF for batch processing of gzip decompression

    Calls decompress_gzip_file once per (source path, output folder) pair in
    the batch and collects the outcomes in input order.

    Args:
        source_paths: Pandas Series of source file paths
        output_folders: Pandas Series of output folder paths
    Returns:
        DataFrame: One row per input with 'success' and 'message' columns.
            The columns are always present, even for an empty batch.
    """
    results = [
        decompress_gzip_file(source_path, output_folder)
        for source_path, output_folder in zip(source_paths, output_folders)
    ]
    # Name the columns explicitly: an empty batch would otherwise produce a
    # DataFrame with no columns, which cannot match the declared struct schema.
    return pd.DataFrame(results, columns=['success', 'message'])
# Helper that applies the regular (non-pandas) decompression UDF to a DataFrame
def apply_gzip_decompression(df, source_column, output_folder_path):
    """
    Apply the decompression UDF to a DataFrame
    Args:
        df: Spark DataFrame with file paths
        source_column: Name of column containing gzip file paths
        output_folder_path: Path where decompressed files should be stored
    Returns:
        DataFrame: Original DataFrame with added "decompress_success" and
            "decompress_message" columns
    """
    # `lit` is not imported at module level, so import it here
    # (same approach as apply_gzip_decompression_arrow).
    from pyspark.sql.functions import lit
    # Using regular UDF
    result_df = df.withColumn(
        "decompress_result",
        decompress_gzip_udf(
            col(source_column),
            lit(output_folder_path)
        )
    )
    # Extract success and message from struct for easier access,
    # then drop the intermediate struct column
    final_df = result_df.select(
        "*",
        col("decompress_result.success").alias("decompress_success"),
        col("decompress_result.message").alias("decompress_message")
    ).drop("decompress_result")
    return final_df
# Alternative usage with Arrow-optimized UDF
def apply_gzip_decompression_arrow(df, source_column, output_folder_path):
    """
    Run the pandas decompression UDF over a DataFrame column
    Args:
        df: Spark DataFrame with file paths
        source_column: Name of column containing gzip file paths
        output_folder_path: Path where decompressed files should be stored
    Returns:
        DataFrame: Input columns plus "decompress_success" and
            "decompress_message"
    """
    from pyspark.sql.functions import lit

    # The output folder is the same for every row, so pass it as a literal
    udf_call = decompress_gzip_arrow_udf(col(source_column), lit(output_folder_path))
    # Flatten the struct returned by the UDF into two top-level columns
    flattened = [
        col("decompress_result.success").alias("decompress_success"),
        col("decompress_result.message").alias("decompress_message"),
    ]
    return (
        df.withColumn("decompress_result", udf_call)
        .select("*", *flattened)
        .drop("decompress_result")
    )
# Example usage:
"""
# Assuming you have a DataFrame with a column 'file_paths' containing gzipped file locations
df = spark.createDataFrame([
    ("/Volumes/catalog/schema/volume/data1.gz",),
    ("/Volumes/catalog/schema/volume/data2.gz",),
    ("/Volumes/catalog/schema/volume/invalid.txt",),
], ["file_paths"])
# Apply decompression
output_folder = "/Volumes/catalog/schema/volume/decompressed/"
result_df = apply_gzip_decompression_arrow(df, "file_paths", output_folder)
# Show results
result_df.select("file_paths", "decompress_success", "decompress_message").show(truncate=False)
"""