# This notebook showcases how to use PySpark's built-in functions for data hashing and masking.

# Masks all characters in the specified column.
Use this method on a DataFrame column that contains PII when every character of each value must be replaced with a masking character.

```
 Args:
        dataframe: Input Spark DataFrame.
        column_name: Name of the column to mask.
        output_column: Output column name. Default: {column_name}_masked.
        mask_char: Character to use for masking (e.g., "*").        
    Returns:
        DataFrame with masked column.
```
**Typical Use Cases**

- **Structured Data:**
Can be applied to any column whose values need to be masked.

**How It Works**
- **Masks all characters**:
The method masks all characters for the specified column.

In [1]:
from pyspark.sql.functions import regexp_replace, col, lit

def complete_mask_column(
    dataframe, 
    column_name, 
    output_column=None, 
    mask_char="*"
):
    """Replace every character of a column's values with a masking string.

    Args:
        dataframe: Input Spark DataFrame.
        column_name: Name of the column to mask.
        output_column: Output column name. Default: {column_name}_masked.
        mask_char: String substituted for each character (e.g., "*").
            It is always inserted literally, even if it contains "$" or "\\".

    Returns:
        DataFrame with the masked values in ``output_column``.
    """
    if output_column is None:
        output_column = f"{column_name}_masked"

    # The replacement is interpreted with Java regex rules, where "\" escapes
    # the next character and "$" starts a group reference. Escape both so any
    # mask string is inserted verbatim instead of raising or being altered.
    literal_mask = mask_char.replace("\\", "\\\\").replace("$", "\\$")

    return dataframe.withColumn(
        output_column,
        # (?s) lets "." match line terminators too, so multi-line values are
        # masked completely rather than leaving their line breaks visible.
        regexp_replace(col(column_name), "(?s).", literal_mask)
    )

StatementMeta(, 835a4cbf-3966-43fb-90cd-d858a300e968, 3, Finished, Available, Finished)

# Usage Example

In [2]:
# Replace this path with your data file location.
df = spark.read.format("csv").option("header","true").load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
df = complete_mask_column(df, "FirstName", mask_char="#")
# DataFrame.show() prints the table itself and returns None, so its result
# must not be handed to display().
df.select("FirstName", "FirstName_masked").show()

StatementMeta(, 835a4cbf-3966-43fb-90cd-d858a300e968, 4, Finished, Available, Finished)

+---------+----------------+
|FirstName|FirstName_masked|
+---------+----------------+
|    David|           #####|
|   Sharon|          ######|
|    Barry|           #####|
|  Kenneth|         #######|
|   Justin|          ######|
|    Janet|           #####|
|    Brett|           #####|
|   Denise|          ######|
|   Joshua|          ######|
|  Timothy|         #######|
| Samantha|        ########|
|     Tony|            ####|
|   Pamela|          ######|
|  Kenneth|         #######|
|    Julie|           #####|
|    Shawn|           #####|
|  Michael|         #######|
|   Austin|          ######|
|    Ethan|           #####|
|    Jason|           #####|
+---------+----------------+



# Masks partial characters (from start or end) in the specified column.
Use this method on a DataFrame column that contains PII when only part of each value must be masked: a given number of characters, counted from the start or from the end, is replaced with a masking character.

```
 Args:
        dataframe: Input Spark DataFrame.
        column_name: Name of the column to mask.
        output_column: Output column name. Default: {column_name}_masked.
        mask_char: Character to use for masking.
        mask_from_first: If True, mask from the start. If False, mask from the end.
        num_chars: Number of characters to mask.
        
    Returns:
        DataFrame with masked column.
```
**Typical Use Cases**

- **Structured Data:**
Can be applied to any column whose values need to be partially masked.

**How It Works**
- **Masks partial characters from start or end**:
The method masks partial characters for the specified column from start or end.

In [7]:
from pyspark.sql.functions import pandas_udf, col, lit
from pyspark.sql.types import StringType
import pandas as pd

def partial_mask_column(
    dataframe, 
    column_name, 
    output_column=None, 
    mask_char="*", 
    mask_from_first=True, 
    num_chars=0
):
    """Add a column holding a partially masked copy of ``column_name``.

    The masking itself is delegated to ``partial_mask_pandas_udf``; the
    scalar options are wrapped in ``lit`` so they reach the UDF as columns.

    Args:
        dataframe: Input Spark DataFrame.
        column_name: Name of the column to mask.
        output_column: Output column name. Default: {column_name}_masked.
        mask_char: Character to use for masking.
        mask_from_first: If True, mask from the start. If False, mask from the end.
        num_chars: Number of characters to mask.

    Returns:
        DataFrame with the masked column added.
    """
    target_name = output_column if output_column is not None else f"{column_name}_masked"
    masked_values = partial_mask_pandas_udf(
        col(column_name),
        lit(mask_char),
        lit(mask_from_first),
        lit(num_chars),
    )
    return dataframe.withColumn(target_name, masked_values)

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 8, Finished, Available, Finished)

# PySpark Pandas UDF (partial_mask_pandas_udf)

This code defines a **PySpark Pandas UDF (partial_mask_pandas_udf)** that masks part of each value in the specified column, replacing characters at the start or at the end with the masking character.

In [8]:
@pandas_udf(StringType())
def partial_mask_pandas_udf(value: pd.Series, mask_char: pd.Series, mask_from_first: pd.Series, num_chars: pd.Series) -> pd.Series:
    """Mask the first or last ``num_chars`` characters of each value.

    All four arguments arrive as equally long pandas Series and are processed
    row by row.

    Args:
        value: Values to mask; nulls are returned as None.
        mask_char: Masking string repeated once per masked character.
        mask_from_first: True to mask the leading characters, False to mask
            the trailing ones.
        num_chars: How many characters to mask. Values larger than the string
            length mask the whole string; zero or negative values leave the
            string unchanged.

    Returns:
        Series of masked strings.
    """
    def mask_single(v, m, mf, n):
        if pd.isnull(v):
            return None
        v = str(v)
        length = len(v)
        # Clamp to [0, length]: a negative count must not turn into a
        # from-the-end slice that silently truncates the value.
        num_to_mask = max(0, min(n, length))
        if mf:
            return m * num_to_mask + v[num_to_mask:]
        # Slice with an explicit end index; v[:-0] would be v[:0] and drop
        # the entire value when nothing is to be masked.
        return v[:length - num_to_mask] + m * num_to_mask
    return pd.Series([mask_single(v, m, mf, n) for v, m, mf, n in zip(value, mask_char, mask_from_first, num_chars)])

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 9, Finished, Available, Finished)

# Usage Example

In [9]:
# Replace this path with your data file location.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
)
# Mask the first four characters of DateOfBirth with "X".
df = partial_mask_column(
    df,
    "DateOfBirth",
    mask_char="X",
    mask_from_first=True,
    num_chars=4,
)
display(df.select("DateOfBirth", "DateOfBirth_masked"))

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a017a39c-81ce-45fc-b47a-9143f7ab8a12)

# SHA-256 hash of the specified column
This function uses PySpark's built-in sha2 function to compute the SHA-256 hash (hexadecimal string) for each value in the given column. The resulting hash is stored in a new column named '<column_name>_sha256'.
```
    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        The input DataFrame containing the column to hash.
    column_name : str
        The name of the column to hash using SHA-256.
    output_column: str
        Output column name. Default: {column_name}_sha256.
    
    Returns
    -------
    pyspark.sql.DataFrame
        A new DataFrame with an additional column '<column_name>_sha256' containing the SHA-256 hashes.
```

In [11]:
from pyspark.sql.functions import sha2, concat, lit

def hash_column_withoutsalt(dataframe, column_name, output_column=None):
    """Add a column with the SHA-256 hash of ``column_name``.

    Args:
        dataframe: Input Spark DataFrame containing the column to hash.
        column_name: Name of the column to hash using SHA-256.
        output_column: Output column name. Default: {column_name}_sha256.

    Returns:
        DataFrame with the hash column added.
    """
    target_name = f"{column_name}_sha256" if output_column is None else output_column
    digest = sha2(column_name, 256)
    return dataframe.withColumn(target_name, digest)

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 12, Finished, Available, Finished)

# Usage

In [13]:
# Replace this path with your data file location.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
)
# Hash CustomerID into an explicitly named output column.
df = hash_column_withoutsalt(df, "CustomerID", output_column="CustomerID_sha256")
display(df.select("CustomerID", "CustomerID_sha256"))

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c6c98305-d53d-42b5-a8c7-be92d571ed45)

# SHA-256 hash with Salt key for the specified column
This function uses PySpark's built-in sha2 function to compute the SHA-256 hash (hexadecimal string) of each value in the given column after appending the salt key to it. The resulting hash is stored in a new column named '<column_name>_sha256_salt' by default.
```
    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        The input DataFrame containing the column to hash.
    column_name : str
        The name of the column to hash using SHA-256.
    output_column: str
        Output column name. Default: {column_name}_sha256_salt.
    salt_key: str
        The salt to append to each value before hashing.
    Returns
    -------
    pyspark.sql.DataFrame
        A new DataFrame with an additional column (by default '<column_name>_sha256_salt') containing the salted SHA-256 hashes.
```

In [14]:
def hash_column_withsalt(dataframe, column_name, salt_key,output_column=None):
    """
    Hashes the specified column using SHA-256, with the salt appended.
    """
    if output_column is None:
        output_column = f"{column_name}_sha256"
    return dataframe.withColumn(
        f"{column_name}_sha256_salt",
        sha2(concat(dataframe[column_name], lit(salt_key)), 256)
    )

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 15, Finished, Available, Finished)

In [17]:
# Replace this path with your data file location.
df = spark.read.format("csv").option("header","true").load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
df = hash_column_withsalt(df, "CustomerID","mySalt", "CustomerID_salt_sha256")
display(df.select("CustomerID", "CustomerID_sha256_salt"))

StatementMeta(, c65adddb-3880-4c6b-9bc9-1d161df36028, 18, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5eebe92c-b8ee-449e-af22-e62b9bb213c8)